14
14
},
15
15
"cells" : [
16
16
{
17
+ "cell_type" : " markdown" ,
18
+ "metadata" : {
19
+ "id" : " OyyWh-4jHlcI" ,
20
+ "colab_type" : " text"
21
+ },
22
+ "source" : [
23
+ " ## Exercise 2: Missing Value Preprocessing with High Reproducibility"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type" : " code" ,
17
28
"metadata" : {
18
29
"id" : " E0PhwttvxbGz" ,
19
30
"colab_type" : " code" ,
20
31
"colab" : {}
21
32
},
22
- "cell_type" : " code" ,
23
33
"source" : [
24
34
" import numpy as np\n " ,
25
35
" import pandas as pd"
28
38
"outputs" : []
29
39
},
30
40
{
41
+ "cell_type" : " code" ,
31
42
"metadata" : {
32
43
"id" : " Cq_pPqioxdTv" ,
33
44
"colab_type" : " code" ,
34
45
"colab" : {}
35
46
},
36
- "cell_type" : " code" ,
37
47
"source" : [
38
48
" df = pd.read_csv('https://raw.githubusercontent.com/TrainingByPackt/Big-Data-Analysis-with-Python/master/Lesson07/Dataset/bank.csv', sep=';')"
39
49
],
40
50
"execution_count" : 0 ,
41
51
"outputs" : []
42
52
},
43
53
{
54
+ "cell_type" : " code" ,
44
55
"metadata" : {
45
56
"id" : " kO8KhEfmxpgm" ,
46
57
"colab_type" : " code" ,
58
+ "outputId" : " f5026961-ddf5-416f-9401-fbd377720950" ,
47
59
"colab" : {
48
60
"base_uri" : " https://localhost:8080/" ,
49
61
"height" : 204
50
- },
51
- "outputId" : " b373d2f4-35b8-48b0-c25d-257297f5483d"
62
+ }
52
63
},
53
- "cell_type" : " code" ,
54
64
"source" : [
55
65
" df.head(5)"
56
66
],
57
- "execution_count" : 6 ,
67
+ "execution_count" : 3 ,
58
68
"outputs" : [
59
69
{
60
70
"output_type" : " execute_result" ,
221
231
"metadata" : {
222
232
"tags" : []
223
233
},
224
- "execution_count" : 6
234
+ "execution_count" : 3
225
235
}
226
236
]
227
237
},
228
238
{
239
+ "cell_type" : " code" ,
229
240
"metadata" : {
230
241
"id" : " rQhGcWosxrkX" ,
231
242
"colab_type" : " code" ,
232
243
"colab" : {}
233
244
},
234
- "cell_type" : " code" ,
235
245
"source" : [
236
246
" import collections\n " ,
237
247
" import random\n " ,
244
254
"outputs" : []
245
255
},
246
256
{
257
+ "cell_type" : " code" ,
247
258
"metadata" : {
248
259
"id" : " Q2rSdiA80Gp_" ,
249
260
"colab_type" : " code" ,
250
261
"colab" : {}
251
262
},
252
- "cell_type" : " code" ,
253
263
"source" : [
254
264
" for row, col in ix:\n " ,
255
265
" if len(replaced[row]) < df.shape[1] - 1:\n " ,
263
273
"outputs" : []
264
274
},
265
275
{
276
+ "cell_type" : " code" ,
266
277
"metadata" : {
267
278
"id" : " xWHplf2_0IMe" ,
268
279
"colab_type" : " code" ,
280
+ "outputId" : " c8dc8b1b-eedf-4727-c0b2-c4d705719fe3" ,
269
281
"colab" : {
270
282
"base_uri" : " https://localhost:8080/" ,
271
283
"height" : 323
272
- },
273
- "outputId" : " 0f5f40f8-8195-40c2-8f4d-eba87d9db113"
284
+ }
274
285
},
275
- "cell_type" : " code" ,
276
286
"source" : [
277
287
" print(df.isna().sum())\n "
278
288
],
279
- "execution_count" : 12 ,
289
+ "execution_count" : 6 ,
280
290
"outputs" : [
281
291
{
282
292
"output_type" : " stream" ,
283
293
"text" : [
284
- " age 461 \n " ,
285
- " job 470 \n " ,
286
- " marital 462 \n " ,
287
- " education 462 \n " ,
288
- " default 486 \n " ,
289
- " balance 460 \n " ,
290
- " housing 475 \n " ,
291
- " loan 444 \n " ,
292
- " contact 443 \n " ,
293
- " day 438 \n " ,
294
- " month 414 \n " ,
295
- " duration 446 \n " ,
296
- " campaign 496 \n " ,
297
- " pdays 403 \n " ,
298
- " previous 429 \n " ,
299
- " poutcome 443 \n " ,
300
- " y 454 \n " ,
294
+ " age 459 \n " ,
295
+ " job 503 \n " ,
296
+ " marital 487 \n " ,
297
+ " education 477 \n " ,
298
+ " default 458 \n " ,
299
+ " balance 425 \n " ,
300
+ " housing 442 \n " ,
301
+ " loan 420 \n " ,
302
+ " contact 479 \n " ,
303
+ " day 429 \n " ,
304
+ " month 420 \n " ,
305
+ " duration 461 \n " ,
306
+ " campaign 478 \n " ,
307
+ " pdays 444 \n " ,
308
+ " previous 400 \n " ,
309
+ " poutcome 453 \n " ,
310
+ " y 451 \n " ,
301
311
" dtype: int64\n "
302
312
],
303
313
"name" : " stdout"
304
314
}
305
315
]
306
316
},
307
317
{
318
+ "cell_type" : " code" ,
308
319
"metadata" : {
309
320
"id" : " gy3hrHu40J__" ,
310
321
"colab_type" : " code" ,
322
+ "outputId" : " 0f295ef2-74ac-40c8-cf5f-f01bdbbd9ed0" ,
311
323
"colab" : {
312
324
"base_uri" : " https://localhost:8080/" ,
313
325
"height" : 2193
314
- },
315
- "outputId" : " 69f35783-00ff-4a34-aa34-8bd91301bcfd"
326
+ }
316
327
},
317
- "cell_type" : " code" ,
318
328
"source" : [
319
329
" num = df._get_numeric_data()\n " ,
320
330
" Q1 = num.quantile(0.25)\n " ,
323
333
" print(num < (Q1 - 1.5 * IQR))\n " ,
324
334
" print(num > (Q3 + 1.5 * IQR))\n "
325
335
],
326
- "execution_count" : 13 ,
336
+ "execution_count" : 7 ,
327
337
"outputs" : [
328
338
{
329
339
"output_type" : " stream" ,
395
405
" age balance day duration campaign pdays previous\n " ,
396
406
" 0 False False False False False False False\n " ,
397
407
" 1 False True False False False True True\n " ,
398
- " 2 False False False False False True False \n " ,
408
+ " 2 False False False False False True True \n " ,
399
409
" 3 False False False False False False False\n " ,
400
410
" 4 False False False False False False False\n " ,
401
411
" 5 False False False False False True True\n " ,
402
- " 6 False False False False False True True\n " ,
412
+ " 6 False False False False False False True\n " ,
403
413
" 7 False False False False False False False\n " ,
404
414
" 8 False False False False False False False\n " ,
405
415
" 9 False False False False False True True\n " ,
406
416
" 10 False True False False False False False\n " ,
407
417
" 11 False False False False False False False\n " ,
408
418
" 12 False False False False False False False\n " ,
409
419
" 13 False False False False False False False\n " ,
410
- " 14 False False False False False True True \n " ,
420
+ " 14 False False False False False True False \n " ,
411
421
" 15 False False False False False False False\n " ,
412
422
" 16 False True False False False False False\n " ,
413
423
" 17 False False False False False True True\n " ,
428
438
" 4492 False False False False False True True\n " ,
429
439
" 4493 False False False False False False False\n " ,
430
440
" 4494 False False False False False False False\n " ,
431
- " 4495 False False False False False False False\n " ,
441
+ " 4495 False False False True False False False\n " ,
432
442
" 4496 False False False False False False False\n " ,
433
443
" 4497 False False False False False False False\n " ,
434
444
" 4498 False False False True False False False\n " ,
452
462
" 4516 False False False False False False False\n " ,
453
463
" 4517 False False False False False False False\n " ,
454
464
" 4518 False False False False True False False\n " ,
455
- " 4519 False False False False False True True\n " ,
465
+ " 4519 False False False False False False True\n " ,
456
466
" 4520 False False False False False True True\n " ,
457
467
" \n " ,
458
468
" [4521 rows x 7 columns]\n "
462
472
]
463
473
}
464
474
]
465
- }
475
+ }
0 commit comments