4
4
import sys
5
5
np .set_printoptions (threshold = sys .maxsize )
6
6
7
- # load spambase dataset
8
- spambase = np .loadtxt ('naive_bayes/spambase.data' , delimiter = ',' )
9
7
10
- # shuffle and split
11
- np .random .shuffle (spambase )
12
- train = spambase [:2000 , :]
13
- test = spambase [2000 :, :]
14
-
15
-
16
- # quantize features such that values below median = 0, above median = 1
17
8
def quantize_features(arr, median):
    """Binarize every feature column of *arr* in place against a threshold.

    Every column except the last one is treated as a feature; the last
    column is left untouched. A feature value becomes 1 when it is greater
    than or equal to its column's threshold and 0 otherwise, so values
    exactly equal to the threshold map to 1.

    Args:
        arr: 2-D NumPy array, modified in place.
        median: Per-column thresholds; it needs at least one entry for each
            feature column (extra trailing entries are ignored).

    Returns:
        The same array object that was passed in, after quantization.

    Raises:
        IndexError: If *median* has fewer entries than feature columns.
    """
    n_features = max(arr.shape[1] - 1, 0)
    thresholds = np.asarray(median)[:n_features]
    if thresholds.shape[0] < n_features:
        raise IndexError(
            "median has {} entries but {} feature columns need a threshold".format(
                thresholds.shape[0], n_features))
    # A single broadcast comparison replaces the per-cell Python loop; the
    # boolean result is cast to 0/1 in the array's own dtype on assignment.
    arr[:, :n_features] = arr[:, :n_features] >= thresholds
    return arr
22
14
23
15
24
- medians = np .median (train , axis = 0 )
25
- X_train = quantize_features (train , medians ).astype (int )
26
- X_test = quantize_features (test , medians ).astype (int )
27
-
28
-
29
- # naive bayes training
30
- model = np .zeros ((57 , 2 , 2 ))
31
- l_zeros = X_train [X_train [:, - 1 ] == 0 ]
32
- l_ones = X_train [X_train [:, - 1 ] == 1 ]
33
- zeros_sum = np .sum (l_zeros , axis = 0 )
34
- ones_sum = np .sum (l_ones , axis = 0 )
35
-
36
- for feature in range (X_train .shape [1 ]- 1 ):
37
- model [feature ][0 ][1 ] = zeros_sum [feature ] / l_zeros .shape [0 ]
38
- model [feature ][0 ][0 ] = 1 - model [feature ][0 ][1 ]
39
- model [feature ][1 ][1 ] = ones_sum [feature ] / l_ones .shape [0 ]
40
- model [feature ][1 ][0 ] = 1 - model [feature ][1 ][1 ]
41
- '''
42
- print("Given not spam, prob feature {} exists: {}, not exists: {}".format(feature,
43
- model[feature][0][1],
44
- model[feature][0][0]))
45
- print("Given spam, prob feature {} exists: {}, not exists: {}".format(feature,
46
- model[feature][1][1],
47
- model[feature][1][0]))
48
- '''
49
-
50
- # test classifier
51
- not_spam_prior = l_zeros .shape [0 ] / X_train .shape [0 ]
52
- spam_prior = 1 - not_spam_prior
53
- print ("Prior not spam: {}, prior spam: {}" .format (not_spam_prior , spam_prior ))
54
- n_test_samples = X_test .shape [0 ]
55
- predictions = np .zeros (n_test_samples , dtype = int )
56
- score = 0
57
-
58
- for idx , sample in enumerate (X_test ):
59
- y_pred_not_spam = not_spam_prior
60
- y_pred_spam = spam_prior
61
- for feature_idx in range (sample .shape [0 ]- 1 ):
62
- y_pred_not_spam *= model [feature_idx ][0 ][sample [feature_idx ]]
63
- y_pred_spam *= model [feature_idx ][1 ][sample [feature_idx ]]
64
- prob_not_spam = y_pred_not_spam / (y_pred_not_spam + y_pred_spam )
65
- prob_spam = y_pred_spam / (y_pred_not_spam + y_pred_spam )
66
- predictions [idx ] = 0 if prob_not_spam >= 0.5 else 1
67
- if sample [- 1 ] == predictions [idx ]:
68
- score += 1
69
-
70
- print ("Classifier accuracy: {}" .format (score / n_test_samples ))
16
def preprocess_data(train_size, file):
    """Load a labelled dataset, shuffle it, split it and binarize the features.

    Args:
        train_size: Fraction of the rows, in (0, 1], placed in the training
            split; the remaining rows form the test split.
        file: Path (or any source ``np.loadtxt`` accepts) of a
            comma-separated numeric table whose last column is the label.

    Returns:
        A ``(train, test)`` tuple of integer arrays. Every column but the
        last has been quantized to 0/1 by ``quantize_features``.

    Raises:
        ValueError: If *train_size* is outside (0, 1] or rounds to an empty
            training split.
    """
    if not 0 < train_size <= 1:
        # A negative fraction would silently slice from the end of the data.
        raise ValueError(
            "train_size must be in (0, 1], got {}".format(train_size))

    # ndmin=2 keeps a single-row file two-dimensional, so shape[0] is always
    # the number of samples.
    data = np.loadtxt(file, delimiter=',', ndmin=2)

    # Shuffle rows in place with the global NumPy RNG, then split.
    np.random.shuffle(data)
    n_train = round(train_size * data.shape[0])
    if n_train == 0:
        raise ValueError(
            "train_size={} leaves no training rows out of {}".format(
                train_size, data.shape[0]))
    train = data[:n_train, :]
    test = data[n_train:, :]

    # Thresholds are taken from the training rows only and reused for the
    # test rows.
    medians = np.median(train, axis=0)
    train = quantize_features(train, medians).astype(int)
    test = quantize_features(test, medians).astype(int)
    return train, test
30
+
31
+
32
def train_model(train, log=False, alpha=0.0):
    """Estimate per-class Bernoulli feature probabilities for naive Bayes.

    Args:
        train: 2-D array of 0/1 feature columns followed by one label column
            (0 = not spam, 1 = spam).
        log: When true, print the estimated probabilities for every feature.
        alpha: Additive smoothing strength. The default of 0.0 gives the
            plain relative frequencies; a positive value keeps every
            probability strictly between 0 and 1.

    Returns:
        Array of shape ``(n_features, 2, 2)`` where ``model[f][c][v]`` is the
        estimated probability that feature ``f`` has value ``v`` given class
        ``c``.
    """
    features = train[:, :-1]
    labels = train[:, -1]
    n_features = features.shape[1]
    model = np.zeros((n_features, 2, 2))

    for label in (0, 1):
        rows = features[labels == label]
        denominator = rows.shape[0] + 2 * alpha
        if denominator > 0:
            prob_one = (rows.sum(axis=0) + alpha) / denominator
        else:
            # No samples of this class and no smoothing: fall back to an
            # uninformative 0.5 instead of producing NaN from 0/0.
            prob_one = np.full(n_features, 0.5)
        model[:, label, 1] = prob_one
        model[:, label, 0] = 1 - prob_one

    if log:
        for feature in range(n_features):
            print("Given not spam, prob feature {} exists: {}, not exists: {}".format(
                feature, model[feature][0][1], model[feature][0][0]))
            print("Given spam, prob feature {} exists: {}, not exists: {}".format(
                feature, model[feature][1][1], model[feature][1][0]))
    return model
53
+
54
+
55
def get_priors(data, log=False):
    """Compute the class priors from the label column of *data*.

    Args:
        data: 2-D array whose last column holds the label (0 = not spam).
        log: When true, print both priors.

    Returns:
        ``(not_spam_prior, spam_prior)``: the fraction of rows labelled 0 and
        its complement.

    Raises:
        ValueError: If *data* has no rows.
    """
    n_samples = data.shape[0]
    if n_samples == 0:
        raise ValueError("cannot compute class priors from an empty dataset")
    # Count matching labels directly instead of copying the matching rows.
    not_spam_prior = np.count_nonzero(data[:, -1] == 0) / n_samples
    spam_prior = 1 - not_spam_prior
    if log:
        print("Prior not spam: {}, prior spam: {}".format(not_spam_prior, spam_prior))
    return not_spam_prior, spam_prior
62
+
63
+
64
+ def test_model (model , priors , test ):
65
+ # naive bayes testing
66
+ not_spam_prior , spam_prior = priors [0 ], priors [1 ]
67
+ n_test_samples = test .shape [0 ]
68
+ predictions = np .zeros (n_test_samples , dtype = int )
69
+ score = 0
70
+
71
+ for idx , sample in enumerate (test ):
72
+ y_pred_not_spam = not_spam_prior
73
+ y_pred_spam = spam_prior
74
+ for feature_idx in range (sample .shape [0 ] - 1 ):
75
+ y_pred_not_spam *= model [feature_idx ][0 ][sample [feature_idx ]]
76
+ y_pred_spam *= model [feature_idx ][1 ][sample [feature_idx ]]
77
+ prob_not_spam = y_pred_not_spam / (y_pred_not_spam + y_pred_spam )
78
+ prob_spam = y_pred_spam / (y_pred_not_spam + y_pred_spam )
79
+ predictions [idx ] = 0 if prob_not_spam >= 0.5 else 1
80
+ if sample [- 1 ] == predictions [idx ]:
81
+ score += 1
82
+ return score / n_test_samples
83
+
84
+
85
def main(train_size=.7, file='../data/spambase.data'):
    """Train and evaluate the naive Bayes classifier end to end.

    Args:
        train_size: Fraction of the dataset used for training.
        file: Location of the comma-separated dataset; the default is a path
            relative to the current working directory.

    Returns:
        The accuracy on the held-out rows, which is also printed.
    """
    train, test = preprocess_data(train_size, file)
    model = train_model(train)
    # Priors are estimated from the training split only.
    priors = get_priors(train)
    accuracy = test_model(model, priors, test)
    print("Classifier accuracy: {}".format(accuracy))
    return accuracy
91
+
92
+
93
# Run the pipeline with its default settings only when this file is executed
# as a script, not when it is imported.
if __name__ == '__main__':
    main()
0 commit comments