Skip to content

Commit 7bbe114

Browse files
author
ga53ros
committed
Refactor naive bayes
1 parent 67fcb5a commit 7bbe114

File tree

2 files changed

+81
-57
lines changed

2 files changed

+81
-57
lines changed
File renamed without changes.

naive_bayes/naive_bayes.py

+81-57
Original file line numberDiff line numberDiff line change
@@ -4,67 +4,91 @@
44
import sys
55
np.set_printoptions(threshold=sys.maxsize)
66

7-
# load spambase dataset
8-
spambase = np.loadtxt('naive_bayes/spambase.data', delimiter=',')
97

10-
# shuffle and split
11-
np.random.shuffle(spambase)
12-
train = spambase[:2000, :]
13-
test = spambase[2000:, :]
14-
15-
16-
# quantize features such that values below median = 0, above median = 1
178
def quantize_features(arr, median):
    """Binarize every feature column of ``arr`` in place against ``median``.

    Each column except the last is overwritten with 1 where the value is
    greater than or equal to that column's threshold in ``median`` and with 0
    otherwise. The last column is left untouched (the other functions in this
    file read the class label from it).

    Args:
        arr: 2-D NumPy array; it is modified in place.
        median: per-column thresholds, indexable by column. Only the first
            ``arr.shape[1] - 1`` entries are used.

    Returns:
        The same ``arr`` object, after modification.
    """
    n_features = max(arr.shape[1] - 1, 0)
    thresholds = np.asarray(median)[:n_features]
    # A single vectorized comparison replaces the per-element Python loop;
    # the boolean result is cast to arr's dtype on assignment (True -> 1).
    arr[:, :n_features] = thresholds <= arr[:, :n_features]
    return arr
2214

2315

24-
medians = np.median(train, axis=0)
25-
X_train = quantize_features(train, medians).astype(int)
26-
X_test = quantize_features(test, medians).astype(int)
27-
28-
29-
# naive bayes training
30-
model = np.zeros((57, 2, 2))
31-
l_zeros = X_train[X_train[:, -1] == 0]
32-
l_ones = X_train[X_train[:, -1] == 1]
33-
zeros_sum = np.sum(l_zeros, axis=0)
34-
ones_sum = np.sum(l_ones, axis=0)
35-
36-
for feature in range(X_train.shape[1]-1):
37-
model[feature][0][1] = zeros_sum[feature] / l_zeros.shape[0]
38-
model[feature][0][0] = 1 - model[feature][0][1]
39-
model[feature][1][1] = ones_sum[feature] / l_ones.shape[0]
40-
model[feature][1][0] = 1 - model[feature][1][1]
41-
'''
42-
print("Given not spam, prob feature {} exists: {}, not exists: {}".format(feature,
43-
model[feature][0][1],
44-
model[feature][0][0]))
45-
print("Given spam, prob feature {} exists: {}, not exists: {}".format(feature,
46-
model[feature][1][1],
47-
model[feature][1][0]))
48-
'''
49-
50-
# test classifier
51-
not_spam_prior = l_zeros.shape[0] / X_train.shape[0]
52-
spam_prior = 1 - not_spam_prior
53-
print("Prior not spam: {}, prior spam: {}".format(not_spam_prior, spam_prior))
54-
n_test_samples = X_test.shape[0]
55-
predictions = np.zeros(n_test_samples, dtype=int)
56-
score = 0
57-
58-
for idx, sample in enumerate(X_test):
59-
y_pred_not_spam = not_spam_prior
60-
y_pred_spam = spam_prior
61-
for feature_idx in range(sample.shape[0]-1):
62-
y_pred_not_spam *= model[feature_idx][0][sample[feature_idx]]
63-
y_pred_spam *= model[feature_idx][1][sample[feature_idx]]
64-
prob_not_spam = y_pred_not_spam / (y_pred_not_spam + y_pred_spam)
65-
prob_spam = y_pred_spam / (y_pred_not_spam + y_pred_spam)
66-
predictions[idx] = 0 if prob_not_spam >= 0.5 else 1
67-
if sample[-1] == predictions[idx]:
68-
score += 1
69-
70-
print("Classifier accuracy: {}".format(score/n_test_samples))
16+
def preprocess_data(train_size, file):
    """Load a comma-separated dataset, split it and binarize both parts.

    The rows are shuffled with NumPy's global random state, the first
    ``round(train_size * n_rows)`` rows become the training part and the rest
    the test part. Both parts are quantized with the column medians of the
    training part only and converted to ``int``.

    Args:
        train_size: fraction of the rows used for training.
        file: path (or file object) handed to ``np.loadtxt``.

    Returns:
        Tuple ``(train, test)`` of integer arrays.
    """
    samples = np.loadtxt(file, delimiter=',')

    # Shuffle rows in place, then cut at the requested fraction.
    np.random.shuffle(samples)
    split_at = round(train_size * samples.shape[0])
    train_part, test_part = samples[:split_at, :], samples[split_at:, :]

    # Thresholds come from the training rows only and are reused for test.
    thresholds = np.median(train_part, axis=0)
    train_bin = quantize_features(train_part, thresholds).astype(int)
    test_bin = quantize_features(test_part, thresholds).astype(int)
    return train_bin, test_bin
30+
31+
32+
def train_model(train, log=False, alpha=0.0):
    """Estimate the per-feature Bernoulli likelihoods of a naive Bayes model.

    The last column of ``train`` is the class label (0 = not spam, 1 = spam);
    every other column is a binary feature.

    Args:
        train: 2-D integer array of binarized samples with the label last.
        log: when true, print the estimated probabilities for every feature.
        alpha: additive (Laplace) smoothing strength. The default ``0.0``
            reproduces the plain relative frequencies; a positive value keeps
            a feature that never occurs in a class from getting probability
            exactly 0, which would otherwise zero out the whole product at
            prediction time.

    Returns:
        Array ``model`` of shape ``(n_features, 2, 2)`` where
        ``model[f][c][v]`` is the estimated probability that feature ``f``
        has value ``v`` given class ``c``.
    """
    n_features = train.shape[1] - 1
    labels = train[:, -1]
    model = np.zeros((n_features, 2, 2))

    for label in (0, 1):
        rows = train[labels == label]
        # Column sums of a 0/1 matrix are the counts of "feature present".
        present_count = np.sum(rows, axis=0)[:n_features]
        p_present = (present_count + alpha) / (rows.shape[0] + 2 * alpha)
        model[:, label, 1] = p_present
        model[:, label, 0] = 1 - p_present

    if log:
        for feature in range(n_features):
            print("Given not spam, prob feature {} exists: {}, not exists: {}".format(
                feature, model[feature][0][1], model[feature][0][0]))
            print("Given spam, prob feature {} exists: {}, not exists: {}".format(
                feature, model[feature][1][1], model[feature][1][0]))
    return model
53+
54+
55+
def get_priors(data, log=False):
    """Compute the class priors from the label column of ``data``.

    Args:
        data: 2-D array whose last column holds the label (0 = not spam).
        log: when true, print both priors.

    Returns:
        Tuple ``(not_spam_prior, spam_prior)``; the second is one minus the
        first.
    """
    labels = data[:, -1]
    n_not_spam = int(np.count_nonzero(labels == 0))
    p_not_spam = n_not_spam / data.shape[0]
    p_spam = 1 - p_not_spam
    if log:
        print("Prior not spam: {}, prior spam: {}".format(p_not_spam, p_spam))
    return p_not_spam, p_spam
62+
63+
64+
def test_model(model, priors, test):
    """Classify every row of ``test`` and return the fraction predicted right.

    Class scores are accumulated as sums of log-probabilities instead of a
    product of probabilities. With many features the raw product underflows
    to 0 for both classes, which made the normalised probability 0/0 = NaN
    and silently forced the prediction to "spam".

    Args:
        model: array indexed as ``model[feature][class][value]``, as returned
            by ``train_model``.
        priors: pair ``(not_spam_prior, spam_prior)``.
        test: 2-D integer array of binarized samples with the label in the
            last column.

    Returns:
        Accuracy as a float: correct predictions divided by the row count.

    Raises:
        ZeroDivisionError: if ``test`` has no rows.
    """
    n_test_samples = test.shape[0]
    n_features = test.shape[1] - 1
    features = test[:, :n_features]
    labels = test[:, -1]
    feature_ids = np.arange(n_features)

    # log(0) = -inf is the intended encoding of an impossible event here, so
    # the divide-by-zero warning NumPy would emit is suppressed.
    with np.errstate(divide='ignore'):
        log_model = np.log(np.asarray(model, dtype=float))
        log_not_spam_prior = np.log(priors[0])
        log_spam_prior = np.log(priors[1])

    # Integer-array indexing broadcasts feature_ids against the (rows,
    # features) value matrix, picking log P(feature = value | class) per cell.
    log_not_spam = log_not_spam_prior + log_model[feature_ids, 0, features].sum(axis=1)
    log_spam = log_spam_prior + log_model[feature_ids, 1, features].sum(axis=1)

    # Predict "not spam" when its score is at least the spam score. If both
    # classes are impossible (-inf) or the scores are NaN, fall back to
    # "spam", which is what the original NaN comparison did.
    is_not_spam = (log_not_spam >= log_spam) & np.isfinite(log_not_spam)
    predictions = np.where(is_not_spam, 0, 1)

    score = int(np.count_nonzero(predictions == labels))
    return score / n_test_samples


# The name matches pytest's ``test_*`` pattern; this keeps pytest from trying
# to collect the evaluation helper as a test when it is imported into a test
# module.
test_model.__test__ = False
83+
84+
85+
def main(train_size=.7, data_file='../data/spambase.data'):
    """Train and evaluate the naive Bayes spam classifier, printing accuracy.

    Args:
        train_size: fraction of the dataset used for training.
        data_file: path to the comma-separated dataset. The default is a
            relative path, so it is resolved against the current working
            directory; pass an explicit path when running from elsewhere.
    """
    train, test = preprocess_data(train_size, data_file)
    model = train_model(train)
    # Priors are estimated from the training split only.
    priors = get_priors(train)
    accuracy = test_model(model, priors, test)
    print("Classifier accuracy: {}".format(accuracy))


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)