-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsentiment.py
155 lines (135 loc) · 4.36 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import csv
def replace_two_or_more(s):
    """Return the distinct whitespace-separated words of ``s``.

    Despite its name, this function only de-duplicates words. Words are
    returned in order of first appearance; building the list from a plain
    ``set`` would yield an order that can differ between interpreter runs
    because of string hash randomization.

    Args:
        s: Text to split on whitespace.

    Returns:
        A list containing each distinct word of ``s`` exactly once.
    """
    # dict keys keep insertion order, giving a stable de-duplication.
    return list(dict.fromkeys(s.split()))
#function to clean the tweet
# Characters deleted from every word: ' \ ? . ! " , ) ( [ ]
_PUNCTUATION_TO_DROP = str.maketrans("", "", "'\\?.!\",)([]")
# Words starting with any of these (mentions, entities, links, hashtags) are ignored.
_IGNORED_PREFIXES = ("@", "&", "www", "#", "http")


def process_tweet(tweet):
    """Turn raw tweet text into a list of cleaned feature words.

    The text is lower-cased and split into distinct words with
    ``replace_two_or_more``. Punctuation is then deleted from each word,
    and the word is discarded when it:

    * is empty once the punctuation is gone (e.g. "..." or "!!"),
    * is contained in the module-level ``stopwords`` collection,
    * starts with "@", "&", "www", "#" or "http", or
    * consists only of digits.

    Args:
        tweet: The raw sentence or tweet text.

    Returns:
        The list of remaining words, already lower-cased.
    """
    feature_vector = []
    for word in replace_two_or_more(tweet.lower()):
        # One pass removes every punctuation symbol listed above.
        word = word.strip().translate(_PUNCTUATION_TO_DROP)
        if (
            not word  # nothing left after cleaning: not a usable feature
            or word in stopwords
            or word.startswith(_IGNORED_PREFIXES)
            or word.isdigit()
        ):
            continue
        feature_vector.append(word)
    return feature_vector
#function to find max element key in a dictionary
def find_max(mydict):
    """Return the key of ``mydict`` that maps to the largest value.

    When several keys share the largest value, the first one in the
    dictionary's iteration order is returned. An empty dictionary makes
    ``max`` raise ``ValueError``.
    """
    best_key, _ = max(mydict.items(), key=lambda pair: pair[1])
    return best_key
# Shared list that get_stop_word_list() appends to and returns.
stop_words = []


# start getStopWordList
def get_stop_word_list(stop_word_list_file_name):
    """Load stop words from a text file, one word per line.

    Every line is stripped of surrounding whitespace and appended to the
    module-level ``stop_words`` list (a blank line therefore contributes
    an empty string). Because the list is shared, calling this function
    more than once accumulates entries.

    Args:
        stop_word_list_file_name: Path of the stop-word file to read.

    Returns:
        The module-level ``stop_words`` list, including the new entries.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(stop_word_list_file_name, 'r') as fp:
        stop_words.extend(line.strip() for line in fp)
    return stop_words
# end
# Denominator used by get_prob_class() and get_prob_features(); incremented during training.
total_words = 0
# Per-class totals keyed by the dataset's sentiment labels
# ('0' = negative, '2' = neutral, '4' = positive, as reported by predict()).
total_sentimental_occurences = dict.fromkeys(('0', '2', '4'), 0)
# Maps each known word to its per-class counts plus an overall 'count'.
feature_list = dict()
#get probability of occurence of a feature if it belongs to a certain class - P(feature|class)
def get_prob_features_under_class(feature_vector, sentiment):
    """Return P(features | class) as a product over the known words.

    Words missing from ``feature_list`` are ignored. For every known word
    the factor is its count under ``sentiment`` divided by that class's
    total in ``total_sentimental_occurences``.

    Args:
        feature_vector: Cleaned words of the sentence.
        sentiment: Class label used as key into the count tables.

    Returns:
        0 when no word of ``feature_vector`` is known or when the class
        total is zero; otherwise the product of the per-word ratios.
    """
    # NOTE(review): the source's indentation was lost; the zero-total
    # branch is assumed to belong to the class-total check - confirm.
    seen_words = [word for word in feature_vector if word in feature_list]
    if not seen_words:
        return 0
    class_total = total_sentimental_occurences[sentiment]
    if class_total == 0:
        return 0
    likelihood = 1
    for word in seen_words:
        likelihood *= feature_list[word][sentiment] / class_total
    return likelihood
#get probability of occurence of a certain class - P(class)
def get_prob_class(sentiment):
    """Return P(class): the class total divided by ``total_words``.

    Raises ``KeyError`` for an unknown ``sentiment`` label and
    ``ZeroDivisionError`` while ``total_words`` is still zero.
    """
    class_total = total_sentimental_occurences[sentiment]
    return class_total / total_words
#get probability of occurence of a certain feature - P(feature)
def get_prob_features(feature_vector):
    """Return P(features) as a product over the known words.

    Each word present in ``feature_list`` contributes its overall
    ``'count'`` divided by ``total_words``; unknown words are ignored.

    Args:
        feature_vector: Cleaned words of the sentence.

    Returns:
        0 when none of the words is known, otherwise the product as a float.
    """
    seen_features = [feature for feature in feature_vector if feature in feature_list]
    if not seen_features:
        return 0
    probability = 1.0000
    for feature in seen_features:
        probability *= feature_list[feature]['count'] / total_words
    return probability
#predict nature of a sentence according to its feature vector
def predict(feature_vector):
    """Print and return the most probable sentiment class for a sentence.

    Applies Bayes' rule per class,
    P(class|features) = P(features|class) * P(class) / P(features),
    and picks the class with the highest value via ``find_max``.

    Args:
        feature_vector: Cleaned words of the sentence.

    Returns:
        The winning class label ('0', '2' or '4'), or ``None`` (after
        printing a notice) when P(features) is 0, i.e. no word is known.
    """
    evidence = get_prob_features(feature_vector)
    if evidence == 0:
        print("No Data - you gotta train me more!")
        return None

    # Score every class the model knows about.
    posteriors = {
        sentiment: get_prob_features_under_class(feature_vector, sentiment)
        * get_prob_class(sentiment) / evidence
        for sentiment in total_sentimental_occurences
    }

    key = find_max(posteriors)
    # Any label other than '4' or '0' is reported as neutral.
    readable = {'4': "Positive", '0': "Negative"}.get(key, "Neutral")
    print("The sentence is ", readable)
    return key
# Read the tweets one by one and process it
stopwords = get_stop_word_list('stopwordsList.txt')
tweets = []
print("Training started .....")
# newline='' lets the csv module handle line endings inside quoted fields;
# the context manager closes the file once training has consumed it.
with open('test.csv', 'r', newline='') as training_file:
    inp_tweets = csv.reader(training_file, delimiter=',', quotechar='"')
    for row in inp_tweets:
        # Skip rows that cannot be used for training (blank lines, rows
        # without a 6th column, labels other than '0'/'2'/'4') instead of
        # aborting with IndexError/KeyError.
        if len(row) < 6 or row[0] not in total_sentimental_occurences:
            continue
        #first element in given dataset gives sentiment and 6th gives the actual tweet
        sentiment = row[0]
        tweet = row[5]
        #extract feature vector from a tweet by cleaning it
        feature_vector = process_tweet(tweet)
        for feature in feature_vector:
            if feature not in feature_list:
                #if a word is not in currently maintained list , then add it
                feature_list[feature] = {'0': 0, '2': 0, '4': 0, 'count': 0}
            #increment the sentiment count of the word
            feature_list[feature][sentiment] += 1
            feature_list[feature]['count'] += 1
            # NOTE(review): the source's indentation was lost; both totals
            # are assumed to be incremented once per word, matching the
            # "Total Words" label below - confirm.
            total_sentimental_occurences[sentiment] += 1
            total_words += 1
print("Training complete.")
print("Total Words - ", total_words)
print("Total classifications", total_sentimental_occurences)
while True:
    #input a sentence from user and predict its nature
    try:
        tweet = input("Enter a sentence : ")
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the prompt loop without a traceback.
        break
    feature_vector = process_tweet(tweet)
    predict(feature_vector)