Commit 1042ebe

fix pre-processing symbols to space instead of empty string

1 parent a9210be
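The one-character change in the commit title matters because deleting symbols outright can fuse neighboring words into a single bogus token, while substituting a space keeps the word boundary intact. A quick illustration with plain `re.sub` (the sample tweet is made up; the repo applies the same pattern through pandas' `Series.replace`, as the `load_data.py` diff below shows):

```python
import re

tweet = "@AirCanada best-airline,ever!"

# old behaviour: non-alphabet characters removed outright -> words fuse together
print(re.sub(r'[^a-zA-Z\s]', '', tweet))   # 'AirCanada bestairlineever'

# new behaviour: non-alphabet characters become spaces -> word boundaries survive
print(re.sub(r'[^a-zA-Z\s]', ' ', tweet))  # ' AirCanada best airline ever '
```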

12 files changed

+2,921 −2,355 lines changed

README.md

+14 −13
````diff
@@ -93,50 +93,51 @@ val freq [473 620 620]
 Extracting ...
 Extracting ...
 Training cv 1 ...
-[LibSVM]0.725627553999
+[LibSVM]0.880910683012


 train freq [1890 2479 2479]
 val freq [473 620 620]
 Extracting ...
 Extracting ...
 Training cv 2 ...
-[LibSVM]0.72270869819
+[LibSVM]0.879743140689


 train freq [1890 2479 2479]
 val freq [473 620 620]
 Extracting ...
 Extracting ...
 Training cv 3 ...
-[LibSVM]0.72270869819
+[LibSVM]0.877408056042
+
 ```
 then use it to predict on tweets that mentions AirCanada.

 Group the tweets based on the sentiment classified by CNN model, and we can find the most frequent words from each group,

 ```python
-ALL = Prediction(FILE_PATH, 'air.csv', max_len_train=19)
-ALL.prepare_data(wv_size=600)
+ALL = Prediction(FILE_PATH, 'FOUR_AIRLINES.csv', max_len_train=19)
+ALL.prepare_data(['text', 'airline'], wv_size=600)
 ALL.get_result(n_preview=10, n_top = 20, name='ALL_result',verbose=False)
 ```

 ```
 ===Positive===
-[('thanks', 20), ('great', 20), ('everyone', 11), ('sharing', 11), ('news', 11),
-('thank', 7), ('chicagoseminars', 4), ('amazing', 4), ('howierappaport', 2), ('laptop', 2),
-('yyz', 2), ('ready', 2), ('service', 2), ('needed', 2), ('much', 2), ('flight', 2),
-('beat', 2), ('plugs', 2), ('helpful', 2), ('jet', 2)]
+[('thanks', 170), ('thank', 139), ('great', 136), ('flight', 120), ('service', 65),
+('love', 48), ('fly', 44), ('crew', 39), ('leggings', 38), ('best', 38),
+('flying', 35), ('much', 34), ('night', 34), ('good', 34), ('always', 32),
+('us', 31), ('home', 31), ('time', 30), ('last', 30), ('got', 30)]
 ```
 ![png](output_2_1.png)


 ```
 ===Negative===
-[('flight', 173), ('time', 63), ('get', 53), ('thanks', 45), ('service', 42),
-('im', 42), ('us', 41), ('great', 41), ('fly', 37), ('hours', 34), ('w', 33),
-('plane', 32), ('flights', 32), ('flying', 31), ('one', 31), ('delayed', 31),
-('dont', 31), ('travel', 30), ('please', 30), ('home', 29)]
+[('flight', 400), ('get', 155), ('stop', 144), ('tickets', 137), ('time', 128),
+('seaworld', 127), ('selling', 123), ('via', 122), ('urge', 121), ('service', 108),
+('customer', 100), ('still', 95), ('one', 92), ('delayed', 91), ('flights', 83),
+('us', 78), ('bag', 75), ('flying', 74), ('hours', 72), ('hour', 70)]
 ```

 ![png](output_2_3.png)
````
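The per-fold jump from roughly 0.72 to 0.88 above is the reported validation accuracy across three cross-validation folds after retraining with the pre-processing fix. As a rough, self-contained sketch of how per-fold scores like these are produced (the features and labels below are random stand-ins; the repo's real pipeline extracts the features with a theano CNN first):

```python
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# stand-ins for CNN-extracted tweet features and sentiment labels
rng = np.random.RandomState(0)
features = rng.randn(300, 128)
labels = rng.randint(0, 3, size=300)  # negative / neutral / positive

clf = SVC(kernel='linear')  # libsvm-backed, hence "[LibSVM]" in the training log
print(cross_val_score(clf, features, labels, cv=3))  # one accuracy per fold
```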

files/model/.gitkeep

Whitespace-only changes.

neg.png

26.6 KB

output_2_1.png

-27.1 KB
Binary file not shown.

output_2_3.png

-24.8 KB
Binary file not shown.

pos.png

26.1 KB

python_model/Total_result.csv

+2,214 −2,214
Large diffs are not rendered by default.

python_model/cnn_svm.ipynb

+118 −79
Large diffs are not rendered by default.

python_model/load_data.py

+13 −11
```diff
@@ -67,24 +67,24 @@ def csv_df(self, csv_fields):
     def pre_process(self, df):
         print("Note: pre-process changes the dataframe inplace.")
         # remove new line char
-        df['text'].replace(regex=True,inplace=True,to_replace='(\\n|\\r|\\r\\n)',value='')
+        df['text'].replace(regex=True,inplace=True,to_replace='(\\n|\\r|\\r\\n)',value=' ')
         # remove https links
-        df['text'].replace(regex=True,inplace=True,to_replace=r'(http|https):\/\/[^(\s|\b)]+',value=r'')
+        df['text'].replace(regex=True,inplace=True,to_replace='(http|https):\/\/[^(\s|\b)]+',value=' ')
         # remove user name
-        df['text'].replace(regex=True,inplace=True,to_replace=r'@\w+',value=r'')
+        df['text'].replace(regex=True,inplace=True,to_replace='@\w+',value=' ')
         # remove non-alphabet, this includes number and punctuation
-        df['text'].replace(regex=True,inplace=True,to_replace=r'[^a-zA-Z\s]',value=r'')
+        df['text'].replace(regex=True,inplace=True,to_replace='[^a-zA-Z\s]',value=' ')
         # tokenize each tweets to form sentences.
         df['tokenized'] = df['text'].apply(lambda row: nltk.word_tokenize(row.lower()))
         # remove stop words
         stop_words = stopwords.words('english')
         add_stop_words = ['amp', 'rt']
         stop_words += add_stop_words
-        # also remove english names
-        last_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"last_names.txt", usecols=0, dtype=str)[:5000]]
-        stop_words += last_names
-        first_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"first_names.txt", usecols=0, dtype=str)]
-        stop_words += first_names
+        # # also remove english names
+        # last_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"last_names.txt", usecols=0, dtype=str)[:5000]]
+        # stop_words += last_names
+        # first_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"first_names.txt", usecols=0, dtype=str)]
+        # stop_words += first_names
         # print "sample stopping words: ", stop_words[:5]
         df['tokenized'] = df['tokenized'].apply(lambda x: [item for item in x if item not in stop_words])

@@ -117,7 +117,7 @@ def max_len(self, df):

     # initialize empty arry to fill with vector repsentation
     def convert2vec(self, df, max_length, model, name='default'):
-        file_name = self.FILE_PATH + name
+        file_name = self.FILE_PATH + 'data/' + name
         if os.path.isfile(file_name + '.npy'):
             print "npy already exists, loading ..."
             tweet_vecs = np.load(file_name + '.npy')
@@ -131,6 +131,8 @@ def convert2vec(self, df, max_length, model, name='default'):
         tweet_vecs = np.zeros((n,m,self.vec_size))
         vocabs = model.wv.vocab.keys()
         for i in range(n):
+            if i%2000 == 0:
+                print ">>> " + str(i) + " tweets converted ..."
             token_i = [x for x in tweet_tokens[i] if x in vocabs]
             m_i = len(token_i)

@@ -154,7 +156,7 @@ def standarize(self, tweet_vecs):

     # save tweet_vecs to disk in npy
     def save_vec(self, tweet_vecs, name='default'):
-        file_name = self.FILE_PATH + name
+        file_name = self.FILE_PATH + 'data/' + name
         if os.path.isfile(file_name + '.npy') and os.path.isfile(file_name + '.npz'):
             print "npy already exists."
         else:
```
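Beyond the `data/` cache path and the progress printout added above, `convert2vec`'s job (visible in the context lines) is to turn each tokenized tweet into a fixed-size block of word vectors, zero-padded out to `max_length`. A minimal stand-in for that padding step; `wv` here is any dict-like token-to-vector mapping, whereas the repo indexes a trained gensim word2vec model:

```python
import numpy as np

def tweets_to_array(token_lists, wv, max_length, vec_size):
    """Stack per-tweet word vectors into a zero-padded (n, max_length, vec_size) array."""
    out = np.zeros((len(token_lists), max_length, vec_size))
    for i, tokens in enumerate(token_lists):
        known = [t for t in tokens if t in wv][:max_length]  # drop out-of-vocab tokens
        for j, tok in enumerate(known):
            out[i, j] = wv[tok]
    return out

# toy usage with a 2-dimensional "embedding"
wv = {'great': np.array([1.0, 0.0]), 'flight': np.array([0.0, 1.0])}
print(tweets_to_array([['great', 'flight'], ['great']], wv, max_length=3, vec_size=2))
```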

python_model/model_predictions.py

+2 −2
```diff
@@ -136,11 +136,11 @@ def make_prediction(self, data, verbose=True):
         data = data.reshape(-1, 1, M, D).astype(theano.config.floatX) # theano needs this way

         cnn = build_cnn(M, D)
-        model_file = self.file_path+'nn_cnn'
+        model_file = self.file_path+'model/nn_cnn1'
         cnn.load_params_from(model_file)

         extract_data = extract_features(cnn, data)
-        clf = joblib.load(self.file_path+'svm-final.pkl')
+        clf = joblib.load(self.file_path+'model/cnn-svm-final.pkl')
         test_pred = clf.predict(extract_data)

         return test_pred + 1
```
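The two path edits above just relocate the serialized models into a `model/` subdirectory; the surrounding context shows the standard pattern of reloading a pickled scikit-learn SVM with joblib and predicting on CNN-extracted features. A minimal sketch of that persistence round-trip with illustrative data and file names (older scikit-learn exposes joblib as `sklearn.externals.joblib`, which is likely what this repo uses):

```python
import numpy as np
import joblib  # on older scikit-learn: from sklearn.externals import joblib
from sklearn.svm import SVC

# illustrative stand-ins for CNN-extracted features and 0-based labels
X = np.random.randn(40, 16)
y = np.random.randint(0, 3, size=40)

clf = SVC(kernel='linear').fit(X, y)
joblib.dump(clf, 'cnn-svm-final.pkl')   # persist the fitted SVM

clf = joblib.load('cnn-svm-final.pkl')  # reload at prediction time
print(clf.predict(X[:5]) + 1)           # the repo shifts labels to 1..3
```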

python_model/predictions.ipynb

+26 −36
Large diffs are not rendered by default.
