Commit 1042ebe

fix pre-processing symbols to space instead of empty string

1 parent a9210be
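The one-character change in the commit title matters because deleting symbols outright can fuse neighboring words into a single bogus token, while substituting a space keeps the word boundary intact. A quick illustration with plain `re.sub` (the sample tweet is made up; the repo applies the same pattern through pandas' `Series.replace`, as the `load_data.py` diff below shows):

```python
import re

tweet = "@AirCanada best-airline,ever!"

# old behaviour: non-alphabet characters removed outright -> words fuse together
print(re.sub(r'[^a-zA-Z\s]', '', tweet))   # 'AirCanada bestairlineever'

# new behaviour: non-alphabet characters become spaces -> word boundaries survive
print(re.sub(r'[^a-zA-Z\s]', ' ', tweet))  # ' AirCanada best airline ever '
```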

12 files changed

+2,921 −2,355 lines changed

README.md

+14 −13
````diff
@@ -93,50 +93,51 @@ val freq [473 620 620]
 Extracting ...
 Extracting ...
 Training cv 1 ...
-[LibSVM]0.725627553999
+[LibSVM]0.880910683012


 train freq [1890 2479 2479]
 val freq [473 620 620]
 Extracting ...
 Extracting ...
 Training cv 2 ...
-[LibSVM]0.72270869819
+[LibSVM]0.879743140689


 train freq [1890 2479 2479]
 val freq [473 620 620]
 Extracting ...
 Extracting ...
 Training cv 3 ...
-[LibSVM]0.72270869819
+[LibSVM]0.877408056042
+
 ```
 then use it to predict on tweets that mentions AirCanada.

 Group the tweets based on the sentiment classified by CNN model, and we can find the most frequent words from each group,

 ```python
-ALL = Prediction(FILE_PATH, 'air.csv', max_len_train=19)
-ALL.prepare_data(wv_size=600)
+ALL = Prediction(FILE_PATH, 'FOUR_AIRLINES.csv', max_len_train=19)
+ALL.prepare_data(['text', 'airline'], wv_size=600)
 ALL.get_result(n_preview=10, n_top = 20, name='ALL_result',verbose=False)
 ```

 ```
 ===Positive===
-[('thanks', 20), ('great', 20), ('everyone', 11), ('sharing', 11), ('news', 11),
-('thank', 7), ('chicagoseminars', 4), ('amazing', 4), ('howierappaport', 2), ('laptop', 2),
-('yyz', 2), ('ready', 2), ('service', 2), ('needed', 2), ('much', 2), ('flight', 2),
-('beat', 2), ('plugs', 2), ('helpful', 2), ('jet', 2)]
+[('thanks', 170), ('thank', 139), ('great', 136), ('flight', 120), ('service', 65),
+('love', 48), ('fly', 44), ('crew', 39), ('leggings', 38), ('best', 38),
+('flying', 35), ('much', 34), ('night', 34), ('good', 34), ('always', 32),
+('us', 31), ('home', 31), ('time', 30), ('last', 30), ('got', 30)]
 ```
 ![png](output_2_1.png)


 ```
 ===Negative===
-[('flight', 173), ('time', 63), ('get', 53), ('thanks', 45), ('service', 42),
-('im', 42), ('us', 41), ('great', 41), ('fly', 37), ('hours', 34), ('w', 33),
-('plane', 32), ('flights', 32), ('flying', 31), ('one', 31), ('delayed', 31),
-('dont', 31), ('travel', 30), ('please', 30), ('home', 29)]
+[('flight', 400), ('get', 155), ('stop', 144), ('tickets', 137), ('time', 128),
+('seaworld', 127), ('selling', 123), ('via', 122), ('urge', 121), ('service', 108),
+('customer', 100), ('still', 95), ('one', 92), ('delayed', 91), ('flights', 83),
+('us', 78), ('bag', 75), ('flying', 74), ('hours', 72), ('hour', 70)]
 ```

 ![png](output_2_3.png)
````
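The per-fold jump from roughly 0.72 to 0.88 above is the reported validation accuracy across three cross-validation folds after retraining with the pre-processing fix. As a rough, self-contained sketch of how per-fold scores like these are produced (the features and labels below are random stand-ins; the repo's real pipeline extracts the features with a theano CNN first):

```python
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# stand-ins for CNN-extracted tweet features and sentiment labels
rng = np.random.RandomState(0)
features = rng.randn(300, 128)
labels = rng.randint(0, 3, size=300)  # negative / neutral / positive

clf = SVC(kernel='linear')  # libsvm-backed, hence "[LibSVM]" in the training log
print(cross_val_score(clf, features, labels, cv=3))  # one accuracy per fold
```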

files/model/.gitkeep

Whitespace-only changes.

neg.png

26.6 KB

output_2_1.png

-27.1 KB
Binary file not shown.

output_2_3.png

-24.8 KB
Binary file not shown.

pos.png

26.1 KB

python_model/Total_result.csv

+2,214 −2,214
Large diffs are not rendered by default.

python_model/cnn_svm.ipynb

+118 −79
Large diffs are not rendered by default.

python_model/load_data.py

+13 −11
```diff
@@ -67,24 +67,24 @@ def csv_df(self, csv_fields):
     def pre_process(self, df):
         print("Note: pre-process changes the dataframe inplace.")
         # remove new line char
-        df['text'].replace(regex=True,inplace=True,to_replace='(\\n|\\r|\\r\\n)',value='')
+        df['text'].replace(regex=True,inplace=True,to_replace='(\\n|\\r|\\r\\n)',value=' ')
         # remove https links
-        df['text'].replace(regex=True,inplace=True,to_replace=r'(http|https):\/\/[^(\s|\b)]+',value=r'')
+        df['text'].replace(regex=True,inplace=True,to_replace='(http|https):\/\/[^(\s|\b)]+',value=' ')
         # remove user name
-        df['text'].replace(regex=True,inplace=True,to_replace=r'@\w+',value=r'')
+        df['text'].replace(regex=True,inplace=True,to_replace='@\w+',value=' ')
         # remove non-alphabet, this includes number and punctuation
-        df['text'].replace(regex=True,inplace=True,to_replace=r'[^a-zA-Z\s]',value=r'')
+        df['text'].replace(regex=True,inplace=True,to_replace='[^a-zA-Z\s]',value=' ')
         # tokenize each tweets to form sentences.
         df['tokenized'] = df['text'].apply(lambda row: nltk.word_tokenize(row.lower()))
         # remove stop words
         stop_words = stopwords.words('english')
         add_stop_words = ['amp', 'rt']
         stop_words += add_stop_words
-        # also remove english names
-        last_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"last_names.txt", usecols=0, dtype=str)[:5000]]
-        stop_words += last_names
-        first_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"first_names.txt", usecols=0, dtype=str)]
-        stop_words += first_names
+        # # also remove english names
+        # last_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"last_names.txt", usecols=0, dtype=str)[:5000]]
+        # stop_words += last_names
+        # first_names = [x.lower() for x in np.loadtxt(self.FILE_PATH+"first_names.txt", usecols=0, dtype=str)]
+        # stop_words += first_names
         # print "sample stopping words: ", stop_words[:5]
         df['tokenized'] = df['tokenized'].apply(lambda x: [item for item in x if item not in stop_words])

@@ -117,7 +117,7 @@ def max_len(self, df):

     # initialize empty arry to fill with vector repsentation
     def convert2vec(self, df, max_length, model, name='default'):
-        file_name = self.FILE_PATH + name
+        file_name = self.FILE_PATH + 'data/' + name
         if os.path.isfile(file_name + '.npy'):
             print "npy already exists, loading ..."
             tweet_vecs = np.load(file_name + '.npy')
@@ -131,6 +131,8 @@ def convert2vec(self, df, max_length, model, name='default'):
         tweet_vecs = np.zeros((n,m,self.vec_size))
         vocabs = model.wv.vocab.keys()
         for i in range(n):
+            if i%2000 == 0:
+                print ">>> " + str(i) + " tweets converted ..."
             token_i = [x for x in tweet_tokens[i] if x in vocabs]
             m_i = len(token_i)

@@ -154,7 +156,7 @@ def standarize(self, tweet_vecs):

     # save tweet_vecs to disk in npy
     def save_vec(self, tweet_vecs, name='default'):
-        file_name = self.FILE_PATH + name
+        file_name = self.FILE_PATH + 'data/' + name
         if os.path.isfile(file_name + '.npy') and os.path.isfile(file_name + '.npz'):
             print "npy already exists."
         else:
```
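Beyond the `data/` cache path and the progress printout added above, `convert2vec`'s job (visible in the context lines) is to turn each tokenized tweet into a fixed-size block of word vectors, zero-padded out to `max_length`. A minimal stand-in for that padding step; `wv` here is any dict-like token-to-vector mapping, whereas the repo indexes a trained gensim word2vec model:

```python
import numpy as np

def tweets_to_array(token_lists, wv, max_length, vec_size):
    """Stack per-tweet word vectors into a zero-padded (n, max_length, vec_size) array."""
    out = np.zeros((len(token_lists), max_length, vec_size))
    for i, tokens in enumerate(token_lists):
        known = [t for t in tokens if t in wv][:max_length]  # drop out-of-vocab tokens
        for j, tok in enumerate(known):
            out[i, j] = wv[tok]
    return out

# toy usage with a 2-dimensional "embedding"
wv = {'great': np.array([1.0, 0.0]), 'flight': np.array([0.0, 1.0])}
print(tweets_to_array([['great', 'flight'], ['great']], wv, max_length=3, vec_size=2))
```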

python_model/model_predictions.py

+2 −2
```diff
@@ -136,11 +136,11 @@ def make_prediction(self, data, verbose=True):
         data = data.reshape(-1, 1, M, D).astype(theano.config.floatX) # theano needs this way

         cnn = build_cnn(M, D)
-        model_file = self.file_path+'nn_cnn'
+        model_file = self.file_path+'model/nn_cnn1'
         cnn.load_params_from(model_file)

         extract_data = extract_features(cnn, data)
-        clf = joblib.load(self.file_path+'svm-final.pkl')
+        clf = joblib.load(self.file_path+'model/cnn-svm-final.pkl')
         test_pred = clf.predict(extract_data)

         return test_pred + 1
```
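The two path edits above just relocate the serialized models into a `model/` subdirectory; the surrounding context shows the standard pattern of reloading a pickled scikit-learn SVM with joblib and predicting on CNN-extracted features. A minimal sketch of that persistence round-trip with illustrative data and file names (older scikit-learn exposes joblib as `sklearn.externals.joblib`, which is likely what this repo uses):

```python
import numpy as np
import joblib  # on older scikit-learn: from sklearn.externals import joblib
from sklearn.svm import SVC

# illustrative stand-ins for CNN-extracted features and 0-based labels
X = np.random.randn(40, 16)
y = np.random.randint(0, 3, size=40)

clf = SVC(kernel='linear').fit(X, y)
joblib.dump(clf, 'cnn-svm-final.pkl')   # persist the fitted SVM

clf = joblib.load('cnn-svm-final.pkl')  # reload at prediction time
print(clf.predict(X[:5]) + 1)           # the repo shifts labels to 1..3
```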

python_model/predictions.ipynb

+26 −36
Large diffs are not rendered by default.
