Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added comments on rnn code #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions engagement_score_model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def forward(self, text, text_lengths):
packed_output, (hidden, cell) = self.rnn(packed_embedded)

#unpack sequence
# TODO: are we using this?
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

#output = [sent len, batch size, hid dim * num directions]
Expand Down
10 changes: 6 additions & 4 deletions engagement_score_model/rnn_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
tweetFile = '../twitter/data/test.csv'
userFile = '../clustering/clusters.csv'
oFile = './engagement.csv'
weight_path = 'lstm_model.pt'

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
Expand Down Expand Up @@ -84,6 +85,7 @@
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters())
# Should we use L1Loss or SmoothL1Loss? How do we feel about outliers?
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not check in my latest code. Since we decided to frame this as a classification task (negative, neutral, positive), the loss should be CE loss. I didn't realize that my latest code didn't make it in.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

criterion = nn.BCEWithLogitsLoss()

criterion = nn.MSELoss()

model = model.to(device)
Expand All @@ -107,15 +109,15 @@

if valid_loss < best_valid_loss:
best_valid_loss = valid_loss
torch.save(model.state_dict(), 'lstm_model.pt')
torch.save(model.state_dict(), weight_path)

print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')

model.load_state_dict(torch.load('lstm_model.pt'))
model.load_state_dict(torch.load(weight_path))
test_loss, test_acc = util.evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

util.predict_engagement(model, "This film is terrible", TEXT, device)
util.predict_engagement(model, "This film is great", TEXT, device)
util.predict_engagement(model, "There is a climate crisis!", TEXT, device)
util.predict_engagement(model, "There is a climate issue", TEXT, device)
4 changes: 2 additions & 2 deletions engagement_score_model/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def binary_accuracy(preds, y):
"""
Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
"""
#round predictions to the closest integer
rounded_preds = torch.round(torch.sigmoid(preds))
correct = (rounded_preds == y).float() #convert into float for division
acc = correct.sum() / len(correct)
Expand Down Expand Up @@ -104,7 +103,7 @@ def construct_engagement_score_ds( tweetFile, userFile ):
parentFollower = tweet['follower_count'].values[0]+1
curTweetEntry.append( tweet['clean_text'].values[0] )
replyDf = tweetDf[tweetDf[parentIdCol]==pId]
#Shifting polarity to [0 to 2]
#Shifting polarity to [0 to 2] TODO: why aren't we using the continuous value as a feature?
replyDf.loc[:,egmtCol] = np.log((( replyDf['favorite_count'] +
replyDf['retweet_count'] + 1) * ( replyDf['polarity'] + 1 ) /
parentFollower ).values[0] + 1E-15)
Expand All @@ -114,6 +113,7 @@ def construct_engagement_score_ds( tweetFile, userFile ):
curTweetEntry.append( replyDf[replyDf[usrGrpCol]==grp]
[egmtCol].mean() )
else:
# What is this -15?
curTweetEntry.append( -15 )
engagementDf.append( curTweetEntry )

Expand Down