From 703a9b06f1c802ed7767f17ede818f2de34e506f Mon Sep 17 00:00:00 2001 From: Rui Aguiar Date: Thu, 30 May 2019 16:28:40 -0700 Subject: [PATCH] added comments --- engagement_score_model/models.py | 1 + engagement_score_model/rnn_regressor.py | 10 ++++++---- engagement_score_model/util.py | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/engagement_score_model/models.py b/engagement_score_model/models.py index 8638115..43f2e9b 100644 --- a/engagement_score_model/models.py +++ b/engagement_score_model/models.py @@ -26,6 +26,7 @@ def forward(self, text, text_lengths): packed_output, (hidden, cell) = self.rnn(packed_embedded) #unpack sequence + # TODO: are we using this? output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output) #output = [sent len, batch size, hid dim * num directions] diff --git a/engagement_score_model/rnn_regressor.py b/engagement_score_model/rnn_regressor.py index 06d921a..73d8e98 100644 --- a/engagement_score_model/rnn_regressor.py +++ b/engagement_score_model/rnn_regressor.py @@ -30,6 +30,7 @@ tweetFile = '../twitter/data/test.csv' userFile = '../clustering/clusters.csv' oFile = './engagement.csv' +weight_path = 'lstm_model.pt' torch.manual_seed(SEED) torch.backends.cudnn.deterministic = True @@ -84,6 +85,7 @@ model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM) optimizer = optim.Adam(model.parameters()) +# Should we use L1Loss or SmoothL1Loss? How do we feel about outliers? criterion = nn.MSELoss() model = model.to(device) @@ -107,15 +109,15 @@ if valid_loss < best_valid_loss: best_valid_loss = valid_loss - torch.save(model.state_dict(), 'lstm_model.pt') + torch.save(model.state_dict(), weight_path) print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s') print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%') print(f'\t Val. Loss: {valid_loss:.3f} | Val. 
Acc: {valid_acc*100:.2f}%') -model.load_state_dict(torch.load('lstm_model.pt')) +model.load_state_dict(torch.load(weight_path)) test_loss, test_acc = util.evaluate(model, test_iterator, criterion) print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%') -util.predict_engagement(model, "This film is terrible", TEXT, device) -util.predict_engagement(model, "This film is great", TEXT, device) +util.predict_engagement(model, "There is a climate crisis!", TEXT, device) +util.predict_engagement(model, "There is a climate issue", TEXT, device) diff --git a/engagement_score_model/util.py b/engagement_score_model/util.py index 634fe47..9219fc7 100644 --- a/engagement_score_model/util.py +++ b/engagement_score_model/util.py @@ -12,7 +12,6 @@ def binary_accuracy(preds, y): """ Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8 """ - #round predictions to the closest integer rounded_preds = torch.round(torch.sigmoid(preds)) correct = (rounded_preds == y).float() #convert into float for division acc = correct.sum() / len(correct) @@ -104,7 +103,7 @@ def construct_engagement_score_ds( tweetFile, userFile ): parentFollower = tweet['follower_count'].values[0]+1 curTweetEntry.append( tweet['clean_text'].values[0] ) replyDf = tweetDf[tweetDf[parentIdCol]==pId] - #Shifting to polarity to [0 to 2] + #Shifting to polarity to [0 to 2] TODO: why aren't we using the continuous value as a feature? replyDf.loc[:,egmtCol] = np.log((( replyDf['favorite_count'] + replyDf['retweet_count'] + 1) * ( replyDf['polarity'] + 1 ) / parentFollower ).values[0] + 1E-15) @@ -114,6 +113,7 @@ def construct_engagement_score_ds( tweetFile, userFile ): curTweetEntry.append( replyDf[replyDf[usrGrpCol]==grp] [egmtCol].mean() ) else: + # What is this -15? curTweetEntry.append( -15 ) engagementDf.append( curTweetEntry )