Add code for document classification and language modelling
Showing 34 changed files with 3,711 additions and 387 deletions.
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import torch
import gensim
import numpy as np
import pickle as cPickle
import torch.optim as optim
import time


# In[2]:


import torch.nn as nn
import torch.nn.functional as F

class LSTMcell(nn.Module):

    def __init__(self, input_size, hidden_size, output_size):
        # output_size is accepted for interface consistency but unused by the cell itself
        super(LSTMcell, self).__init__()
        self.hidden_size = hidden_size

        """
        LSTM cell basic operations: one linear layer per gate, each acting
        on the concatenated [input, hidden] vector.
        """
        self.i2ft = nn.Linear(input_size + hidden_size, hidden_size, bias=True)      # forget gate
        self.i2it = nn.Linear(input_size + hidden_size, hidden_size, bias=True)      # input gate
        self.i2cdasht = nn.Linear(input_size + hidden_size, hidden_size, bias=True)  # candidate cell state
        self.i2o = nn.Linear(input_size + hidden_size, hidden_size, bias=True)       # output gate

    def forward(self, input, hidden_state, cell_state):
        """
        input dimension = (batch_size x 300), where 300 is the word-embedding dimension
        hidden state dimension = (batch_size x 300), the hidden-state size used in the paper
        """
        combined = torch.cat((input, hidden_state), dim=1)

        forget_gate = torch.sigmoid(self.i2ft(combined))
        i_t = torch.sigmoid(self.i2it(combined))
        c_dash = torch.tanh(self.i2cdasht(combined))
        cell_state = forget_gate * cell_state + i_t * c_dash

        """
        NOTE: layer normalization [2] is meant to be applied after the cell
        state is computed; it is not implemented here.
        """
        output_state = torch.sigmoid(self.i2o(combined))
        hidden_state = output_state * torch.tanh(cell_state)

        return hidden_state, cell_state

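# Sketch (an assumption, not implemented in this commit) of the layer
# normalization that the note above calls for. A hypothetical variant would
# add `self.ln_cell = nn.LayerNorm(hidden_size)` in __init__ and normalize
# the fresh cell state before the output gate reads it:
#
#     cell_state = self.ln_cell(forget_gate * cell_state + i_t * c_dash)
#     hidden_state = output_state * torch.tanh(cell_state)
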
# In[3]:


class LSTMclassifier(nn.Module):
    """
    Classification head on top of the LSTM outputs.
    """

    def __init__(self, input_size, hidden_size, output_size, glove_weights):
        super(LSTMclassifier, self).__init__()
        self.hidden_size = hidden_size
        self.labels = output_size

        """
        GloVe embedding initialization
        """
        self.embedding = nn.Embedding.from_pretrained(glove_weights)

        self.lstm = LSTMcell(input_size, hidden_size, output_size)

        """
        Pooling layer: mean pooling across time.
        pooling input dimension:  (batch_size x max_num_of_words x 300)
        pooling output dimension: (batch_size x 300)
        The intermediate layer size is 100.
        """
        self.layer1 = nn.Linear(self.hidden_size, 100, bias=True)
        self.layer2 = nn.Linear(100, self.labels, bias=True)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, max_num_of_words):
        input = self.embedding(input).float()
        batch_size = input.size()[0]
        hidden_state = torch.zeros(batch_size, self.hidden_size)
        cell_state = torch.zeros(batch_size, self.hidden_size)

        """
        output is the concatenation of the hidden states at all time steps
        """
        output = torch.zeros((batch_size, max_num_of_words, self.hidden_size))
        if torch.cuda.is_available():
            output = output.cuda()
            hidden_state = hidden_state.cuda()
            cell_state = cell_state.cuda()

        for i in range(max_num_of_words):
            hidden_state, cell_state = self.lstm(input[:, i, :], hidden_state, cell_state)
            output[:, i, :] = hidden_state

        # mean pooling across the time dimension: (batch, time, hidden) -> (batch, hidden)
        time_avg_output = output.mean(dim=1)
        linear_layer = torch.sigmoid(self.layer1(time_avg_output))
        final_output = torch.sigmoid(self.layer2(linear_layer))
        final_output = self.softmax(final_output)

        return final_output

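# Hypothetical smoke test (not part of the original commit); the vocabulary
# size and sequence length below are illustrative stand-ins:
#
#     vocab = torch.randn(1000, 300)            # stand-in for GloVe weights
#     clf = LSTMclassifier(300, 300, 4, vocab)
#     tokens = torch.randint(0, 1000, (8, 20))  # batch of 8 texts, 20 words each
#     log_probs = clf(tokens, 20)               # -> shape (8, 4), log-probabilities
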
# In[5]:


class Dataset(object):

    def load_data(self, dataset):
        self.data = dataset

        if self.data == 'yahoo':
            self.loadpath = "./data/LEAM_dataset/yahoo.p"
            self.embpath = "./data/LEAM_dataset/yahoo_glove.p"
            self.num_class = 10
            self.class_name = ['Society Culture',
                               'Science Mathematics',
                               'Health',
                               'Education Reference',
                               'Computers Internet',
                               'Sports',
                               'Business Finance',
                               'Entertainment Music',
                               'Family Relationships',
                               'Politics Government']
        elif self.data == 'agnews':
            self.loadpath = "./data/LEAM_dataset/ag_news.p"
            self.embpath = "./data/LEAM_dataset/ag_news_glove.p"
            self.num_class = 4
            self.class_name = ['World',
                               'Sports',
                               'Business',
                               'Science']
        elif self.data == 'dbpedia':
            self.loadpath = "./data/LEAM_dataset/dbpedia.p"
            self.embpath = "./data/LEAM_dataset/dbpedia_glove.p"
            self.num_class = 14
            self.class_name = ['Company',
                               'Educational Institution',
                               'Artist',
                               'Athlete',
                               'Office Holder',
                               'Mean Of Transportation',
                               'Building',
                               'Natural Place',
                               'Village',
                               'Animal',
                               'Plant',
                               'Album',
                               'Film',
                               'Written Work']
        elif self.data == 'yelp_full':
            self.loadpath = "./data/LEAM_dataset/yelp_full.p"
            self.embpath = "./data/LEAM_dataset/yelp_full_glove.p"
            self.num_class = 5
            self.class_name = ['worst',
                               'bad',
                               'middle',
                               'good',
                               'best']

        # the pickle unpacks into the train/val/test splits, their labels,
        # and the vocabulary maps
        x = cPickle.load(open(self.loadpath, "rb"), encoding="latin1")
        self.train, self.val, self.test = x[0], x[1], x[2]
        self.train_lab, self.val_lab, self.test_lab = x[3], x[4], x[5]
        self.wordtoix, self.ixtoword = x[6], x[7]
        del x

        print("load data finished:", self.data)

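# Illustrative usage (not in the original commit), assuming the LEAM pickles
# exist at the paths above:
#
#     ds = Dataset()
#     ds.load_data('agnews')
#     print(len(ds.train), ds.num_class, len(ds.wordtoix))
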
# In[6]:


def eval_model(model, data, label, batch_size):
    total_epoch_loss = 0
    total_epoch_acc = 0
    loss_fn = nn.NLLLoss()
    model.eval()
    steps = 0

    with torch.no_grad():
        for start in range(0, len(data), batch_size):
            end = min(start + batch_size, len(data))
            text = torch.nn.utils.rnn.pad_sequence(data[start:end], batch_first=True)
            target = label[start:end].long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text, text.size()[1])
            loss = loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects / (end - start)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()
            steps += 1

    return total_epoch_loss / steps, total_epoch_acc / steps

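# For reference (an illustration, not from the original commit): pad_sequence
# with batch_first=True right-pads variable-length index tensors with 0 so
# they stack into one (batch, max_len) tensor, which is why the model is
# called with text.size()[1] as the number of time steps:
#
#     a, b = torch.tensor([1, 2, 3]), torch.tensor([4, 5])
#     torch.nn.utils.rnn.pad_sequence([a, b], batch_first=True)
#     # tensor([[1, 2, 3],
#     #         [4, 5, 0]])
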
# In[7]:


def train_model(model, data, label, batch_size, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0

    if torch.cuda.is_available():
        model.cuda()

    # NOTE: a fresh optimizer is created on every call, i.e. once per epoch,
    # so Adam's moment estimates are reset at each epoch boundary
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
    steps = 0
    loss_fn = nn.NLLLoss()
    model.train()

    for start in range(0, len(data), batch_size):
        end = min(start + batch_size, len(data))
        text = torch.nn.utils.rnn.pad_sequence(data[start:end], batch_first=True)
        target = label[start:end].long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()

        optim.zero_grad()
        prediction = model(text, text.size()[1])
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
        acc = 100.0 * num_corrects / (end - start)
        loss.backward()
        optim.step()
        steps += 1

        if steps % 100 == 0:
            print(f'Epoch: {epoch+1}, Idx: {start+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item():.2f}%')

        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()

    return total_epoch_loss / steps, total_epoch_acc / steps

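# Design note (an assumption about intent, not from the original commit):
# because train_model builds a new Adam instance on each call, a hypothetical
# variant that preserves optimizer state across epochs would construct the
# optimizer once in main() and pass it in:
#
#     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#     for epoch in range(num_epoch):
#         train_model(model, data.train, data.train_lab, batch_size, epoch, optimizer)
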
# In[8]:


def main():
    data = Dataset()
    data.load_data('yahoo')
    #from sklearn.utils import shuffle
    #data.train, data.train_lab = shuffle(data.train, data.train_lab)
    data.train = [torch.tensor(x) for x in data.train]
    data.test = [torch.tensor(x) for x in data.test]
    data.val = [torch.tensor(x) for x in data.val]
    data.train_lab = torch.tensor([np.argmax(x) for x in data.train_lab], dtype=torch.int64)
    data.test_lab = torch.tensor([np.argmax(x) for x in data.test_lab], dtype=torch.int64)
    data.val_lab = torch.tensor([np.argmax(x) for x in data.val_lab], dtype=torch.int64)

    batch_size = 256
    n_hidden = 300
    input_size = 300  # for an n-gram input this would presumably be n*300

    W_embd = np.array(cPickle.load(open(data.embpath, 'rb'), encoding="latin1"))
    W_embd = torch.from_numpy(W_embd)
    classifier = LSTMclassifier(input_size, n_hidden, data.num_class, W_embd)

    num_epoch = 10
    for epoch in range(num_epoch):
        start_time = time.time()
        train_loss, train_acc = train_model(classifier, data.train, data.train_lab, batch_size, epoch)
        elapsed_time = time.time() - start_time
        hours, rest = divmod(elapsed_time, 3600)
        minutes, sec = divmod(rest, 60)
        """
        Change the path to save the model weights and results
        """
        torch.save(classifier, "./checkpoints/yahoo/lstm1_yahoo_epoch" + str(epoch+1) + ".pth")

        val_loss, val_acc = eval_model(classifier, data.val, data.val_lab, batch_size)
        test_loss, test_acc = eval_model(classifier, data.test, data.test_lab, batch_size)
        results = (f'Epoch: {epoch+1:02}, Time(hr,min): {int(hours)},{int(minutes)}, '
                   f'Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, '
                   f'Val. Loss: {val_loss:.3f}, Val. Acc: {val_acc:.2f}%, '
                   f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')
        print(results)
        with open("results_lstm1_yahoo.txt", "a+") as text_file:
            text_file.write(results + "\n")

    #test_loss, test_acc = eval_model(classifier, data.test, data.test_lab, batch_size)
    #text_file = open("results" + ".txt", "a+")
    #n = text_file.write(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')
    #m = text_file.write("\n")
    #text_file.close()

    print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')
    print('done')

# In[ ]:


if __name__ == '__main__':
    main()