# Lyrics_Inducer.py
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
import tensorflow as tf
import numpy as np
# load the raw lyrics corpus from disk
lyric = open('dataset.txt').read()
#OUTPUT EXAMPLE
# I\'ll give you all my love\nIf you treat me right, baby, I\'ll give you everything\nTalk to me,
# I need to hear you need me like I need you\nFall for me, I wanna know you feel how I feel for you,
# ### 3.2 Preprocessing
#
# #### Converting to lowercase
# Since the words carry capital letters as well, we convert everything to lowercase.
# Keeping the dataset split per line is also more apt, as the model learns from the sentences formed and yields better performance.
# In[19]:
# lowercase and split the dataset
corpus = lyric.lower().split('\n')
for i in range(40, 60):
    print(corpus[i])
# #### Tokenizing
# The Tokenizer builds a word-to-index vocabulary over every line in the corpus; we then measure the number of tokens created.
# In[20]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1
print('\n\nTotal words:', total_words)
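# As an optional sanity check (an illustrative snippet, not part of the original pipeline),
# we can peek at a few entries of the word-to-index mapping the Tokenizer has built:
for word, index in list(tokenizer.word_index.items())[:10]:
    print(word, '->', index)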
# #### Creating Sequences
# Create input sequences from the lists of tokens. For each line we generate an n-gram sequence for every word together with the words preceding it, i.e.:
#
# _come closer, i'll give you_
#
# will become:
#
# [[come, closer], [come, closer, i'll], [come, closer, i'll, give], [come, closer, i'll, give, you]]
# In[ ]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)
# In[22]:
for i in range(20):
    print(input_sequences[i])
# #### Padding
# Since the sequences have different lengths, each one is padded up to the length of the longest sequence so that the array shape is uniform.
#
# We can use either pre-padding or post-padding; a small comparison of the two follows the padding code below.
# In[23]:
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
print('max seq: ',max_sequence_len)
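# To illustrate the pre- vs post-padding choice (an optional sketch on a made-up token sequence),
# note where pad_sequences places the zeros; pre-padding keeps the real tokens at the end,
# right next to the word the model has to predict:
demo = [[4, 7, 2]]
print(pad_sequences(demo, maxlen=6, padding='pre'))   # [[0 0 0 4 7 2]]
print(pad_sequences(demo, maxlen=6, padding='post'))  # [[4 7 2 0 0 0]]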
# ### 4. Building Model
# Now that the data is in array form, we can build a model to process it.
#
# In[24]:
print('\n\n Training model\n\n')
model = Sequential()
model.add(Embedding(total_words, 160, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(200, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words//2, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# In[ ]:
# create predictors and label
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = ku.to_categorical(label, num_classes=total_words)
# ### 5. Training the Model
# We train without a separate validation set here, so there is no held-out data to monitor overfitting; the dropout and L2 regularization in the model are the only safeguards. An optional validation split is sketched after the training call below.
# In[26]:
history = model.fit(predictors, label, epochs=100, verbose=1)
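# If overfitting becomes a concern, a validation split and early stopping can be added instead
# (an optional alternative training call, not what was run above):
# from tensorflow.keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# history = model.fit(predictors, label, epochs=100, validation_split=0.1,
#                     callbacks=[early_stop], verbose=1)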
# ### 6. Analysing the results
# By plotting the training accuracy and training loss with matplotlib, we can infer how well the model fit the data.
# In[27]:
acc = history.history['accuracy']
loss = history.history['loss']
epochs = range(len(acc))
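# Plotting the training curves described above (a minimal sketch; assumes matplotlib is installed):
import matplotlib.pyplot as plt
plt.plot(epochs, acc, label='Training accuracy')
plt.plot(epochs, loss, label='Training loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()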
model.save('my_model.h5')
print('Model saved!')
#model_json = model.to_json()
#with open("model.json", "w") as json_file:
# json_file.write(model_json)
# serialize weights to HDF5
#model.save_weights("model.h5")
#print("Saved model to disk")
# In[40]:
# ### 10. Conclusion
# Hence, we can say that we successfully built the Lyrics-Inducer using NLP and LSTMs. The model essentially predicts the NEXT WORD from the preceding set of words, so it is not very accurate for longer generated sentences.
# There are plenty of other applications possible using LSTMs and NLP, do try them.
# In[ ]:
next_words = 100
seed_text = 'Lemme love you'
# In[29]:
# Repeatedly predict the next word from the current seed text and append it
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict_classes was removed in newer TensorFlow releases; take the argmax of the softmax output instead
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)
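# Greedy argmax decoding tends to repeat the most frequent words; an optional alternative
# (a sketch, not part of the original script) is to sample the next word with a temperature:
def sample_next_word(probs, temperature=0.8):
    # rescale the softmax probabilities and draw a word index at random
    probs = np.asarray(probs, dtype=np.float64)
    probs = np.log(probs + 1e-9) / temperature
    probs = np.exp(probs) / np.sum(np.exp(probs))
    return int(np.random.choice(len(probs), p=probs))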