# -*- coding: utf-8 -*-
"""Next-Word Prediction using Universal Sentence Encoder.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1r2ma5P7w2LE30L1o5mAyNPLE7Qi3JxoL

# **Google drive for local storage**

_NB: Comments are included throughout so the **Current User** can evaluate the model with as little friction as possible._

Uncomment the code under the **PREVIEW OUTPUT** markers to inspect intermediate results.
"""
# Commented out IPython magic to ensure Python compatibility.
# This cell will prompt an external url to accept permissions for Colab to access Google Drive
from google.colab import drive
drive.mount("/gdrive")
# %ls
"""# **Import ***"""
# Getting all required libraries
import os
import re
import gdown
import string
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from absl import logging
import tensorflow_hub as hub
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Embedding
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.utils import get_file
from sklearn.model_selection import train_test_split
"""## **Data preparation - _Generating Corpus_**"""
# Download data from Google drive
'''
ORIGINAL DATASET URL:
https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt
'''
url = 'https://drive.google.com/uc?id=1YTBR7FiXssaKXHhOZbUbwoWw6jzQxxKW'
output = 'corpus.txt'
gdown.download(url, output, quiet=False)
# sentence_length = 40
# Read local file from directory
with open('corpus.txt') as subject:
    cache = subject.readlines()

translator = str.maketrans('', '', string.punctuation)        # Remove punctuation
lines = [doc.lower().translate(translator) for doc in cache]  # Switch to lower case
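# Illustration (assumed example, not from the original notebook): the translator strips all
# punctuation and .lower() normalises case, e.g. 'Deep Learning, in practice.' becomes
# 'deep learning in practice'
# print('Deep Learning, in practice.'.lower().translate(translator))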
# PREVIEW OUTPUT ::
# print(lines[0][:100])
# len(lines)
# Generate a list of unique/independent words
vocabulary = list(set(' '.join(lines).replace('\n', '').split(' ')))

# Map every word in the vocabulary to an integer index
primary_store = {}
for index, word in enumerate(vocabulary):
    primary_store[word] = index
# PREVIEW OUTPUT ::
# print(vocabulary[:50])
# len(vocabulary)
# Build the supervised pairs: X holds each line minus its final word,
# y holds a one-hot vector (over the vocabulary) marking that final word
X = []
y = []
for line in lines:
    words = line.replace('\n', '').split(' ')
    X.append(' '.join(words[:-1]))                # Input: everything except the last word
    target = [0 for _ in range(len(vocabulary))]
    target[primary_store[words[-1]]] = 1          # Target: one-hot index of the last word
    y.append(target)
# Splitting data into train sets and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
y_train = np.array(y_train)
y_test = np.array(y_test)
# PREVIEW OUTPUT ::
# print(X_train[:10])
# print(y_train[:10])
# print(X_test[:10])
# print(y_test[:10])
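# Worked example (assumed, not from the original notebook): for a cleaned line such as
# 'deep learning improves results', X receives 'deep learning improves' and y receives a
# one-hot vector whose single 1 sits at primary_store['results'] (if that word is in the vocabulary).
# sample = 'deep learning improves results'.split(' ')
# sample_x = ' '.join(sample[:-1])
# sample_y_index = primary_store.get(sample[-1])
# print(sample_x, '->', sample[-1], 'at index', sample_y_index)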
"""## **Embeddings!**"""
# Import the Universal Sentence Encoder's TF Hub module (version 4)
# Downloading the module may take a few minutes
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
appreciate = hub.load(module_url)
# Making it easier - Function for embedding
def embed(goodness):
    return appreciate(goodness)
# REVIEW OUTPUT ::
# appreciate.variables
# Wrapping up with the U-S-E
X_train = embed(X_train)
X_test = embed(X_test)
X_train = X_train.numpy()
X_test = X_test.numpy()
# PREVIEW OUTPUT ::
# print(X_train[:10])
# print(y_train[:10])
# print(X_test[:10])
# print(y_test[:10])
# print(X_train.shape, X_test.shape, y_test.shape, y_train.shape)
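# Sanity check (assumed, not part of the original notebook): USE v4 returns one
# 512-dimensional vector per input string, and each target is one-hot over the vocabulary,
# so the shape print above should report (num_samples, 512) and (num_samples, len(vocabulary)).
# assert X_train.shape[1] == 512
# assert y_train.shape[1] == len(vocabulary)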
"""# **Building the model**"""
model = Sequential()
# model.add(Embedding(input_dim=len(vocabulary), output_dim=100))
# model.add(LSTM(units=100, input_shape=[512]))
model.add(Dense(512, input_shape=[512], activation='relu'))
model.add(Dense(units=len(vocabulary), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
# Training the model.
model.fit(X_train, y_train, batch_size=512, shuffle=True, epochs=20, validation_data=(X_test, y_test), callbacks=[LambdaCallback()])
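# Optional check (assumed, not from the original notebook): report loss and accuracy on the
# held-out test embeddings once training completes.
# loss, acc = model.evaluate(X_test, y_test, verbose=0)
# print('test loss: {:.4f}  test accuracy: {:.4f}'.format(loss, acc))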
"""#**Unto the tests!**"""
# Create function to predict and show detailed output
def next_word(collection=[], extent=1):
for item in collection:
text = item
for i in range(extent):
prediction = model.predict(x=embed([item]).numpy())
idx = np.argmax(prediction[-1])
item += ' ' + vocabulary[idx]
print(text + ' --> ' + item + '\nNEXT WORD: ' + item.split(' ')[-1] + '\n')
# Tests - please feel free to explore
single_text = ['and some other essential']
next_word(single_text)
# Testing on a collection of words
text_collection = ['deep convolutional', 'simple and effective', 'a nonconvex', 'a']
next_word(text_collection)
"""## **For the record**
The Dataset is based on a Tensorflow tutorial from Stanford, so all predicted words will be based on Deep learning and Machine learning _common terms_.
"""
# Storing the vocabulary and the trained model
vocabulary = np.array(vocabulary)
np.save('./vocabulary.npy', vocabulary)
model.save('./NWP-USE')
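# Reload sketch (assumed usage, not part of the original notebook): restore the saved
# artifacts in a fresh session.
# restored_vocabulary = np.load('./vocabulary.npy')
# restored_model = keras.models.load_model('./NWP-USE')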
## END OF NOTEBOOK