# -*- coding: utf-8 -*-
"""Next-Word Prediction using Universal Sentence Encoder.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1r2ma5P7w2LE30L1o5mAyNPLE7Qi3JxoL

# **Google drive for local storage**

_NB: Comments are included throughout so the **Current User** can evaluate the model with as little friction as possible._

Uncomment the code under the **PREVIEW OUTPUT** markers to inspect intermediate results.
"""
# Commented out IPython magic to ensure Python compatibility.
# This cell will prompt an external url to accept permissions for Colab to access Google Drive
from google.colab import drive
drive.mount("/gdrive")
# %ls
"""# **Import ***"""
# Getting all required libraries
import os
import re
import gdown
import string
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from absl import logging
import tensorflow_hub as hub
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Embedding
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.utils import get_file
from sklearn.model_selection import train_test_split
"""## **Data preparation - _Generating Corpus_**"""
# Download data from Google drive
'''
ORIGINAL DATASET URL:
https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt
'''
url = 'https://drive.google.com/uc?id=1YTBR7FiXssaKXHhOZbUbwoWw6jzQxxKW'
output = 'corpus.txt'
gdown.download(url, output, quiet=False)
# sentence_length = 40
# Read local file from directory
with open('corpus.txt') as subject:
    cache = subject.readlines()

translator = str.maketrans('', '', string.punctuation)        # Remove punctuation
lines = [doc.lower().translate(translator) for doc in cache]  # Switch to lower case
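# Illustration (assumed example, not from the original notebook): the translator strips all
# punctuation and .lower() normalises case, e.g. 'Deep Learning, in practice.' becomes
# 'deep learning in practice'
# print('Deep Learning, in practice.'.lower().translate(translator))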
# PREVIEW OUTPUT ::
# print(lines[0][:100])
# len(lines)
# Generate a list of unique/independent words
vocabulary = list(set(' '.join(lines).replace('\n', '').split(' ')))

# Map every word in the vocabulary to an integer index
primary_store = {}
for index, word in enumerate(vocabulary):
    primary_store[word] = index
# PREVIEW OUTPUT ::
# print(vocabulary[:50])
# len(vocabulary)
# Build the supervised pairs: X holds each line minus its final word,
# y holds a one-hot vector (over the vocabulary) marking that final word
X = []
y = []
for line in lines:
    words = line.replace('\n', '').split(' ')
    X.append(' '.join(words[:-1]))                # Input: everything except the last word
    target = [0 for _ in range(len(vocabulary))]
    target[primary_store[words[-1]]] = 1          # Target: one-hot index of the last word
    y.append(target)
# Splitting data into train sets and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
y_train = np.array(y_train)
y_test = np.array(y_test)
# PREVIEW OUTPUT ::
# print(X_train[:10])
# print(y_train[:10])
# print(X_test[:10])
# print(y_test[:10])
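# Worked example (assumed, not from the original notebook): for a cleaned line such as
# 'deep learning improves results', X receives 'deep learning improves' and y receives a
# one-hot vector whose single 1 sits at primary_store['results'] (if that word is in the vocabulary).
# sample = 'deep learning improves results'.split(' ')
# sample_x = ' '.join(sample[:-1])
# sample_y_index = primary_store.get(sample[-1])
# print(sample_x, '->', sample[-1], 'at index', sample_y_index)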
"""## **Embeddings!**"""
# Import the Universal Sentence Encoder's TF Hub module (version 4)
# Downloading the module may take a few minutes
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
appreciate = hub.load(module_url)
# Making it easier - Function for embedding
def embed(goodness):
    return appreciate(goodness)
# REVIEW OUTPUT ::
# appreciate.variables
# Wrapping up with the U-S-E
X_train = embed(X_train)
X_test = embed(X_test)
X_train = X_train.numpy()
X_test = X_test.numpy()
# PREVIEW OUTPUT ::
# print(X_train[:10])
# print(y_train[:10])
# print(X_test[:10])
# print(y_test[:10])
# print(X_train.shape, X_test.shape, y_test.shape, y_train.shape)
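# Sanity check (assumed, not part of the original notebook): USE v4 returns one
# 512-dimensional vector per input string, and each target is one-hot over the vocabulary,
# so the shape print above should report (num_samples, 512) and (num_samples, len(vocabulary)).
# assert X_train.shape[1] == 512
# assert y_train.shape[1] == len(vocabulary)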
"""# **Building the model**"""
model = Sequential()
# model.add(Embedding(input_dim=len(vocabulary), output_dim=100))
# model.add(LSTM(units=100, input_shape=[512]))
model.add(Dense(512, input_shape=[512], activation='relu'))
model.add(Dense(units=len(vocabulary), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()
# Training the model.
model.fit(X_train, y_train, batch_size=512, shuffle=True, epochs=20, validation_data=(X_test, y_test), callbacks=[LambdaCallback()])
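# Optional check (assumed, not from the original notebook): report loss and accuracy on the
# held-out test embeddings once training completes.
# loss, acc = model.evaluate(X_test, y_test, verbose=0)
# print('test loss: {:.4f}  test accuracy: {:.4f}'.format(loss, acc))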
"""#**Unto the tests!**"""
# Create function to predict and show detailed output
def next_word(collection=[], extent=1):
for item in collection:
text = item
for i in range(extent):
prediction = model.predict(x=embed([item]).numpy())
idx = np.argmax(prediction[-1])
item += ' ' + vocabulary[idx]
print(text + ' --> ' + item + '\nNEXT WORD: ' + item.split(' ')[-1] + '\n')
# Tests - please feel free to explore
single_text = ['and some other essential']
next_word(single_text)
# Testing on a collection of words
text_collection = ['deep convolutional', 'simple and effective', 'a nonconvex', 'a']
next_word(text_collection)
"""## **For the record**
The Dataset is based on a Tensorflow tutorial from Stanford, so all predicted words will be based on Deep learning and Machine learning _common terms_.
"""
# Storing the vocabulary and the trained model
vocabulary = np.array(vocabulary)
np.save('./vocabulary.npy', vocabulary)
model.save('./NWP-USE')
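# Reload sketch (assumed usage, not part of the original notebook): restore the saved
# artifacts in a fresh session.
# restored_vocabulary = np.load('./vocabulary.npy')
# restored_model = keras.models.load_model('./NWP-USE')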
## END OF NOTEBOOK