# portmanteau.py

# -*- coding: utf-8 -*-
"""Portmanteau

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1SA83JJiEcRatoc1E3Y10FF9EZTX39Bgm

# Character Seq Model
"""

#@title Imports
from __future__ import print_function
from subprocess import check_output
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.models import model_from_json
from StringIO import StringIO
from scipy.stats import norm
import pandas as pd
import numpy as np
import random
import sys
import io
from hyphen import Hyphenator

#@title Sequence generation

# Text extraction and processing
# Load the baby-name table and keep names that were recorded more than three
# times and are longer than four characters.
df = pd.read_csv('data/Names/NationalNames.csv')
names = list(
    df.loc[(df['Count'] > 3) & (df['Name'].str.len() > 4), 'Name'].unique())
print(len(names))

# The training corpus is every kept name, lower-cased, separated by a blank
# line ('\n\n').
text = '\n\n'.join(names).lower()

# Vocabulary: every distinct character of the corpus in sorted order, with
# lookup tables in both directions.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Sliding-window configuration and the (initially empty) training buffers
# that gen_seq() fills in.
SEQLEN = 3
step = 1
sentences = []
next_chars = []


def gen_seq():
    """Fill ``sentences`` / ``next_chars`` with sliding windows over ``text``.

    Every window of ``SEQLEN`` characters (taken every ``step`` characters)
    is appended to the module-level ``sentences`` list, and the character
    that immediately follows it is appended to ``next_chars``. The lists
    are extended in place, so calling this twice duplicates the entries.
    """
    starts = range(0, len(text) - SEQLEN, step)
    sentences.extend(text[start: start + SEQLEN] for start in starts)
    next_chars.extend(text[start + SEQLEN] for start in starts)
    print('nb sequences:', len(sentences))
    print('nb chars:', len(next_chars))

#@title Vectorisation


def vectorise():
    """One-hot encode the module-level ``sentences`` and ``next_chars``.

    Returns:
        A tuple ``(x, y)`` of boolean arrays. ``x`` has shape
        ``(len(sentences), SEQLEN, len(chars))`` with one hot entry per
        window position; ``y`` has shape ``(len(sentences), len(chars))``
        with one hot entry for the character following each window.

    The arrays used to be built and then dropped when the function
    returned; they are now handed back so the caller can train on them.
    Callers that ignore the return value are unaffected.
    """
    print('Vectorization...')
    # Use the builtin ``bool`` as dtype: the ``np.bool`` alias was deprecated
    # and later removed from NumPy.
    x = np.zeros((len(sentences), SEQLEN, len(chars)), dtype=bool)
    y = np.zeros((len(sentences), len(chars)), dtype=bool)
    for i, sentence in enumerate(sentences):
        for t, char in enumerate(sentence):
            x[i, t, char_indices[char]] = 1
        y[i, char_indices[next_chars[i]]] = 1
    return x, y


def load():
    """Restore the saved Keras model from disk.

    Reads the architecture from ``model_2.json`` and the weights from
    ``model_2.h5`` (both relative to the current working directory).

    Returns:
        The model rebuilt by ``model_from_json`` with its weights loaded.

    Any error raised while opening/reading the files or rebuilding the
    model propagates to the caller.
    """
    # The context manager closes the file even if read() raises.
    with open('model_2.json', 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights("model_2.h5")
    print("Loaded model from disk")
    return loaded_model


def sample(preds, temperature=1.0):
    """Draw one index at random from a probability array.

    The probabilities are re-weighted by ``temperature`` (log, divide,
    exponentiate, renormalise) before a single multinomial draw is made.

    Args:
        preds: sequence of probabilities.
        temperature: divisor applied to the log-probabilities.

    Returns:
        The index of the drawn entry.
    """
    log_probs = np.log(np.asarray(preds).astype('float64'))
    weights = np.exp(log_probs / temperature)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)


def on_epoch_end(epoch, logs):
    """Print sample text generated by ``model`` after a training epoch.

    A random ``SEQLEN``-character seed is taken from ``text``; for each of
    four diversity (temperature) values, ten characters are generated from
    that same seed and streamed to stdout.

    Args:
        epoch: epoch number, used only in the printed banner.
        logs: accepted to match the callback signature; not used here.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    seed_start = random.randint(0, len(text) - SEQLEN - 1)
    seed = text[seed_start: seed_start + SEQLEN]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        # `window` is the model input; it slides forward by one character
        # per prediction. `generated` accumulates everything emitted so far.
        window = seed
        generated = '' + window
        print('----- Generating with seed: "' + window + '"')
        sys.stdout.write(generated)

        for _ in range(10):
            x_pred = np.zeros((1, SEQLEN, len(chars)))
            for position, letter in enumerate(window):
                x_pred[0, position, char_indices[letter]] = 1.

            distribution = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(distribution, diversity)]

            generated += next_char
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


def hyp(word, factor):
    """Return per-character weights that emphasise syllable starts.

    Args:
        word: the word to weight.
        factor: value written at every syllable-start position.

    Returns:
        An array of ``len(word)`` floats: ``factor`` at the marked
        positions and 1.0 everywhere else.

    Words shorter than four characters only get position 0 marked. Longer
    words are split with the module-level hyphenator ``h_in``; each
    syllable's position is looked up with ``str.find``, i.e. the first
    occurrence of that syllable's text inside the word.
    """
    if len(word) >= 4:
        syllable_starts = [word.find(syllable)
                           for syllable in h_in.syllables(unicode(word))]
    else:
        syllable_starts = [0]
    weights = np.ones(len(word))
    np.put(weights, syllable_starts, factor)
    return weights


def gen(seed):
    """Extend ``seed`` one sampled character at a time until the stop index.

    The last ``SEQLEN`` characters of ``seed`` are one-hot encoded and fed
    to ``model``; the next character is sampled at temperature 0.70. The
    function recurses with the extended seed until the sampled index is 1,
    at which point the seed built so far is returned.

    Args:
        seed: starting string; every character must be in ``char_indices``.

    Returns:
        The seed extended with all characters sampled before the stop index.
    """
    x_pred = np.zeros((1, SEQLEN, len(chars)))
    # Use SEQLEN rather than a literal 3 so the window always matches the
    # input shape allocated above.
    for t, char in enumerate(seed[-SEQLEN:]):
        x_pred[0, t, char_indices[char]] = 1.
    # Predict once, on the fully encoded window. Previously this ran inside
    # the loop above: the model was evaluated on partially filled inputs
    # whose results were thrown away, and `preds` was unbound for an empty
    # seed.
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds, 0.70)
    next_char = indices_char[next_index]
    # NOTE(review): index 1 is treated as the terminator. With `chars`
    # sorted, '\n' would normally sit at index 0 - confirm against the
    # vocabulary the saved model was trained with.
    if next_index == 1:
        return seed
    return gen(seed + next_char)


def build_model():
    """Build and compile a fresh, untrained character-level LSTM.

    This is the model-construction code that used to sit after the
    ``raise`` in the ``except`` branch below, where it could never run.
    It is kept here, uninvoked, so the architecture stays available for
    retraining.

    Returns:
        A compiled ``Sequential`` model (LSTM(4) -> Dense -> softmax).
    """
    #@title LSTM
    gen_seq()
    vectorise()
    print('Build model...')
    fresh_model = Sequential()
    fresh_model.add(LSTM(4, input_shape=(SEQLEN, len(chars))))
    fresh_model.add(Dense(len(chars)))
    fresh_model.add(Activation('softmax'))

    optimizer = RMSprop(lr=0.01)
    fresh_model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return fresh_model


try:
    model = load()
except Exception as err:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # and SystemExit through. The underlying error is appended to the message
    # because load() can fail for reasons other than a missing file.
    raise Exception("Model not found: %s" % err)

# print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

#@title Training
# model.fit(x, y,
#           batch_size=128,
#           epochs=30,
#           callbacks=[print_callback])

# model_json = model.to_json()
# with open("model.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights("model.h5")

# from google.colab import files

# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)
#
# uploaded = drive.CreateFile({'title': 'model2.json'})
# uploaded.SetContentFile('model.json')
# uploaded.Upload()
# print('Uploaded file with ID {}'.format(uploaded.get('id')))
#
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)
#
# uploaded = drive.CreateFile({'title': 'model2.h5'})
# uploaded.SetContentFile('model.h5')
# uploaded.Upload()
# print('Uploaded file with ID {}'.format(uploaded.get('id')))

# files.download('./model.json')

"""# Bridge"""

#@title Code
# --- Window sizes -----------------------------------------------------------
# Length of the model's input window (same value as set near the top of the
# file; re-declared here).
SEQLEN = 3
# Number of right-word characters scored for each candidate join in proc().
COMPARE = 3
# proc() only considers start offsets that leave at least this many
# characters up to the end of the left / right word.
MINLEFT = 3
MINRIGHT = 3
# bridge() drops candidate words longer than this.
MAXLEN = 10

# --- Scoring weights --------------------------------------------------------
# Per-position weight given to the left word's own next character when it is
# mixed with the model's predicted distribution; proc() indexes it with
# 0..COMPARE-1.
LEFT_BIAS = [0.07, 0.04, 0.02]
# Weight of the 1/phoneme-count term added to each candidate's score in
# generate().
PHONEME_WT = 0.2
# Factor hyp() writes at syllable-start positions of the right / left word.
SYL_INIT_RIGHT = 2.6
SYL_INIT_LEFT = 2.0
# Not referenced by the code visible in this file.
ENDPENALTY = 0.7

# Hyphenator that hyp() uses to split words into syllables.
h_in = Hyphenator('en_IN')


def one_hot(word):
    """One-hot encode ``word`` as a single model input.

    Args:
        word: string of at most ``SEQLEN`` characters, all present in
            ``char_indices``.

    Returns:
        A float array of shape ``(1, SEQLEN, len(chars))`` with a 1.0 at
        ``[0, position, char_indices[character]]`` for every character.
    """
    encoded = np.zeros((1, SEQLEN, len(chars)))
    for position, letter in enumerate(word):
        column = char_indices[letter]
        encoded[0, position, column] = 1.
    return encoded


def sample_preds(preds, temperature=1.0):
    """Re-weight a probability array by ``temperature`` and renormalise.

    Unlike ``sample``, no random draw is made: the whole adjusted
    distribution is returned.

    Args:
        preds: sequence of probabilities.
        temperature: divisor applied to the log-probabilities.

    Returns:
        A float64 array of the same shape that sums to 1.
    """
    scaled_logs = np.log(np.asarray(preds).astype('float64')) / temperature
    unnormalised = np.exp(scaled_logs)
    return unnormalised / np.sum(unnormalised)


def ohmygauss(length, sigma=1.8):
    """Evaluate a zero-mean normal density at the integers 0..length-1.

    Args:
        length: number of integer positions to evaluate.
        sigma: standard deviation of the normal distribution.

    Returns:
        An array of ``length`` density values, largest at position 0.
    """
    positions = np.arange(length)
    return norm.pdf(positions, loc=0, scale=sigma)


def proc(left, right, verbose=False):
    """Score every way of joining a prefix of ``left`` to a suffix of ``right``.

    For each offset ``i`` in ``left`` and ``j`` in ``right`` the model is
    asked how likely the first ``COMPARE`` characters of ``right[j:]`` are
    as a continuation of ``left[i:i + SEQLEN]``. The summed probabilities
    are then scaled by a Gaussian factor on ``j`` and by the syllable
    factors of both words.

    Args:
        left: word supplying the beginning of the blend.
        right: word supplying the end of the blend.
        verbose: when true, print each sampled window and each score.

    Returns:
        ``(best_matches, best_i)`` where ``best_matches[i][j]`` is the score
        of joining at offsets ``i`` / ``j`` and ``best_i`` is the
        best-scoring left offset (``None`` if none qualified). The last
        admissible left offset is scored but never selected as ``best_i``.
    """
    best_matches = {}
    best_i = None
    best_i_score = -1
    syll_factors_l = hyp(left, SYL_INIT_LEFT)

    left_bound = len(left) - MINLEFT + 1
    if left_bound <= 0:
        # No admissible offset in the left word, so there is nothing to score.
        return best_matches, best_i

    # These depend only on `right`, so they are computed once instead of
    # once per left offset.
    right_bound = len(right) - MINRIGHT + 1
    gaus_factors = ohmygauss(right_bound)
    syll_factors_r = hyp(right, SYL_INIT_RIGHT)

    for i in range(left_bound):
        # Searching all sequences of size COMPARE in the right word
        # to find best match
        best_j_score = -1
        best_matches[i] = {}
        for j in range(right_bound):
            right_chars = right[j:j + COMPARE]
            s = 0
            for x in range(COMPARE):
                # Character on right which is being sampled
                c_index = char_indices[right_chars[x]]
                # Model input: the tail of the left window followed by the
                # right-word characters already consumed.
                window = left[i + x:i + SEQLEN] + right[j:j + x]
                if verbose:
                    print("Sampling " + window + "-->" + right_chars[x])

                # Generating sequence and getting probability
                preds = model.predict(one_hot(window), verbose=0)[0]
                pred_probs = sample_preds(preds, 0.7)

                # Getting corresponding character in left word; it stays
                # all-zero when the left word has already ended.
                left_char = np.zeros((1, len(chars)))
                left_pos = i + SEQLEN + x
                if left_pos < len(left):
                    left_char[0, char_indices[left[left_pos]]] = 1
                # Adding some bias to left_char and adding it to predicted probs
                biased_probs = LEFT_BIAS[x] * left_char + \
                    (1 - LEFT_BIAS[x]) * pred_probs

                # Adding probability of bridging at c_index to s
                s += biased_probs[0, c_index]

            s = s * gaus_factors[j] * syll_factors_r[j] * syll_factors_l[i]

            if verbose:
                print(i, j, s)
            best_matches[i][j] = s
            if s > best_j_score:
                best_j_score = s
        if best_j_score > best_i_score and i < len(left) - MINLEFT:
            best_i_score = best_j_score
            best_i = i

    return best_matches, best_i


def generate(left, right, verbose=False):
    """Build every candidate blend of ``left`` + ``right`` with its score.

    Args:
        left: word supplying the beginning of each candidate.
        right: word supplying the end of each candidate.
        verbose: forwarded to ``proc``.

    Returns:
        A dict mapping each candidate word (title-cased) to its score: the
        ``proc`` score rounded to four decimals plus ``PHONEME_WT`` divided
        by the candidate's phoneme count. When several joins produce the
        same word, the score of the last one wins.
    """
    # Three-letter words are prefixed with the '\n' separator; it is
    # stripped from the candidates again below.
    left = '\n' + left if len(left) == 3 else left
    right = '\n' + right if len(right) == 3 else right
    # Only the score table is needed here; proc's best offset is unused.
    matches, _ = proc(left, right, verbose)
    probs = {}
    for i_temp, row in matches.items():
        for j_temp, score in row.items():
            word = (left[:i_temp + SEQLEN] + right[j_temp:]
                    ).replace('\n', '').title()
            pcount = get_phoneme_count(word)
            probs[word] = round(score, 4) + (1 / float(pcount) * PHONEME_WT)
    return probs
    # print (matches)


def get_phonemes(word):
    """Return espeak's IPA transcription of ``word``.

    Runs the external ``espeak`` program quietly (``-q``) with the
    ``en-in`` voice and decodes its output as UTF-8. Errors from launching
    or running espeak propagate to the caller.
    """
    command = ["espeak", "-q", "--ipa", '-v', 'en-in', word]
    raw_output = check_output(command)
    return raw_output.decode('utf-8')


def get_phoneme_count(word):
    """Count the distinct characters in the IPA transcription of ``word``.

    Args:
        word: the word to transcribe with ``get_phonemes``.

    Returns:
        The number of distinct characters in the stripped transcription, or
        the fallback value 10 (after printing a warning) when the
        transcription could not be obtained.
    """
    try:
        return len(set(get_phonemes(word).strip()))
    except Exception:
        # Best-effort fallback, e.g. when espeak is not installed.
        # `except Exception` keeps KeyboardInterrupt / SystemExit from being
        # swallowed, which the previous bare `except:` did.
        print ("Warning : You might need to install espeak to account for phonemes while presicting word score.")
        return 10


def bridge(left, right, verbose=False, reflexive=True):
    """Return the best-scoring blends (portmanteaus) of two words.

    Args:
        left: first word (lower-cased before use).
        right: second word (lower-cased before use).
        verbose: forwarded to ``generate``.
        reflexive: when true, blends of ``right`` + ``left`` are considered
            as well as ``left`` + ``right``.

    Returns:
        A pandas Series mapping candidate words to scores, restricted to
        words of at most ``MAXLEN`` characters, sorted by descending score
        and truncated to the top 15. The Series is empty when no candidate
        could be generated.
    """
    left = left.lower()
    right = right.lower()
    results = [pd.Series(generate(left, right, verbose)).sort_values()]
    if reflexive:
        results.append(pd.Series(generate(right, left, verbose)).sort_values())
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    all_words = pd.concat(results)
    if all_words.empty:
        # Nothing to rank; returning here also avoids using the `.str`
        # accessor on an index that holds no strings.
        return all_words
    short_enough = all_words.index.str.len() <= MAXLEN
    return all_words[short_enough].sort_values(ascending=False)[:15]