
Commit d0b0f08

NoWay committed
Solution commit.
1 parent dfe6dbf commit d0b0f08

9 files changed: +130,276 −0 lines changed

Diff for: .ipynb_checkpoints/dlnd_tv_script_generation-checkpoint.ipynb

+1,431
Large diffs are not rendered by default.

Diff for: data/Seinfeld_Scripts.txt

+109,232
Large diffs are not rendered by default.

Diff for: dlnd_tv_script_generation.html

+17,879
Large diffs are not rendered by default.

Diff for: dlnd_tv_script_generation.ipynb

+1,431
Large diffs are not rendered by default.

Diff for: generated_script_1.txt

+1
@@ -0,0 +1 @@
jerry: everything? what as? cantaloupe jerry: sweat? what the with a your! who denim, the not gonna i a they a,.. elaine: the from day i jerry gonna no a he these(.. to the from,.. elaine: that you.. and my? the jerry on but i went,.. and) a just george: the from day kramer:? so like! back being,.. and) a just george: i there? the his hey some with here me 7,.. to) helen: funny a we filled george: the from day the don't! dufus success,.. to the jerry on,.. elaine: yeah? the on,.. to the jerry on,.. elaine: that he i when i jerry gonna a your i've you) got couple, george:.. it) let's all what stay george: i on? the not very! much likes at goal i a can't all was! who molar,.. to yeah? i'm? the here! who seen,.. it that you.. and my? i on? i on that? the from he kramer:? what elaine the yes not : kramer: a" beautiful? the here a your are in her, i on? you're jerry with a your goes was no what the give like good we hitting was i read,.. to that you.. and my? the not youre a there i, the jerry or on that i he,.. to my that up hey hi her you.. elaine: my well not! who cleaning,.. to that- of you.. and my the his the way he of,.. to the jerry on? that you.. to the nothing go of i we're kramer: don't! bar, the jerry gonna a apartment i? just,.. put the jerry on,.. to yeah? the on,.. elaine:) a just george: the his so! bar,.. to the jerry his for the don't, the when

Diff for: helper.py

+55
@@ -0,0 +1,55 @@
import os
import pickle
import torch


SPECIAL_WORDS = {'PADDING': '<PAD>'}


def load_data(path):
    """
    Load Dataset from File
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data


def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Preprocess Text Data
    """
    text = load_data(dataset_path)

    # Ignore notice, since we don't use it for analysing the data
    text = text[81:]

    # Replace each punctuation symbol with its whitespace-padded token
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(text + list(SPECIAL_WORDS.values()))
    int_text = [vocab_to_int[word] for word in text]
    pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))


def load_preprocess():
    """
    Load the preprocessed training data saved by preprocess_and_save_data
    """
    return pickle.load(open('preprocess.p', mode='rb'))


def save_model(filename, decoder):
    # Saves the whole model object (not just the state dict)
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    torch.save(decoder, save_filename)


def load_model(filename):
    save_filename = os.path.splitext(os.path.basename(filename))[0] + '.pt'
    return torch.load(save_filename)
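
For reference, a minimal usage sketch of helper.py. The token_lookup and create_lookup_tables implementations below are hypothetical stand-ins (the graded versions live in dlnd_tv_script_generation.ipynb); the token names are assumptions, chosen only to satisfy the checks in problem_unittests.py:

import helper

def token_lookup():
    # Map punctuation to whitespace-free tokens so text.split() treats them as words.
    return {'.': '<PERIOD>', ',': '<COMMA>', '"': '<QUOTATION_MARK>',
            ';': '<SEMICOLON>', '!': '<EXCLAMATION_MARK>', '?': '<QUESTION_MARK>',
            '(': '<LEFT_PAREN>', ')': '<RIGHT_PAREN>', '-': '<DASH>', '\n': '<NEW_LINE>'}

def create_lookup_tables(text):
    # Assign each unique word an integer id (the simplest possible mapping).
    vocab = sorted(set(text))
    vocab_to_int = {word: i for i, word in enumerate(vocab)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab

helper.preprocess_and_save_data('data/Seinfeld_Scripts.txt', token_lookup, create_lookup_tables)
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()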

Diff for: preprocess.p

2.44 MB
Binary file not shown.

Diff for: problem_unittests.py

+247
@@ -0,0 +1,247 @@
from unittest.mock import MagicMock, patch
import numpy as np
import torch


class _TestNN(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(_TestNN, self).__init__()
        self.decoder = torch.nn.Linear(input_size, output_size)
        self.forward_called = False

    def forward(self, nn_input, hidden):
        self.forward_called = True
        output = self.decoder(nn_input)

        return output, hidden


def _print_success_message():
    print('Tests Passed')


class AssertTest(object):
    def __init__(self, params):
        self.assert_param_message = '\n'.join([str(k) + ': ' + str(v) for k, v in params.items()])

    def test(self, assert_condition, assert_message):
        assert assert_condition, assert_message + '\n\nUnit Test Function Parameters\n' + self.assert_param_message


def test_create_lookup_tables(create_lookup_tables):
    test_text = '''
    Moe_Szyslak Moe's Tavern Where the elite meet to drink
    Bart_Simpson Eh yeah hello is Mike there Last name Rotch
    Moe_Szyslak Hold on I'll check Mike Rotch Mike Rotch Hey has anybody seen Mike Rotch lately
    Moe_Szyslak Listen you little puke One of these days I'm gonna catch you and I'm gonna carve my name on your back with an ice pick
    Moe_Szyslak Whats the matter Homer You're not your normal effervescent self
    Homer_Simpson I got my problems Moe Give me another one
    Moe_Szyslak Homer hey you should not drink to forget your problems
    Barney_Gumble Yeah you should only drink to enhance your social skills'''

    test_text = test_text.lower()
    test_text = test_text.split()

    vocab_to_int, int_to_vocab = create_lookup_tables(test_text)

    # Check types
    assert isinstance(vocab_to_int, dict),\
        'vocab_to_int is not a dictionary.'
    assert isinstance(int_to_vocab, dict),\
        'int_to_vocab is not a dictionary.'

    # Compare lengths of dicts
    assert len(vocab_to_int) == len(int_to_vocab),\
        'Length of vocab_to_int and int_to_vocab don\'t match. ' \
        'vocab_to_int is length {}. int_to_vocab is length {}'.format(len(vocab_to_int), len(int_to_vocab))

    # Make sure the dicts have the same words
    vocab_to_int_word_set = set(vocab_to_int.keys())
    int_to_vocab_word_set = set(int_to_vocab.values())

    assert not (vocab_to_int_word_set - int_to_vocab_word_set),\
        'vocab_to_int and int_to_vocab don\'t have the same words. ' \
        '{} found in vocab_to_int, but not in int_to_vocab'.format(vocab_to_int_word_set - int_to_vocab_word_set)
    assert not (int_to_vocab_word_set - vocab_to_int_word_set),\
        'vocab_to_int and int_to_vocab don\'t have the same words. ' \
        '{} found in int_to_vocab, but not in vocab_to_int'.format(int_to_vocab_word_set - vocab_to_int_word_set)

    # Make sure the dicts have the same word ids
    vocab_to_int_word_id_set = set(vocab_to_int.values())
    int_to_vocab_word_id_set = set(int_to_vocab.keys())

    assert not (vocab_to_int_word_id_set - int_to_vocab_word_id_set),\
        'vocab_to_int and int_to_vocab don\'t contain the same word ids. ' \
        '{} found in vocab_to_int, but not in int_to_vocab'.format(vocab_to_int_word_id_set - int_to_vocab_word_id_set)
    assert not (int_to_vocab_word_id_set - vocab_to_int_word_id_set),\
        'vocab_to_int and int_to_vocab don\'t contain the same word ids. ' \
        '{} found in int_to_vocab, but not in vocab_to_int'.format(int_to_vocab_word_id_set - vocab_to_int_word_id_set)

    # Make sure the dicts make the same lookup
    mismatches = [(word, id, id, int_to_vocab[id]) for word, id in vocab_to_int.items() if int_to_vocab[id] != word]

    assert not mismatches,\
        'Found {} mismatch(es). First mismatch: vocab_to_int[{}] = {} and int_to_vocab[{}] = {}'.format(
            len(mismatches), *mismatches[0])

    assert len(vocab_to_int) > len(set(test_text))/2,\
        'The length of vocab seems too small. Found a length of {}'.format(len(vocab_to_int))

    _print_success_message()


def test_tokenize(token_lookup):
    symbols = set(['.', ',', '"', ';', '!', '?', '(', ')', '-', '\n'])
    token_dict = token_lookup()

    # Check type
    assert isinstance(token_dict, dict), \
        'Returned type is {}.'.format(type(token_dict))

    # Check symbols
    missing_symbols = symbols - set(token_dict.keys())
    unknown_symbols = set(token_dict.keys()) - symbols

    assert not missing_symbols, \
        'Missing symbols: {}'.format(missing_symbols)
    assert not unknown_symbols, \
        'Unknown symbols: {}'.format(unknown_symbols)

    # Check values type
    bad_value_type = [type(val) for val in token_dict.values() if not isinstance(val, str)]

    assert not bad_value_type,\
        'Found token as {} type.'.format(bad_value_type[0])

    # Check for spaces
    key_has_spaces = [k for k in token_dict.keys() if ' ' in k]
    val_has_spaces = [val for val in token_dict.values() if ' ' in val]

    assert not key_has_spaces,\
        'The key "{}" includes spaces. Remove spaces from keys and values'.format(key_has_spaces[0])
    assert not val_has_spaces,\
        'The value "{}" includes spaces. Remove spaces from keys and values'.format(val_has_spaces[0])

    # Check for symbols in values
    symbol_val = ()
    for symbol in symbols:
        for val in token_dict.values():
            if symbol in val:
                symbol_val = (symbol, val)

    assert not symbol_val,\
        'Don\'t use a symbol that will be replaced in your tokens. Found the symbol {} in value {}'.format(*symbol_val)

    _print_success_message()


def test_rnn(RNN, train_on_gpu):
    batch_size = 50
    sequence_length = 3
    vocab_size = 20
    output_size = 20
    embedding_dim = 15
    hidden_dim = 10
    n_layers = 2

    # create test RNN
    # params: (vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
    rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

    # create test input
    a = np.random.randint(vocab_size, size=(batch_size, sequence_length))
    b = torch.from_numpy(a)
    hidden = rnn.init_hidden(batch_size)

    if train_on_gpu:
        rnn.cuda()
        b = b.cuda()

    output, hidden_out = rnn(b, hidden)

    assert_test = AssertTest({
        'Input Size': vocab_size,
        'Output Size': output_size,
        'Hidden Dim': hidden_dim,
        'N Layers': n_layers,
        'Batch Size': batch_size,
        'Sequence Length': sequence_length,
        'Input': b})

    # initialization
    correct_hidden_size = (n_layers, batch_size, hidden_dim)

    if type(hidden) == tuple:
        # LSTM
        assert_condition = hidden[0].size() == correct_hidden_size
    else:
        # GRU
        assert_condition = hidden.size() == correct_hidden_size

    assert_message = 'Wrong hidden state size. Expected size {}. Got size {}'.format(correct_hidden_size, hidden[0].size())
    assert_test.test(assert_condition, assert_message)

    # output of rnn
    correct_hidden_size = (n_layers, batch_size, hidden_dim)

    if type(hidden) == tuple:
        # LSTM
        assert_condition = hidden_out[0].size() == correct_hidden_size
    else:
        # GRU
        assert_condition = hidden_out.size() == correct_hidden_size

    assert_message = 'Wrong hidden state size. Expected size {}. Got size {}'.format(correct_hidden_size, hidden_out[0].size())
    assert_test.test(assert_condition, assert_message)

    correct_output_size = (batch_size, output_size)
    assert_condition = output.size() == correct_output_size
    assert_message = 'Wrong output size. Expected size {}. Got size {}'.format(correct_output_size, output.size())
    assert_test.test(assert_condition, assert_message)

    _print_success_message()


def test_forward_back_prop(RNN, forward_back_prop, train_on_gpu):
    batch_size = 200
    input_size = 20
    output_size = 10
    sequence_length = 3
    embedding_dim = 15
    hidden_dim = 10
    n_layers = 2
    learning_rate = 0.01

    # create test RNN
    rnn = RNN(input_size, output_size, embedding_dim, hidden_dim, n_layers)

    mock_decoder = MagicMock(wraps=_TestNN(input_size, output_size))
    if train_on_gpu:
        mock_decoder.cuda()

    mock_decoder_optimizer = MagicMock(wraps=torch.optim.Adam(mock_decoder.parameters(), lr=learning_rate))
    mock_criterion = MagicMock(wraps=torch.nn.CrossEntropyLoss())

    with patch.object(torch.autograd, 'backward', wraps=torch.autograd.backward) as mock_autograd_backward:
        inp = torch.FloatTensor(np.random.rand(batch_size, input_size))
        target = torch.LongTensor(np.random.randint(output_size, size=batch_size))

        hidden = rnn.init_hidden(batch_size)

        loss, hidden_out = forward_back_prop(mock_decoder, mock_decoder_optimizer, mock_criterion, inp, target, hidden)

    if type(hidden_out) == tuple:
        # LSTM
        assert (hidden_out[0][0] == hidden[0][0]).sum() == batch_size * hidden_dim, 'Returned hidden state is the incorrect size.'
    else:
        # GRU
        assert (hidden_out[0] == hidden[0]).sum() == batch_size * hidden_dim, 'Returned hidden state is the incorrect size.'

    assert mock_decoder.zero_grad.called or mock_decoder_optimizer.zero_grad.called, 'Didn\'t set the gradients to 0.'
    assert mock_decoder.forward_called, 'Forward propagation not called.'
    assert mock_autograd_backward.called, 'Backward propagation not called'
    assert mock_decoder_optimizer.step.called, 'Optimization step not performed'
    assert type(loss) == float, 'Wrong return type. Expected {}, got {}'.format(float, type(loss))

    _print_success_message()
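
A hypothetical invocation sketch showing how the notebook might run this suite. RNN, forward_back_prop, create_lookup_tables, and token_lookup are assumed to be defined in dlnd_tv_script_generation.ipynb; only the problem_unittests entry points come from the file above. Each call prints 'Tests Passed' on success or raises an AssertionError with a diagnostic message:

import torch
import problem_unittests as tests

# Run the model tests on GPU only if one is actually available.
train_on_gpu = torch.cuda.is_available()

tests.test_create_lookup_tables(create_lookup_tables)
tests.test_tokenize(token_lookup)
tests.test_rnn(RNN, train_on_gpu)
tests.test_forward_back_prop(RNN, forward_back_prop, train_on_gpu)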

Diff for: trained_rnn.pt

37.3 MB
Binary file not shown.
