@@ -9,6 +9,8 @@ local Word2Vec = torch.class("Word2Vec")
 function Word2Vec:__init(config)
     self.tensortype = torch.getdefaulttensortype()
+    self.gpu = config.gpu -- 1 to train on gpu, otherwise cpu
+    self.stream = config.stream -- 1 to stream corpus from hard drive, 0 otherwise
     self.neg_samples = config.neg_samples
     self.minfreq = config.minfreq
     self.dim = config.dim
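For orientation, here is a config table that would exercise the two new flags. Every field name below is read somewhere in this diff, but the values, and the table itself, are illustrative rather than taken from the repository:

    -- Illustrative config; only gpu and stream are new in this commit.
    local config = {
        dim = 100,         -- embedding dimensionality
        window = 5,        -- max context window (halved at random per word)
        minfreq = 10,      -- discard words rarer than this
        neg_samples = 5,   -- negative samples per positive pair
        lr = 0.025,        -- starting learning rate
        min_lr = 0.0001,   -- floor the learning rate anneals toward
        gpu = 0,           -- 1 = train on GPU (requires cunn)
        stream = 1,        -- 1 = stream corpus from disk, 0 = preload into tensors
    }
    local w2v = Word2Vec(config)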
@@ -27,7 +29,7 @@ function Word2Vec:__init(config)
     self.total_count = 0
 end
 
--- change to cuda
+-- move to cuda
 function Word2Vec:cuda()
     require("cunn")
     self.word = self.word:cuda()
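Worth noting: require("cunn") pulls in cutorch transitively, and each :cuda() call copies a tensor or module into device memory. This is also presumably why preload_data (added at the bottom of this diff) clones training pairs directly to CUDA tensors when gpu == 1: train_mem can then iterate without a host-to-device copy per pair.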
@@ -77,7 +79,7 @@ function Word2Vec:build_vocab(corpus)
     self.w2v.modules[1]:add(self.word_vecs)
     self.w2v:add(nn.MM(false, true)) -- dot prod and sigmoid to get probabilities
     self.w2v:add(nn.Sigmoid())
-    self.decay = (self.min_lr - self.lr) / self.total_count
+    self.decay = (self.min_lr - self.lr) / (self.total_count * self.window)
 end
 
 -- Build a table of unigram frequencies from which to obtain negative samples
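A note on the sign convention: min_lr < lr, so self.decay is negative and every self.lr + self.decay step anneals the rate toward the floor. The new self.window factor matches the learning rate now being stepped once per trained word-context pair (see the train_stream hunk below) rather than once per word. A back-of-envelope with made-up numbers:

    -- Illustrative arithmetic only; none of these values come from the repo.
    -- With lr = 0.025, min_lr = 0.0001, total_count = 1e6 words, window = 5:
    --   decay = (0.0001 - 0.025) / (1e6 * 5)  -- about -5e-9 per update
    -- so lr reaches min_lr after roughly 5e6 pair updates. Each word yields
    -- at most 2 * reduced_window <= 2 * window pairs, so total_count * window
    -- is itself only an estimate of the true pair count.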
@@ -127,8 +129,9 @@ function Word2Vec:sample_contexts(context)
     end
 end
 
--- Train on sentences
-function Word2Vec:train_model(corpus)
+-- Train on sentences that are streamed from the hard drive
+-- See train_mem to train from memory instead (after pre-loading data into tensors)
+function Word2Vec:train_stream(corpus)
     print("Training...")
     local start = sys.clock()
     local c = 0
@@ -147,14 +150,14 @@ function Word2Vec:train_model(corpus)
                         if context_idx ~= nil then -- valid context
                             self:sample_contexts(context_idx) -- update pos/neg contexts
                             self:train_pair(self.word, self.contexts) -- train word context pair
+                            c = c + 1
+                            self.lr = math.max(self.min_lr, self.lr + self.decay)
+                            if c % 100000 == 0 then
+                                print(string.format("%d words trained in %.2f seconds. Learning rate: %.4f", c, sys.clock() - start, self.lr))
+                            end
                         end
                     end
-                end
-                c = c + 1
-                self.lr = math.max(self.min_lr, self.lr + self.decay)
-                if c % 100000 == 0 then
-                    print(string.format("%d words trained in %.2f seconds. Learning rate: %.4f", c, sys.clock() - start, self.lr))
-                end
+                end
             end
         end
     end
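With this relocation, c and the learning-rate step now track trained word-context pairs rather than ticking once per in-vocabulary word, which is exactly the accounting the total_count * window decay denominator introduced in build_vocab assumes.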
@@ -173,7 +176,7 @@
 -- w can be a string such as "king" or a vector for ("king" - "queen" + "man")
 function Word2Vec:get_sim_words(w, k)
     if self.word_vecs_norm == nil then
-        self.word_vecs_norm = self:normalize(self.word_vecs.weight)
+        self.word_vecs_norm = self:normalize(self.word_vecs.weight:double())
     end
     if type(w) == "string" then
         if self.word2index[w] == nil then
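A hedged usage sketch for the query path above; the return shape, a sorted list of {word, cosine similarity} pairs, is assumed from context rather than confirmed by this diff, which cuts off before the function body:

    -- Hypothetical usage; assumes "king" survived the minfreq cutoff.
    local neighbors = w2v:get_sim_words("king", 5) -- 5 nearest neighbors
    for _, p in ipairs(neighbors) do
        print(p[1], p[2]) -- word, similarity (assumed return shape)
    end

The :double() fix matters here because, after :cuda(), word_vecs.weight is a CUDA tensor; pulling it back to a DoubleTensor lets similarity queries run on the CPU.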
@@ -216,3 +219,66 @@ function Word2Vec:split(input, sep)
     end
     return t
 end
+
+-- pre-load data as a torch tensor instead of streaming it. this requires a lot of memory,
+-- so if the corpus is huge you should partition it into smaller sets
+function Word2Vec:preload_data(corpus)
+    print("Preloading training corpus into tensors (Warning: this takes a lot of memory)")
+    local start = sys.clock()
+    local c = 0
+    f = io.open(corpus, "r")
+    self.train_words = {}; self.train_contexts = {}
+    for line in f:lines() do
+        sentence = self:split(line)
+        for i, word in ipairs(sentence) do
+            word_idx = self.word2index[word]
+            if word_idx ~= nil then -- word exists in vocab
+                local reduced_window = torch.random(self.window) -- pick random window size
+                self.word[1] = word_idx -- update current word
+                for j = i - reduced_window, i + reduced_window do -- loop through contexts
+                    local context = sentence[j]
+                    if context ~= nil and j ~= i then -- possible context
+                        context_idx = self.word2index[context]
+                        if context_idx ~= nil then -- valid context
+                            c = c + 1
+                            self:sample_contexts(context_idx) -- update pos/neg contexts
+                            if self.gpu == 1 then
+                                self.train_words[c] = self.word:clone():cuda()
+                                self.train_contexts[c] = self.contexts:clone():cuda()
+                            else
+                                self.train_words[c] = self.word:clone()
+                                self.train_contexts[c] = self.contexts:clone()
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+    print(string.format("%d word-contexts processed in %.2f seconds", c, sys.clock() - start))
+end
+
+-- train from memory. this is needed to speed up GPU training
+function Word2Vec:train_mem()
+    local start = sys.clock()
+    for i = 1, #self.train_words do
+        self:train_pair(self.train_words[i], self.train_contexts[i])
+        self.lr = math.max(self.min_lr, self.lr + self.decay)
+        if i % 100000 == 0 then
+            print(string.format("%d words trained in %.2f seconds. Learning rate: %.4f", i, sys.clock() - start, self.lr))
+        end
+    end
+end
+
+-- train the model using config parameters
+function Word2Vec:train_model(corpus)
+    if self.gpu == 1 then
+        self:cuda()
+    end
+    if self.stream == 1 then
+        self:train_stream(corpus)
+    else
+        self:preload_data(corpus)
+        self:train_mem()
+    end
+end
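Tying the new entry point together, a minimal driver might look like the following. The corpus path and config values are placeholders, and build_vocab is assumed to run before training since it is what sets up self.decay and the w2v network:

    -- Minimal end-to-end sketch; path and values are placeholders.
    local config = { dim = 100, window = 5, minfreq = 10, neg_samples = 5,
                     lr = 0.025, min_lr = 0.0001, gpu = 1, stream = 0 }
    local w2v = Word2Vec(config)
    w2v:build_vocab("corpus.txt")
    w2v:train_model("corpus.txt") -- gpu=1 moves the model to CUDA; stream=0 preloads, then train_mem
    w2v:get_sim_words("king", 5)  -- query once training is done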