
Commit f8c10fc

committed May 21, 2015
stream/gpu functionality
1 parent b7118c8 commit f8c10fc

File tree

3 files changed: +91 -13 lines changed

 

README (+7 -1)
@@ -20,9 +20,15 @@ Arguments are mostly self-explanatory (see main.lua for default arguments)
 -lr : starting learning rate
 -min_lr : minimum learning rate--lr will linearly decay to this value
 -epochs : number of epochs to run
+-stream : whether to stream text data from HD or store in memory (1 = stream, 0 = not)
+-gpu : whether to use gpu (1 = use gpu, 0 = not)
 
 For example:
 
-th main.lua -corpus corpus.txt -window 3 -dim 100 -minfreq 10
+CPU:
+th main.lua -corpus corpus.txt -window 3 -dim 100 -minfreq 10 -stream 1 -gpu 0
+
+GPU:
+th main.lua -corpus corpus.txt -window 3 -dim 100 -minfreq 10 -stream 0 -gpu 1

main.lua (+7 -1)
@@ -14,12 +14,15 @@ config.corpus = "corpus.txt" -- input data
 config.window = 5 -- (maximum) window size
 config.dim = 100 -- dimensionality of word embeddings
 config.alpha = 0.75 -- smooth out unigram frequencies
-config.table_size = 1e7 -- table size from which to sample neg samples
+config.table_size = 1e8 -- table size from which to sample neg samples
 config.neg_samples = 5 -- number of negative samples for each positive sample
 config.minfreq = 10 --threshold for vocab frequency
 config.lr = 0.025 -- initial learning rate
 config.min_lr = 0.001 -- min learning rate
 config.epochs = 3 -- number of epochs to train
+config.gpu = 0 -- 1 = use gpu, 0 = use cpu
+config.stream = 1 -- 1 = stream from hard drive 0 = copy to memory first
+
 -- Parse input arguments
 cmd = torch.CmdLine()
 cmd:option("-corpus", config.corpus)
@@ -31,6 +34,8 @@ cmd:option("-min_lr", config.min_lr)
 cmd:option("-neg_samples", config.neg_samples)
 cmd:option("-table_size", config.table_size)
 cmd:option("-epochs", config.epochs)
+cmd:option("-gpu", config.gpu)
+cmd:option("-stream", config.stream)
 params = cmd:parse(arg)
 
 for param, value in pairs(params) do
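
The hunk ends at the top of the override loop, so its body is not part of this diff; presumably it copies each parsed flag back onto config, along the lines of the sketch below (illustrative only, check main.lua for the actual code):

-- sketch, not part of this diff: parsed CLI flags override the defaults above,
-- so e.g. "th main.lua -gpu 1 -stream 0" flips both new options
for param, value in pairs(params) do
    config[param] = value
end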
@@ -44,6 +49,7 @@ end
 m = Word2Vec(config)
 m:build_vocab(config.corpus)
 m:build_table()
+
 for k = 1, config.epochs do
     m.lr = config.lr -- reset learning rate at each epoch
     m:train_model(config.corpus)

word2vec.lua (+77 -11)
@@ -9,6 +9,8 @@ local Word2Vec = torch.class("Word2Vec")
 
 function Word2Vec:__init(config)
     self.tensortype = torch.getdefaulttensortype()
+    self.gpu = config.gpu -- 1 if train on gpu, otherwise cpu
+    self.stream = config.stream -- 1 if stream from hard drive, 0 otherwise
     self.neg_samples = config.neg_samples
     self.minfreq = config.minfreq
     self.dim = config.dim
@@ -27,7 +29,7 @@ function Word2Vec:__init(config)
     self.total_count = 0
 end
 
--- change to cuda
+-- move to cuda
 function Word2Vec:cuda()
     require("cunn")
     self.word = self.word:cuda()
@@ -77,7 +79,7 @@ function Word2Vec:build_vocab(corpus)
     self.w2v.modules[1]:add(self.word_vecs)
     self.w2v:add(nn.MM(false, true)) -- dot prod and sigmoid to get probabilities
     self.w2v:add(nn.Sigmoid())
-    self.decay = (self.min_lr-self.lr)/self.total_count
+    self.decay = (self.min_lr-self.lr)/(self.total_count*self.window)
 end
 
 -- Build a table of unigram frequencies from which to obtain negative samples
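
The window term in the new denominator matches the reworked training loop further down: the learning rate is now nudged by self.decay once per trained word-context pair (of which there are on the order of window per word) instead of once per word, so each step has to be correspondingly smaller for lr to reach min_lr by the end of training. A purely illustrative calculation with made-up corpus numbers:

-- illustrative numbers only, not from the repo
local lr, min_lr = 0.025, 0.001     -- defaults from main.lua
local total_count, window = 1e7, 5  -- assumed 10M-word corpus, default window
local decay = (min_lr - lr) / (total_count * window)
print(decay)  -- -4.8e-10, added to lr after every train_pair call and floored at min_lr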
@@ -127,8 +129,9 @@ function Word2Vec:sample_contexts(context)
     end
 end
 
--- Train on sentences
-function Word2Vec:train_model(corpus)
+-- Train on sentences that are streamed from the hard drive
+-- Check train_mem function to train from memory (after pre-loading data into tensor)
+function Word2Vec:train_stream(corpus)
     print("Training...")
     local start = sys.clock()
     local c = 0
@@ -147,14 +150,14 @@ function Word2Vec:train_model(corpus)
                         if context_idx ~= nil then -- valid context
                             self:sample_contexts(context_idx) -- update pos/neg contexts
                             self:train_pair(self.word, self.contexts) -- train word context pair
+                            c = c + 1
+                            self.lr = math.max(self.min_lr, self.lr + self.decay)
+                            if c % 100000 ==0 then
+                                print(string.format("%d words trained in %.2f seconds. Learning rate: %.4f", c, sys.clock() - start, self.lr))
+                            end
                         end
                     end
-                end
-                c = c + 1
-                self.lr = math.max(self.min_lr, self.lr + self.decay)
-                if c % 100000 ==0 then
-                    print(string.format("%d words trained in %.2f seconds. Learning rate: %.4f", c, sys.clock() - start, self.lr))
-                end
+                end
             end
         end
     end
@@ -173,7 +176,7 @@ end
 -- w can be a string such as "king" or a vector for ("king" - "queen" + "man")
 function Word2Vec:get_sim_words(w, k)
     if self.word_vecs_norm == nil then
-        self.word_vecs_norm = self:normalize(self.word_vecs.weight)
+        self.word_vecs_norm = self:normalize(self.word_vecs.weight:double())
     end
     if type(w) == "string" then
         if self.word2index[w] == nil then
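
Converting the weights with :double() before normalizing keeps the similarity lookup on a CPU DoubleTensor even when the embeddings were trained with -gpu 1. A usage sketch (it assumes a trained model m as built in main.lua; the return format of get_sim_words is not shown in this diff):

-- assumed usage, not part of this commit: 5 nearest neighbours of "king"
-- (see get_sim_words in word2vec.lua for the exact return format)
local sim_words = m:get_sim_words("king", 5)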
@@ -216,3 +219,66 @@ function Word2Vec:split(input, sep)
     end
     return t
 end
+
+-- pre-load data as a torch tensor instead of streaming it. this requires a lot of memory,
+-- so if the corpus is huge you should partition into smaller sets
+function Word2Vec:preload_data(corpus)
+    print("Preloading training corpus into tensors (Warning: this takes a lot of memory)")
+    local start = sys.clock()
+    local c = 0
+    f = io.open(corpus, "r")
+    self.train_words = {}; self.train_contexts = {}
+    for line in f:lines() do
+        sentence = self:split(line)
+        for i, word in ipairs(sentence) do
+            word_idx = self.word2index[word]
+            if word_idx ~= nil then -- word exists in vocab
+                local reduced_window = torch.random(self.window) -- pick random window size
+                self.word[1] = word_idx -- update current word
+                for j = i - reduced_window, i + reduced_window do -- loop through contexts
+                    local context = sentence[j]
+                    if context ~= nil and j ~= i then -- possible context
+                        context_idx = self.word2index[context]
+                        if context_idx ~= nil then -- valid context
+                            c = c + 1
+                            self:sample_contexts(context_idx) -- update pos/neg contexts
+                            if self.gpu==1 then
+                                self.train_words[c] = self.word:clone():cuda()
+                                self.train_contexts[c] = self.contexts:clone():cuda()
+                            else
+                                self.train_words[c] = self.word:clone()
+                                self.train_contexts[c] = self.contexts:clone()
+                            end
+                        end
+                    end
+                end
+            end
+        end
+    end
+    print(string.format("%d word-contexts processed in %.2f seconds", c, sys.clock() - start))
+end
+
+-- train from memory. this is needed to speed up GPU training
+function Word2Vec:train_mem()
+    local start = sys.clock()
+    for i = 1, #self.train_words do
+        self:train_pair(self.train_words[i], self.train_contexts[i])
+        self.lr = math.max(self.min_lr, self.lr + self.decay)
+        if i%100000==0 then
+            print(string.format("%d words trained in %.2f seconds. Learning rate: %.4f", i, sys.clock() - start, self.lr))
+        end
+    end
+end
+
+-- train the model using config parameters
+function Word2Vec:train_model(corpus)
+    if self.gpu==1 then
+        self:cuda()
+    end
+    if self.stream==1 then
+        self:train_stream(corpus)
+    else
+        self:preload_data(corpus)
+        self:train_mem()
+    end
+end
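
The header comment on preload_data recommends partitioning a huge corpus instead of loading it all at once. One way to act on that, sketched with hypothetical shard file names (corpus_part1.txt etc. are placeholders; this driver loop is not part of the commit):

-- hypothetical sharding driver, not part of this commit
-- the vocab and unigram table are still built once over the full corpus;
-- each epoch then preloads and trains one shard at a time (config.stream = 0)
local parts = {"corpus_part1.txt", "corpus_part2.txt", "corpus_part3.txt"}
m = Word2Vec(config)
m:build_vocab("corpus.txt")
m:build_table()
for k = 1, config.epochs do
    m.lr = config.lr -- reset learning rate at each epoch
    for _, part in ipairs(parts) do
        m:train_model(part)
    end
end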
