
Commit

changing default LSTM initialization to use biases of 1.0 for the forget gates, which is a common trick and encourages remembering in the beginning of training. I ran 2 medium-sized experiments and saw small improvements from this choice in both cases.
karpathy committed Sep 20, 2015
1 parent 065c5ff commit 0dfeaa4
Showing 2 changed files with 16 additions and 4 deletions.
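
A note on why 1.0 (explanatory, not part of the commit): with small uniform weights the forget-gate pre-activation is dominated by its bias early in training, so a bias of 1.0 starts the gate near sigmoid(1) ≈ 0.73 rather than sigmoid(0) = 0.5, meaning the cell state initially retains roughly three quarters of its contents at each step. A minimal Torch sketch of that effect (rnn_size here is just an illustrative value):

-- illustrative only: at init the forget-gate activation is roughly sigmoid(bias)
require 'torch'
require 'nn'
local rnn_size = 128
local gate_with_zero_bias = nn.Sigmoid():forward(torch.zeros(rnn_size)) -- all 0.5
local gate_with_one_bias  = nn.Sigmoid():forward(torch.ones(rnn_size))  -- all ~0.731
print(gate_with_zero_bias[1], gate_with_one_bias[1])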
6 changes: 3 additions & 3 deletions model/LSTM.lua
@@ -27,8 +27,8 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
       input_size_L = rnn_size
     end
     -- evaluate the input sums at once for efficiency
-    local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x)
-    local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h)
+    local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x):annotate{name='i2h_'..L}
+    local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h):annotate{name='h2h_'..L}
     local all_input_sums = nn.CAddTable()({i2h, h2h})

     local reshaped = nn.Reshape(4, rnn_size)(all_input_sums)
@@ -54,7 +54,7 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
   -- set up the decoder
   local top_h = outputs[#outputs]
   if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end
-  local proj = nn.Linear(rnn_size, input_size)(top_h)
+  local proj = nn.Linear(rnn_size, input_size)(top_h):annotate{name='decoder'}
   local logsoft = nn.LogSoftMax()(proj)
   table.insert(outputs, logsoft)

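The :annotate{name=...} calls above are what make this trick possible: train.lua (next file) looks the i2h modules up by name in protos.rnn.forwardnodes and edits their biases in place. For orientation, the surrounding code in LSTM.lua decodes the combined 4 * rnn_size pre-activation into the four gates roughly as below (a sketch reconstructed from the file, not part of this diff); the forget gate is the second block, which is why the bias slice starts at rnn_size+1.

-- sketch: the gates are decoded in the order i, f, o, g
local reshaped = nn.Reshape(4, rnn_size)(all_input_sums)
local n1, n2, n3, n4 = nn.SplitTable(2)(reshaped):split(4)
local in_gate      = nn.Sigmoid()(n1) -- i: bias entries 1 .. rnn_size
local forget_gate  = nn.Sigmoid()(n2) -- f: bias entries rnn_size+1 .. 2*rnn_size (set to 1.0 below)
local out_gate     = nn.Sigmoid()(n3) -- o
local in_transform = nn.Tanh()(n4)    -- g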
14 changes: 13 additions & 1 deletion train.lua
@@ -171,7 +171,19 @@ params, grad_params = model_utils.combine_all_parameters(protos.rnn)

 -- initialization
 if do_random_init then
-    params:uniform(-0.08, 0.08) -- small numbers uniform
+    params:uniform(-0.08, 0.08) -- small uniform numbers
 end
+-- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning
+if opt.model == 'lstm' then
+    for layer_idx = 1, opt.num_layers do
+        for _,node in ipairs(protos.rnn.forwardnodes) do
+            if node.data.annotations.name == "i2h_" .. layer_idx then
+                print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx)
+                -- the gates are, in order, i,f,o,g, so f is the 2nd block of weights
+                node.data.module.bias[{{opt.rnn_size+1, 2*opt.rnn_size}}]:fill(1.0)
+            end
+        end
+    end
+end

 print('number of parameters in the model: ' .. params:nElement())
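
Note that only the i2h biases are touched; since the i2h and h2h biases are summed by nn.CAddTable, raising one of them to 1.0 is enough to shift the forget gate's effective bias. A hypothetical quick check after the model is built (variable names follow train.lua; the expected output is 1 1 for every layer):

-- hypothetical sanity check: the forget-gate block of each i2h bias should now be all ones
for _,node in ipairs(protos.rnn.forwardnodes) do
    local name = node.data.annotations.name
    if name ~= nil and name:find('^i2h_') then
        local fgate_bias = node.data.module.bias[{{opt.rnn_size+1, 2*opt.rnn_size}}]
        print(name, fgate_bias:min(), fgate_bias:max())
    end
end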
