
Commit

changing default LSTM initialization to use biases of 1.0 for the forget gates, which is a common trick and encourages remembering in the beginning of training. I ran 2 medium-sized experiments and saw small improvements from this choice in both cases.
karpathy committed Sep 20, 2015
1 parent 065c5ff commit 0dfeaa4
Showing 2 changed files with 16 additions and 4 deletions.
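
A note on why 1.0 (explanatory, not part of the commit): with small uniform weights the forget-gate pre-activation is dominated by its bias early in training, so a bias of 1.0 starts the gate near sigmoid(1) ≈ 0.73 rather than sigmoid(0) = 0.5, meaning the cell state initially retains roughly three quarters of its contents at each step. A minimal Torch sketch of that effect (rnn_size here is just an illustrative value):

-- illustrative only: at init the forget-gate activation is roughly sigmoid(bias)
require 'torch'
require 'nn'
local rnn_size = 128
local gate_with_zero_bias = nn.Sigmoid():forward(torch.zeros(rnn_size)) -- all 0.5
local gate_with_one_bias  = nn.Sigmoid():forward(torch.ones(rnn_size))  -- all ~0.731
print(gate_with_zero_bias[1], gate_with_one_bias[1])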
6 changes: 3 additions & 3 deletions model/LSTM.lua
@@ -27,8 +27,8 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
       input_size_L = rnn_size
     end
     -- evaluate the input sums at once for efficiency
-    local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x)
-    local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h)
+    local i2h = nn.Linear(input_size_L, 4 * rnn_size)(x):annotate{name='i2h_'..L}
+    local h2h = nn.Linear(rnn_size, 4 * rnn_size)(prev_h):annotate{name='h2h_'..L}
     local all_input_sums = nn.CAddTable()({i2h, h2h})

     local reshaped = nn.Reshape(4, rnn_size)(all_input_sums)
@@ -54,7 +54,7 @@ function LSTM.lstm(input_size, rnn_size, n, dropout)
   -- set up the decoder
   local top_h = outputs[#outputs]
   if dropout > 0 then top_h = nn.Dropout(dropout)(top_h) end
-  local proj = nn.Linear(rnn_size, input_size)(top_h)
+  local proj = nn.Linear(rnn_size, input_size)(top_h):annotate{name='decoder'}
   local logsoft = nn.LogSoftMax()(proj)
   table.insert(outputs, logsoft)

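The :annotate{name=...} calls above are what make this trick possible: train.lua (next file) looks the i2h modules up by name in protos.rnn.forwardnodes and edits their biases in place. For orientation, the surrounding code in LSTM.lua decodes the combined 4 * rnn_size pre-activation into the four gates roughly as below (a sketch reconstructed from the file, not part of this diff); the forget gate is the second block, which is why the bias slice starts at rnn_size+1.

-- sketch: the gates are decoded in the order i, f, o, g
local reshaped = nn.Reshape(4, rnn_size)(all_input_sums)
local n1, n2, n3, n4 = nn.SplitTable(2)(reshaped):split(4)
local in_gate      = nn.Sigmoid()(n1) -- i: bias entries 1 .. rnn_size
local forget_gate  = nn.Sigmoid()(n2) -- f: bias entries rnn_size+1 .. 2*rnn_size (set to 1.0 below)
local out_gate     = nn.Sigmoid()(n3) -- o
local in_transform = nn.Tanh()(n4)    -- g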
14 changes: 13 additions & 1 deletion train.lua
@@ -171,7 +171,19 @@ params, grad_params = model_utils.combine_all_parameters(protos.rnn)

 -- initialization
 if do_random_init then
-    params:uniform(-0.08, 0.08) -- small numbers uniform
+    params:uniform(-0.08, 0.08) -- small uniform numbers
 end
+-- initialize the LSTM forget gates with slightly higher biases to encourage remembering in the beginning
+if opt.model == 'lstm' then
+    for layer_idx = 1, opt.num_layers do
+        for _,node in ipairs(protos.rnn.forwardnodes) do
+            if node.data.annotations.name == "i2h_" .. layer_idx then
+                print('setting forget gate biases to 1 in LSTM layer ' .. layer_idx)
+                -- the gates are, in order, i,f,o,g, so f is the 2nd block of weights
+                node.data.module.bias[{{opt.rnn_size+1, 2*opt.rnn_size}}]:fill(1.0)
+            end
+        end
+    end
+end

 print('number of parameters in the model: ' .. params:nElement())
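
Note that only the i2h biases are touched; since the i2h and h2h biases are summed by nn.CAddTable, raising one of them to 1.0 is enough to shift the forget gate's effective bias. A hypothetical quick check after the model is built (variable names follow train.lua; the expected output is 1 1 for every layer):

-- hypothetical sanity check: the forget-gate block of each i2h bias should now be all ones
for _,node in ipairs(protos.rnn.forwardnodes) do
    local name = node.data.annotations.name
    if name ~= nil and name:find('^i2h_') then
        local fgate_bias = node.data.module.bias[{{opt.rnn_size+1, 2*opt.rnn_size}}]
        print(name, fgate_bias:min(), fgate_bias:max())
    end
end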
