
Commit 96fe374 (committed Apr 11, 2017)

Merge branch 'master' of github.com:jimfleming/recurrent-entity-networks

2 parents: 086e294 + 365f058

16 files changed: +556 −439 lines

LICENSE (+21)

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016 Jim Fleming
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md (+41 −5)

@@ -1,16 +1,52 @@
 # Recurrent Entity Networks
 
-This repository contains a TensorFlow implementation of recurrent entity networks from [Tracking the World State with
-Recurrent Entity Networks](https://openreview.net/forum?id=rJTKKKqeg).
+This repository contains an independent TensorFlow implementation of recurrent entity networks from [Tracking the World State with
+Recurrent Entity Networks](https://openreview.net/forum?id=rJTKKKqeg). This paper introduces the first method to solve all of the bAbI tasks using 10k training examples. The authors' original Torch implementation is available [here](https://github.com/facebook/MemNN/tree/master/EntNet-babi).
 
-![Diagram of recurrent entity network](images/diagram.png)
+<img src="images/diagram.png" alt="Diagram of recurrent entity network" width="886" height="658">
+
+## Results
+
+Per-task percent error, comparing the results reported in the paper with those from the implementation in this repository.
+
+Task | EntNet (paper) | EntNet (repo)
+--- | --- | ---
+1: 1 supporting fact | 0 | 0
+2: 2 supporting facts | 0.1 | 3.0
+3: 3 supporting facts | 4.1 | ?
+4: 2 argument relations | 0 | 0
+5: 3 argument relations | 0.3 | ?
+6: yes/no questions | 0.2 | 0.1
+7: counting | 0 | ?
+8: lists/sets | 0.5 | ?
+9: simple negation | 0.1 | 0.7
+10: indefinite knowledge | 0.6 | 0.1
+11: basic coreference | 0.3 | 0
+12: conjunction | 0 | 0
+13: compound coreference | 1.3 | 0
+14: time reasoning | 0 | 4.5
+15: basic deduction | 0 | 0
+16: basic induction | 0.2 | 54.0 ([#5](../../issues/5))
+17: positional reasoning | 0.5 | 1.7
+18: size reasoning | 0.3 | 1.5
+19: path finding | 2.3 | 41.9 ([#5](../../issues/5))
+20: agent's motivations | 0 | 0.2
+**Failed Tasks** | 0 | ?
+**Mean Error** | 0.5 | ?
 
 ## Setup
 
 1. Download the datasets by running [download_datasets.sh](download_datasets.sh) or from [The bAbI Project](https://research.facebook.com/research/babi/).
 2. Run [prep_datasets.py](prep_datasets.py), which will convert the datasets into [TFRecords](https://www.tensorflow.org/versions/r0.11/how_tos/reading_data/index.html#standard_tensorflow_format).
-3. Run `python -m entity_networks.main` to begin training.
+3. Run `python -m entity_networks.main` to begin training on QA1.
+4. Run `./run_all.sh` to train on all tasks.
 
 ## Dependencies
 
-- TensorFlow v0.11rc0
+- TensorFlow v0.11
+
+## Thanks!
+
+- Thanks to Mikael Henaff for providing details about their paper over Thanksgiving break. :)
+- Thanks to Andy Zhang ([@zhangandyx](https://twitter.com/zhangandyx)) for helping me troubleshoot numerical instabilities.
+- Thanks to Mike Young for providing results on some of the longer tasks.

download_datasets.sh (+4)

@@ -1,5 +1,9 @@
 #!/bin/bash
 
+if [ ! -d ./datasets ]; then
+    mkdir -p ./datasets
+fi
+
 BABI_TASKS=datasets/babi_tasks_data_1_20_v1.2.tar.gz
 DIALOG_TASKS=datasets/dialog_babi_tasks_data_1_6.tgz
 CHILDRENS_BOOK=datasets/childrens_book_test.tgz

entity_networks/activations.py (+2 −4)

@@ -8,10 +8,8 @@ def prelu(features, initializer=None, scope=None):
     """
     Implementation of [Parametric ReLU](https://arxiv.org/abs/1502.01852) borrowed from Keras.
     """
-    with tf.variable_scope(scope, 'PReLU'):
-        alpha = tf.get_variable('alpha',
-            shape=features.get_shape().as_list()[1:],
-            initializer=initializer)
+    with tf.variable_scope(scope, 'PReLU', initializer=initializer):
+        alpha = tf.get_variable('alpha', features.get_shape().as_list()[1:])
     pos = tf.nn.relu(features)
     neg = alpha * (features - tf.abs(features)) * 0.5
    return pos + neg
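
The refactor moves `initializer` up into the enclosing `tf.variable_scope`, where `alpha` picks it up as the scope default. For reference, here is a minimal NumPy sketch (an illustration, not part of the commit) of what `prelu` computes; `(x - |x|) * 0.5` is simply `min(x, 0)`, so this is the standard parametric ReLU:

```python
import numpy as np

def prelu_reference(features, alpha):
    """Reference PReLU: max(0, x) + alpha * min(0, x)."""
    pos = np.maximum(features, 0.0)
    # (x - |x|) * 0.5 is zero for positive x and x for negative x, i.e. min(x, 0),
    # the same trick the TensorFlow implementation uses.
    neg = alpha * (features - np.abs(features)) * 0.5
    return pos + neg

x = np.array([-2.0, -0.5, 0.0, 1.5])
print(prelu_reference(x, alpha=0.25))  # [-0.5   -0.125  0.     1.5  ]
```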

entity_networks/dataset.py (+43 −83)

@@ -2,90 +2,50 @@
 from __future__ import print_function
 from __future__ import division
 
+import os
+import json
 import tensorflow as tf
 
-MAX_SENTENCE_LENGTH = 7
-MAX_STORY_LENGTH = 10
-MAX_QUERY_LENGTH = 4
-
-DATASET_SIZE = 10000
-VOCAB_SIZE = 22
-
-def record_reader(filename_queue):
-    reader = tf.TFRecordReader()
-    _, serialized = reader.read(filename_queue)
-
-    features = tf.parse_single_example(serialized, features={
-        "story": tf.FixedLenFeature([MAX_STORY_LENGTH, MAX_SENTENCE_LENGTH], dtype=tf.int64),
-        "query": tf.FixedLenFeature([1, MAX_QUERY_LENGTH], dtype=tf.int64),
-        "answer": tf.FixedLenFeature([], dtype=tf.int64),
-    })
-
-    story = features['story']
-    query = features['query']
-    answer = features['answer']
-
-    return story, query, answer
-
 class Dataset(object):
 
-    def __init__(self, filename, batch_size, shuffle=False):
-        self._batch_size = batch_size
-
-        filename_queue = tf.train.string_input_producer([filename], shuffle=shuffle)
-        records = record_reader(filename_queue)
-
-        min_after_dequeue = DATASET_SIZE
-        capacity = min_after_dequeue + 100 * batch_size
-
-        if shuffle:
-            self._story_batch, self._query_batch, self._answer_batch = \
-                tf.train.shuffle_batch(records,
-                    batch_size=batch_size,
-                    min_after_dequeue=min_after_dequeue,
-                    capacity=capacity)
-        else:
-            self._story_batch, self._query_batch, self._answer_batch = \
-                tf.train.batch(records,
-                    batch_size=batch_size,
-                    capacity=capacity)
-
-    @property
-    def story_batch(self):
-        return self._story_batch
-
-    @property
-    def query_batch(self):
-        return self._query_batch
-
-    @property
-    def answer_batch(self):
-        return self._answer_batch
-
-    @property
-    def batch_size(self):
-        return self._batch_size
-
-    @property
-    def max_sentence_length(self):
-        return MAX_SENTENCE_LENGTH
-
-    @property
-    def max_story_length(self):
-        return MAX_STORY_LENGTH
-
-    @property
-    def max_query_length(self):
-        return MAX_QUERY_LENGTH
-
-    @property
-    def vocab_size(self):
-        return VOCAB_SIZE
-
-    @property
-    def size(self):
-        return DATASET_SIZE
-
-    @property
-    def num_batches(self):
-        return DATASET_SIZE // self._batch_size
+    def __init__(self, dataset_path, batch_size):
+        self.dataset_dir = os.path.dirname(dataset_path)
+        self.batch_size = batch_size
+        self.examples_per_epoch = 10000
+
+        with open(dataset_path) as f:
+            metadata = json.load(f)
+
+        self.max_sentence_length = metadata['max_sentence_length']
+        self.max_story_length = metadata['max_story_length']
+        self.max_query_length = metadata['max_query_length']
+        self.dataset_size = metadata['dataset_size']
+        self.vocab_size = metadata['vocab_size']
+        self.tokens = metadata['tokens']
+        self.datasets = metadata['datasets']
+
+    @property
+    def steps_per_epoch(self):
+        return self.batch_size * self.examples_per_epoch
+
+    def get_input_fn(self, name, num_epochs, shuffle):
+        def input_fn():
+            features = {
+                "story": tf.FixedLenFeature([self.max_story_length, self.max_sentence_length], dtype=tf.int64),
+                "query": tf.FixedLenFeature([1, self.max_query_length], dtype=tf.int64),
+                "answer": tf.FixedLenFeature([], dtype=tf.int64),
+            }
+
+            dataset_path = os.path.join(self.dataset_dir, self.datasets[name])
+            features = tf.contrib.learn.read_batch_record_features(dataset_path,
+                features=features,
+                batch_size=self.batch_size,
+                randomize_input=shuffle,
+                num_epochs=num_epochs)
+
+            story = features['story']
+            query = features['query']
+            answer = features['answer']
+
+            return {'story': story, 'query': query}, answer
+        return input_fn
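
The rewritten `Dataset` replaces the hard-coded QA1 constants with a JSON metadata file written by `prep_datasets.py`. As a rough sketch only (the token list and TFRecord file names below are hypothetical; the numeric values mirror the constants the old code hard-coded for QA1), the metadata and the new API could be exercised like this:

```python
# Illustration only: a plausible metadata file, inferred from the keys
# Dataset.__init__ reads. File names and tokens are hypothetical.
import json

metadata = {
    "max_sentence_length": 7,
    "max_story_length": 10,
    "max_query_length": 4,
    "dataset_size": 10000,
    "vocab_size": 22,
    "tokens": ["_PAD", "mary", "went", "to", "the", "kitchen"],  # truncated, hypothetical
    "datasets": {
        "train": "qa1_train.tfrecords",  # hypothetical file names
        "test": "qa1_test.tfrecords",
    },
}

with open("qa1.json", "w") as f:
    json.dump(metadata, f, indent=2)

# Hypothetical usage with the new class:
#   dataset = Dataset("qa1.json", batch_size=32)
#   input_fn = dataset.get_input_fn("train", num_epochs=None, shuffle=True)
#   features, answer = input_fn()  # batched tensors for tf.contrib.learn
```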

entity_networks/dynamic_memory_cell.py (+26 −21)

@@ -11,10 +11,12 @@ class DynamicMemoryCell(tf.nn.rnn_cell.RNNCell):
     The cell's hidden state is divided into blocks and each block's weights are tied.
     """
 
-    def __init__(self, num_blocks, num_units_per_block, activation=tf.nn.relu):
+    def __init__(self, num_blocks, num_units_per_block, keys, initializer=None, activation=tf.nn.relu):
         self._num_blocks = num_blocks # M
         self._num_units_per_block = num_units_per_block # d
+        self._keys = keys
         self._activation = activation # \phi
+        self._initializer = initializer
 
     @property
     def state_size(self):
@@ -24,14 +26,22 @@ def state_size(self):
     def output_size(self):
         return self._num_blocks * self._num_units_per_block
 
-    def get_gate(self, inputs, state_j, key_j):
+    def zero_state(self, batch_size, dtype):
         """
-        Implements the gate (a scalar for each block). Equation 2:
+        We initialize the memory to the key values.
+        """
+        zero_state = tf.concat(1, [tf.expand_dims(key, 0) for key in self._keys])
+        zero_state_batch = tf.tile(zero_state, tf.pack([batch_size, 1]))
+        return zero_state_batch
+
+    def get_gate(self, state_j, key_j, inputs):
+        """
+        Implements the gate (scalar for each block). Equation 2:
 
         g_j <- \sigma(s_t^T h_j + s_t^T w_j)
         """
         a = tf.reduce_sum(inputs * state_j, reduction_indices=[1])
-        b = tf.reduce_sum(inputs * key_j, reduction_indices=[1])
+        b = tf.reduce_sum(inputs * tf.expand_dims(key_j, 0), reduction_indices=[1])
         return tf.sigmoid(a + b)
 
     def get_candidate(self, state_j, key_j, inputs, U, V, W):
@@ -41,41 +51,36 @@ def get_candidate(self, state_j, key_j, inputs, U, V, W):
 
         h_j^~ <- \phi(U h_j + V w_j + W s_t)
         """
+        key_V = tf.matmul(tf.expand_dims(key_j, 0), V)
         state_U = tf.matmul(state_j, U)
         inputs_W = tf.matmul(inputs, W)
-        key_V = tf.matmul(tf.expand_dims(key_j, 0), V)
         return self._activation(state_U + key_V + inputs_W)
 
     def __call__(self, inputs, state, scope=None):
-        with tf.variable_scope(scope or type(self).__name__):
+        with tf.variable_scope(scope or type(self).__name__, initializer=self._initializer):
             # Split the hidden state into blocks (each U, V, W are shared across blocks).
             state = tf.split(1, self._num_blocks, state)
 
-            U = tf.get_variable('U',
-                shape=[self._num_units_per_block, self._num_units_per_block],
-                initializer=tf.random_normal_initializer(0.1))
-            V = tf.get_variable('V',
-                shape=[self._num_units_per_block, self._num_units_per_block],
-                initializer=tf.random_normal_initializer(0.1))
-            W = tf.get_variable('W',
-                shape=[self._num_units_per_block, self._num_units_per_block],
-                initializer=tf.random_normal_initializer(0.1))
+            # TODO: ortho init?
+            U = tf.get_variable('U', [self._num_units_per_block, self._num_units_per_block])
+            V = tf.get_variable('V', [self._num_units_per_block, self._num_units_per_block])
+            W = tf.get_variable('W', [self._num_units_per_block, self._num_units_per_block])
+
+            # TODO: layer norm?
 
             next_states = []
            for j, state_j in enumerate(state): # Hidden State (j)
-                key_j = tf.get_variable('key_{}'.format(j),
-                    shape=[self._num_units_per_block],
-                    initializer=tf.random_normal_initializer(0.1))
-                gate_j = self.get_gate(inputs, state_j, key_j)
+                key_j = self._keys[j]
+                gate_j = self.get_gate(state_j, key_j, inputs)
                 candidate_j = self.get_candidate(state_j, key_j, inputs, U, V, W)
 
                 # Equation 4: h_j <- h_j + g_j * h_j^~
                 # Perform an update of the hidden state (memory).
                 state_j_next = state_j + tf.expand_dims(gate_j, -1) * candidate_j
 
                 # Equation 5: h_j <- h_j / \norm{h_j}
-                # Forgot previous memories by normalization.
-                state_j_next = tf.nn.l2_normalize(state_j_next, -1)
+                # Forget previous memories by normalization.
+                state_j_next = tf.nn.l2_normalize(state_j_next, -1, epsilon=1e-7) # TODO: Is epsilon necessary?
 
                 next_states.append(state_j_next)
             state_next = tf.concat(1, next_states)
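
For readers following the equations cited in the docstrings, here is a self-contained NumPy sketch (an illustration, not the committed TensorFlow code) of one block's update, walking Equations 2 through 5 in order:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def block_update(inputs, state_j, key_j, U, V, W,
                 phi=lambda x: np.maximum(x, 0.0)):
    """One step for block j. inputs (s_t) and state_j (h_j) are [batch, d];
    key_j (w_j) is [d]; U, V, W are the shared [d, d] parameters."""
    # Equation 2: g_j <- sigma(s_t^T h_j + s_t^T w_j), one scalar gate per example.
    gate_j = sigmoid(np.sum(inputs * state_j, axis=1) + np.sum(inputs * key_j, axis=1))
    # Equation 3: h_j~ <- phi(U h_j + V w_j + W s_t), the candidate memory.
    candidate_j = phi(state_j @ U + key_j @ V + inputs @ W)
    # Equation 4: h_j <- h_j + g_j * h_j~, a gated additive update.
    state_j_next = state_j + gate_j[:, None] * candidate_j
    # Equation 5: h_j <- h_j / ||h_j||; normalization acts as the forget mechanism.
    norm = np.linalg.norm(state_j_next, axis=1, keepdims=True)
    return state_j_next / np.maximum(norm, 1e-7)

rng = np.random.RandomState(0)
batch, d = 2, 4
s_t, h_j, w_j = rng.randn(batch, d), rng.randn(batch, d), rng.randn(d)
U, V, W = (0.1 * rng.randn(d, d) for _ in range(3))
print(block_update(s_t, h_j, w_j, U, V, W).shape)  # (2, 4)
```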
