from tensor2tensor.data_generators import tokenizer
import tensorflow as tf

-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS

-_TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data"
+pkg_dir, _ = os.path.split(__file__)
+_TESTDATA = os.path.join(pkg_dir, "test_data")


class TokenizerTest(tf.test.TestCase):
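
Note on the hunk above: building `_TESTDATA` from `__file__` lets the tests find their fixtures without the google3-specific `FLAGS.test_srcdir` lookup, so they run from any working directory. A minimal sketch of the pattern (the `corpus-1.txt` fixture name is only for illustration):

import os

# Directory containing this test module.
pkg_dir, _ = os.path.split(__file__)
# test_data/ sits next to the module, so the join resolves correctly
# no matter where the test runner was started from.
_TESTDATA = os.path.join(pkg_dir, "test_data")

fixture = os.path.join(_TESTDATA, "corpus-1.txt")  # illustrative name
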
@@ -41,18 +42,13 @@ def test_encode(self):
    self.assertListEqual(
        [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
        tokenizer.encode(u"Dude - that's so cool."))
-    self.assertListEqual(
-        [u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
-        tokenizer.encode(u"Łukasz est né en 1981."))
-    self.assertListEqual(
-        [u" ", u"Spaces", u"at", u"the", u"ends", u" "],
-        tokenizer.encode(u" Spaces at the ends "))
-    self.assertListEqual(
-        [u"802", u".", u"11b"],
-        tokenizer.encode(u"802.11b"))
-    self.assertListEqual(
-        [u"two", u". \n ", u"lines"],
-        tokenizer.encode(u"two. \n lines"))
+    self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
+                         tokenizer.encode(u"Łukasz est né en 1981."))
+    self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
+                         tokenizer.encode(u" Spaces at the ends "))
+    self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
+    self.assertListEqual([u"two", u". \n ", u"lines"],
+                         tokenizer.encode(u"two. \n lines"))

  def test_decode(self):
    self.assertEqual(
@@ -62,19 +58,16 @@ def test_decode(self):

  def test_invertibility_on_random_strings(self):
    for _ in xrange(1000):
-      s = u"".join(
-          six.unichr(random.randint(0, 65535)) for _ in xrange(10))
+      s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10))
      self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))


class TestTokenCounts(tf.test.TestCase):

  def setUp(self):
    super(TestTokenCounts, self).setUp()
-    self.corpus_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "corpus-*.txt")
-    self.vocab_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "vocab-*.txt")
+    self.corpus_path = os.path.join(_TESTDATA, "corpus-*.txt")
+    self.vocab_path = os.path.join(_TESTDATA, "vocab-*.txt")

  def test_corpus_token_counts_split_on_newlines(self):
    token_counts = tokenizer.corpus_token_counts(
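
The `setUp` in the hunk above now stores glob patterns under `test_data` rather than files under `FLAGS.test_srcdir`, and those patterns are passed straight to the tokenizer helpers. A quick sketch of what such a pattern matches (the numbered fixture names are an assumption based on the `corpus-*.txt` glob and the vocab-1/vocab-2 comment later in the diff):

import glob
import os

pkg_dir = os.path.dirname(os.path.abspath(__file__))
corpus_pattern = os.path.join(pkg_dir, "test_data", "corpus-*.txt")

# Expanding the glob by hand just to show which fixtures a run touches;
# the tests hand the pattern itself to corpus_token_counts().
for path in sorted(glob.glob(corpus_pattern)):
  print(os.path.basename(path))  # e.g. corpus-1.txt (assumed name)
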
@@ -117,31 +110,33 @@ def test_corpus_token_counts_no_split_with_max_lines(self):

    self.assertIn(u"slept", token_counts)
    self.assertNotIn(u"Mitch", token_counts)
-    self.assertDictContainsSubset(
-        {u".\n\n": 1, u"\n": 2, u".\n": 1}, token_counts)
+    self.assertDictContainsSubset({
+        u".\n\n": 1,
+        u"\n": 2,
+        u".\n": 1
+    }, token_counts)

  def test_vocab_token_counts(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 0)
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0)

    expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
-        "jiggery-pokery": 14,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
+        u"jiggery-pokery": 14,
    }
    self.assertDictEqual(expected, token_counts)

  def test_vocab_token_counts_with_max_lines(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 4)
+    # vocab-1 has 2 lines, vocab-2 has 3
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4)

    expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
    }
    self.assertDictEqual(expected, token_counts)
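
On the `max_lines=4` expectation: the added comment notes that vocab-1 has two lines and vocab-2 has three, and the expected dict keeps the first four entries while dropping `jiggery-pokery`, which suggests the cap applies to the total line count across every file the glob matches, not per file. A rough sketch of that reading behavior (the `capped_lines` helper is an assumption for illustration, not `vocab_token_counts`' actual internals):

import glob

def capped_lines(pattern, max_lines):
  # Yield at most max_lines lines in total across all files matching
  # the pattern; max_lines=0 means no cap, mirroring the 0 passed in
  # test_vocab_token_counts.
  read = 0
  for path in sorted(glob.glob(pattern)):
    with open(path) as vocab_file:
      for line in vocab_file:
        if max_lines and read >= max_lines:
          return
        yield line
        read += 1
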