
Commit 36766d8
Author: Ryan Sepassi
internal-external fixes and enable tests
PiperOrigin-RevId: 163370562
Parent: 175a125

File tree: 7 files changed, +74 −81 lines

setup.py

+3 −0

@@ -12,6 +12,7 @@
     url='http://github.com/tensorflow/tensor2tensor',
     license='Apache 2.0',
     packages=find_packages(),
+    package_data={'tensor2tensor.data_generators': ['test_data/*']},
     scripts=[
         'tensor2tensor/bin/t2t-trainer',
         'tensor2tensor/bin/t2t-datagen',
@@ -26,6 +27,8 @@
         'tensorflow': ['tensorflow>=1.2.0rc1'],
         'tensorflow_gpu': ['tensorflow-gpu>=1.2.0rc1'],
     },
+    tests_require=['nose'],
+    test_suite='nose.collector',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
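
With tests_require and test_suite wired up, setuptools can drive the suite directly, and the package_data entry ships the test fixtures alongside the installed package. A minimal usage sketch (assumes nose is installable in the target environment):

    # Installs nose if needed, then discovers and runs the package's tests.
    python setup.py test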

tensor2tensor/data_generators/concatenate_examples.py

+10 −11

@@ -34,7 +34,7 @@
            + subtokenizer.encode("target French Je t'aime.") + [1])
   }

-  We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models.
+  We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.

   If FLAGS.combine_to_length is nonzero, then we combine multiple examples into
   examples of a constant length, possibly with some padding at the end.
@@ -53,34 +53,33 @@
 from tensor2tensor.data_generators import text_encoder
 import tensorflow as tf

-tf.app.flags.DEFINE_string("vocab_file", "",
-                           "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file")

-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "random_reverse", False,
     "If true, write half of the example with source/target reversed")

-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "count_everything", False,
     "If true, assign positive weights to designators, source and target. "
     "If false, assign positive weights only to target.")

-tf.app.flags.DEFINE_string("source_domain_string", "English", "")
-tf.app.flags.DEFINE_string("target_domain_string", "French", "")
+tf.flags.DEFINE_string("source_domain_string", "English", "")
+tf.flags.DEFINE_string("target_domain_string", "French", "")

-tf.app.flags.DEFINE_integer(
+tf.flags.DEFINE_integer(
     "combine_to_length", 0,
     "If positive, concatenate examples to form examples with target length "
     " equal to this value. Targets are padded with subtoken id=0.")

-tf.app.flags.DEFINE_string("in_file", "", "input filename")
+tf.flags.DEFINE_string("in_file", "", "input filename")

-tf.app.flags.DEFINE_string(
+tf.flags.DEFINE_string(
     "out_prefix", "/usr/local/google/tmp/concat",
     "The output filename is equal to out_prefix plus "
     "the last 15 characters of in_file. (e.g. -00001-of-00100)")

-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS


 def _make_example(ids, weights, raw_num_bytes):
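
The tf.app.flags to tf.flags rename is mechanical: in TensorFlow 1.x both names refer to the same flags module. A minimal sketch of the pattern, reusing the vocab_file flag from above:

    import tensorflow as tf

    # tf.flags and tf.app.flags alias the same module in TF 1.x.
    tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file")
    FLAGS = tf.flags.FLAGS


    def main(_):
      # Flags are parsed by tf.app.run() before main is invoked.
      print(FLAGS.vocab_file)


    if __name__ == "__main__":
      tf.app.run()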

tensor2tensor/data_generators/inspect.py

+10 −14

@@ -32,19 +32,16 @@

 import tensorflow as tf

-tf.app.flags.DEFINE_string("subword_text_encoder_filename", "",
-                           "SubwordTextEncoder vocabulary file")
-tf.app.flags.DEFINE_string("token_text_encoder_filename", "",
-                           "TokenTextEncoder vocabulary file")
-tf.app.flags.DEFINE_bool("byte_text_encoder", False,
-                         "use a ByteTextEncoder")
-tf.app.flags.DEFINE_string("input_filename", "", "input filename")
-tf.app.flags.DEFINE_bool("print_inputs", False,
-                         "Print decoded inputs to stdout")
-tf.app.flags.DEFINE_bool("print_targets", False,
-                         "Print decoded targets to stdout")
+tf.flags.DEFINE_string("subword_text_encoder_filename", "",
+                       "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("token_text_encoder_filename", "",
+                       "TokenTextEncoder vocabulary file")
+tf.flags.DEFINE_bool("byte_text_encoder", False, "use a ByteTextEncoder")
+tf.flags.DEFINE_string("input_filename", "", "input filename")
+tf.flags.DEFINE_bool("print_inputs", False, "Print decoded inputs to stdout")
+tf.flags.DEFINE_bool("print_targets", False, "Print decoded targets to stdout")

-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS


 def main(_):
@@ -53,8 +50,7 @@ def main(_):
     encoder = text_encoder.SubwordTextEncoder(
         FLAGS.subword_text_encoder_filename)
   elif FLAGS.token_text_encoder_filename:
-    encoder = text_encoder.TokenTextEncoder(
-        FLAGS.token_text_encoder_filename)
+    encoder = text_encoder.TokenTextEncoder(FLAGS.token_text_encoder_filename)
   elif FLAGS.byte_text_encoder:
     encoder = text_encoder.ByteTextEncoder()
   else:

tensor2tensor/data_generators/text_encoder_build_subword.py

+18 −18

@@ -39,19 +39,18 @@

 import tensorflow as tf

-tf.app.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
-                           'where to store the SubwordTextEncoder')
-tf.app.flags.DEFINE_string('corpus_filepattern', '',
-                           'Corpus of one or more text files')
-tf.app.flags.DEFINE_string('vocab_filepattern', '',
-                           'One or more vocabulary files '
-                           '(one word per line as "word,count")')
-tf.app.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus')
-tf.app.flags.DEFINE_integer('corpus_max_lines', 10000,
-                            'How many lines of corpus to read')
-tf.app.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations')
-tf.app.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.')
-FLAGS = tf.app.flags.FLAGS
+tf.flags.DEFINE_string('output_filename', '/tmp/my.subword_text_encoder',
+                       'where to store the SubwordTextEncoder')
+tf.flags.DEFINE_string('corpus_filepattern', '',
+                       'Corpus of one or more text files')
+tf.flags.DEFINE_string('vocab_filepattern', '', 'One or more vocabulary files '
+                       '(one word per line as "word,count")')
+tf.flags.DEFINE_integer('min_count', 5, 'Minimum subtoken count in corpus')
+tf.flags.DEFINE_integer('corpus_max_lines', 10000,
+                        'How many lines of corpus to read')
+tf.flags.DEFINE_integer('num_iterations', 4, 'Number of iterations')
+tf.flags.DEFINE_bool('split_on_newlines', True, 'Break corpus into lines.')
+FLAGS = tf.flags.FLAGS


 def main(unused_argv):
@@ -61,20 +60,21 @@ def main(unused_argv):

   elif FLAGS.corpus_filepattern:
     token_counts = tokenizer.corpus_token_counts(
-        FLAGS.corpus_filepattern, FLAGS.corpus_max_lines,
+        FLAGS.corpus_filepattern,
+        FLAGS.corpus_max_lines,
         split_on_newlines=FLAGS.split_on_newlines)

   elif FLAGS.vocab_filepattern:
-    token_counts = tokenizer.vocab_token_counts(
-        FLAGS.vocab_filepattern, FLAGS.corpus_max_lines)
+    token_counts = tokenizer.vocab_token_counts(FLAGS.vocab_filepattern,
+                                                FLAGS.corpus_max_lines)

   else:
     raise ValueError(
         'Must provide one of --corpus_filepattern or --vocab_filepattern')

   encoder = text_encoder.SubwordTextEncoder()
-  encoder.build_from_token_counts(
-      token_counts, FLAGS.min_count, FLAGS.num_iterations)
+  encoder.build_from_token_counts(token_counts, FLAGS.min_count,
+                                  FLAGS.num_iterations)
   encoder.store_to_file(FLAGS.output_fn)

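End to end, the script reduces to the calls below; a sketch using the flag defaults and a hypothetical corpus path, assuming the tensor2tensor API as it stands at this commit:

    from tensor2tensor.data_generators import text_encoder
    from tensor2tensor.data_generators import tokenizer

    # Count tokens across the corpus (path is hypothetical), reading at most
    # 10000 lines and treating each line as a separate text.
    token_counts = tokenizer.corpus_token_counts(
        '/tmp/corpus-*.txt', 10000, split_on_newlines=True)

    # Build the subword vocabulary with min_count=5 over 4 refinement
    # iterations, then serialize it.
    encoder = text_encoder.SubwordTextEncoder()
    encoder.build_from_token_counts(token_counts, 5, 4)
    encoder.store_to_file('/tmp/my.subword_text_encoder')
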
tensor2tensor/data_generators/tokenizer.py

+1 −1

@@ -121,7 +121,7 @@ def _read_filepattern(filepattern, max_lines=None, split_on_newlines=True):
     The contents of the files as lines, if split_on_newlines is True, or
     the entire contents of each file if False.
   """
-  filenames = tf.gfile.Glob(filepattern)
+  filenames = sorted(tf.gfile.Glob(filepattern))
   lines_read = 0
   for filename in filenames:
     with tf.gfile.Open(filename) as f:
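
tf.gfile.Glob makes no ordering guarantee, so sorting the match list makes _read_filepattern deterministic; that matters once max_lines can stop reading partway through the file list. A sketch of the difference (the pattern is hypothetical):

    import tensorflow as tf

    # Unsorted, one filesystem may yield ["corpus-2.txt", "corpus-1.txt"] and
    # another the reverse; with a max_lines cap, different files get read.
    filenames = sorted(tf.gfile.Glob("/tmp/corpus-*.txt"))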

tensor2tensor/data_generators/tokenizer_test.py

+30 −35

@@ -30,9 +30,10 @@
 from tensor2tensor.data_generators import tokenizer
 import tensorflow as tf

-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS

-_TESTDATA = "google3/third_party/py/tensor2tensor/data_generators/test_data"
+pkg_dir, _ = os.path.split(__file__)
+_TESTDATA = os.path.join(pkg_dir, "test_data")


 class TokenizerTest(tf.test.TestCase):
@@ -41,18 +42,13 @@ def test_encode(self):
     self.assertListEqual(
         [u"Dude", u" - ", u"that", u"'", u"s", u"so", u"cool", u"."],
         tokenizer.encode(u"Dude - that's so cool."))
-    self.assertListEqual(
-        [u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
-        tokenizer.encode(u"Łukasz est né en 1981."))
-    self.assertListEqual(
-        [u" ", u"Spaces", u"at", u"the", u"ends", u" "],
-        tokenizer.encode(u" Spaces at the ends "))
-    self.assertListEqual(
-        [u"802", u".", u"11b"],
-        tokenizer.encode(u"802.11b"))
-    self.assertListEqual(
-        [u"two", u". \n", u"lines"],
-        tokenizer.encode(u"two. \nlines"))
+    self.assertListEqual([u"Łukasz", u"est", u"né", u"en", u"1981", u"."],
+                         tokenizer.encode(u"Łukasz est né en 1981."))
+    self.assertListEqual([u" ", u"Spaces", u"at", u"the", u"ends", u" "],
+                         tokenizer.encode(u" Spaces at the ends "))
+    self.assertListEqual([u"802", u".", u"11b"], tokenizer.encode(u"802.11b"))
+    self.assertListEqual([u"two", u". \n", u"lines"],
+                         tokenizer.encode(u"two. \nlines"))

   def test_decode(self):
     self.assertEqual(
@@ -62,19 +58,16 @@ def test_decode(self):

   def test_invertibility_on_random_strings(self):
     for _ in xrange(1000):
-      s = u"".join(
-          six.unichr(random.randint(0, 65535)) for _ in xrange(10))
+      s = u"".join(six.unichr(random.randint(0, 65535)) for _ in xrange(10))
       self.assertEqual(s, tokenizer.decode(tokenizer.encode(s)))


 class TestTokenCounts(tf.test.TestCase):

   def setUp(self):
     super(TestTokenCounts, self).setUp()
-    self.corpus_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "corpus-*.txt")
-    self.vocab_path = os.path.join(
-        FLAGS.test_srcdir, _TESTDATA, "vocab-*.txt")
+    self.corpus_path = os.path.join(_TESTDATA, "corpus-*.txt")
+    self.vocab_path = os.path.join(_TESTDATA, "vocab-*.txt")

   def test_corpus_token_counts_split_on_newlines(self):
     token_counts = tokenizer.corpus_token_counts(
@@ -117,31 +110,33 @@ def test_corpus_token_counts_no_split_with_max_lines(self):

     self.assertIn(u"slept", token_counts)
     self.assertNotIn(u"Mitch", token_counts)
-    self.assertDictContainsSubset(
-        {u".\n\n": 1, u"\n": 2, u".\n": 1}, token_counts)
+    self.assertDictContainsSubset({
+        u".\n\n": 1,
+        u"\n": 2,
+        u".\n": 1
+    }, token_counts)

   def test_vocab_token_counts(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 0)
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 0)

     expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
-        "jiggery-pokery": 14,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
+        u"jiggery-pokery": 14,
     }
     self.assertDictEqual(expected, token_counts)

   def test_vocab_token_counts_with_max_lines(self):
-    token_counts = tokenizer.vocab_token_counts(
-        self.vocab_path, 4)
+    # vocab-1 has 2 lines, vocab-2 has 3
+    token_counts = tokenizer.vocab_token_counts(self.vocab_path, 4)

     expected = {
-        "lollipop": 8,
-        "reverberated": 12,
-        "kattywampus": 11,
-        "balderdash": 10,
+        u"lollipop": 8,
+        u"reverberated": 12,
+        u"kattywampus": 11,
+        u"balderdash": 10,
     }
     self.assertDictEqual(expected, token_counts)
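
Resolving _TESTDATA relative to __file__, rather than through a google3 source-tree path and FLAGS.test_srcdir, is what lets these tests run from an installed copy of the package, in tandem with the package_data entry added to setup.py. A sketch of the pattern:

    import os

    # Locate fixtures next to this module, wherever the package is installed.
    pkg_dir, _ = os.path.split(__file__)
    _TESTDATA = os.path.join(pkg_dir, "test_data")
    corpus_path = os.path.join(_TESTDATA, "corpus-*.txt")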

tensor2tensor/utils/trainer_utils.py

+2 −2

@@ -167,7 +167,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps,
       model_name=model_name)
   eval_metrics = metrics.create_evaluation_metrics(
       zip(FLAGS.problems.split("-"), hparams.problem_instances))
-  if ("autotune" in FLAGS and FLAGS.autotune and
+  if (hasattr(FLAGS, "autotune") and FLAGS.autotune and
       FLAGS.objective not in eval_metrics):
     raise ValueError("Tuning objective %s not among evaluation metrics %s" %
                      (FLAGS.objective, eval_metrics.keys()))
@@ -572,7 +572,7 @@ def nth_model(n):
     # Define the train_op for the TRAIN mode.
     opt = _ConditionalOptimizer(hparams.optimizer, learning_rate, hparams)
     tf.logging.info("Computing gradients for global model_fn.")
-    opt_summaries = ["learning_rate", "loss", "global_gradient_norm"]
+    opt_summaries = ["learning_rate", "loss"]
     if hparams.summarize_grads:
       opt_summaries.extend(["gradients", "gradient_norm"])
     train_op = tf.contrib.layers.optimize_loss(
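
hasattr is the portable probe for an optionally defined flag: attribute lookup behaves the same on the internal and open-source flag objects, whereas support for the `in` operator varies between them. A sketch:

    import tensorflow as tf

    FLAGS = tf.flags.FLAGS

    # hasattr degrades gracefully when the flag was never defined; `in`
    # support differs across flag implementations.
    autotune_on = hasattr(FLAGS, "autotune") and FLAGS.autotune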
