This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit a55c4cf

Merge pull request #185 from rsepassi/push

v1.1.2

2 parents 62a0ee7 + 36766d8


93 files changed, +1890 -719 lines

README.md (-1)

@@ -86,7 +86,6 @@ mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR
 t2t-datagen \
   --data_dir=$DATA_DIR \
   --tmp_dir=$TMP_DIR \
-  --num_shards=100 \
   --problem=$PROBLEM
 
 # Train
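For context, the quick-start command in the README after this change would read roughly as follows (using the $DATA_DIR, $TMP_DIR and $PROBLEM variables the README already defines):

    t2t-datagen \
      --data_dir=$DATA_DIR \
      --tmp_dir=$TMP_DIR \
      --problem=$PROBLEM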

setup.py (+4 -1)

@@ -5,13 +5,14 @@
 
 setup(
     name='tensor2tensor',
-    version='1.1.1',
+    version='1.1.2',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='[email protected]',
     url='http://github.com/tensorflow/tensor2tensor',
     license='Apache 2.0',
     packages=find_packages(),
+    package_data={'tensor2tensor.data_generators': ['test_data/*']},
     scripts=[
         'tensor2tensor/bin/t2t-trainer',
         'tensor2tensor/bin/t2t-datagen',
@@ -26,6 +27,8 @@
         'tensorflow': ['tensorflow>=1.2.0rc1'],
         'tensorflow_gpu': ['tensorflow-gpu>=1.2.0rc1'],
     },
+    tests_require=['nose'],
+    test_suite='nose.collector',
     classifiers=[
         'Development Status :: 4 - Beta',
         'Intended Audience :: Developers',
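The new tests_require and test_suite entries wire the package's tests into setuptools' nose integration. A minimal sketch of running them, assuming nose is installed in the environment:

    # Run the test suite through setuptools (delegates to nose.collector).
    python setup.py test

    # Or call nose directly on the package.
    nosetests tensor2tensor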

tensor2tensor/__init__.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/bin/t2t-datagen (+16 -4)

@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -62,10 +63,12 @@ flags.DEFINE_string("problem", "",
                     "The name of the problem to generate data for.")
 flags.DEFINE_string("exclude_problems", "",
                     "Comma-separates list of problems to exclude.")
-flags.DEFINE_integer("num_shards", 10, "How many shards to use.")
+flags.DEFINE_integer("num_shards", 0, "How many shards to use. Ignored for "
+                     "registered Problems.")
 flags.DEFINE_integer("max_cases", 0,
                      "Maximum number of cases to generate (unbounded if 0).")
 flags.DEFINE_integer("random_seed", 429459, "Random seed to use.")
+flags.DEFINE_integer("task_id", -1, "For distributed data generation.")
 flags.DEFINE_string("t2t_usr_dir", "",
                     "Path to a Python module that will be imported. The "
                     "__init__.py file should include the necessary imports. "
@@ -108,6 +111,10 @@ _SUPPORTED_PROBLEM_GENERATORS = {
        lambda: lm1b.generator(FLAGS.tmp_dir, True),
        lambda: lm1b.generator(FLAGS.tmp_dir, False)
     ),
+    "lm1b_characters": (
+        lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True),
+        lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True)
+    ),
     "wiki_32k": (
         lambda: wiki.generator(FLAGS.tmp_dir, True),
         1000
@@ -246,7 +253,7 @@ def generate_data_for_problem(problem):
   if isinstance(dev_gen, int):
     # The dev set and test sets are generated as extra shards using the
     # training generator. The integer specifies the number of training
-    # shards. FLAGS.num_shards is ignored.
+    # shards. FLAGS.num_shards is ignored.
     num_training_shards = dev_gen
     tf.logging.info("Generating data for %s.", problem)
     all_output_files = generator_utils.combined_data_filenames(
@@ -257,10 +264,11 @@ def generate_data_for_problem(problem):
   else:
     # usual case - train data and dev data are generated using separate
     # generators.
+    num_shards = FLAGS.num_shards or 10
     tf.logging.info("Generating training data for %s.", problem)
     train_output_files = generator_utils.train_data_filenames(
         problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
-        FLAGS.num_shards)
+        num_shards)
     generator_utils.generate_files(training_gen(), train_output_files,
                                    FLAGS.max_cases)
     tf.logging.info("Generating development data for %s.", problem)
@@ -275,10 +283,14 @@ def generate_data_for_problem(problem):
 
 
 def generate_data_for_registered_problem(problem_name):
+  tf.logging.info("Generating training data for %s.", problem_name)
+  if FLAGS.num_shards:
+    raise ValueError("--num_shards should not be set for registered Problem.")
   problem = registry.problem(problem_name)
+  task_id = None if FLAGS.task_id < 0 else FLAGS.task_id
   problem.generate_data(os.path.expanduser(FLAGS.data_dir),
                         os.path.expanduser(FLAGS.tmp_dir),
-                        FLAGS.num_shards)
+                        task_id=task_id)
 
 
 if __name__ == "__main__":
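A hedged reading of the reworked flags, based only on the diff above: for registered Problems the shard count now comes from the Problem itself, so --num_shards must be left unset (otherwise a ValueError is raised), while --task_id lets each worker of a distributed job generate only its own piece. A sketch of such a worker invocation, with the worker index purely illustrative:

    # Hypothetical worker 0 of a distributed data-generation job;
    # --num_shards is deliberately left unset for a registered Problem.
    t2t-datagen \
      --data_dir=$DATA_DIR \
      --tmp_dir=$TMP_DIR \
      --problem=$PROBLEM \
      --task_id=0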

tensor2tensor/bin/t2t-make-tf-configs (+1)

@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/bin/t2t-trainer (+1)

@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/__init__.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/algorithmic.py (+5 -7)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -65,10 +66,7 @@ def dev_size(self):
   def num_shards(self):
     return 10
 
-  def generate_data(self, data_dir, _, num_shards=None):
-    if num_shards is None:
-      num_shards = self.num_shards
-
+  def generate_data(self, data_dir, _, task_id=-1):
     def generator_eos(generator):
       """Shift by NUM_RESERVED_IDS and append EOS token."""
       for case in generator:
@@ -86,7 +84,7 @@ def generator_eos(generator):
 
     utils.generate_dataset_and_shuffle(
         train_generator_eos(),
-        self.training_filepaths(data_dir, num_shards, shuffled=True),
+        self.training_filepaths(data_dir, self.num_shards, shuffled=True),
         dev_generator_eos(),
         self.dev_filepaths(data_dir, 1, shuffled=True),
         shuffle=False)
@@ -253,7 +251,7 @@ def zipf_distribution(nbr_symbols, alpha):
 
 
 def zipf_random_sample(distr_map, sample_len):
-  """Helper function: Generate a random Zipf sample of given lenght.
+  """Helper function: Generate a random Zipf sample of given length.
 
   Args:
     distr_map: list of float, Zipf's distribution over nbr_symbols.
@@ -286,7 +284,7 @@ def reverse_generator_nlplike(nbr_symbols,
     max_length: integer, maximum length of sequences to generate.
     nbr_cases: the number of cases to generate.
     scale_std_dev: float, Normal distribution's standard deviation scale factor
-      used to draw the lenght of sequence. Default = 1% of the max_length.
+      used to draw the length of sequence. Default = 1% of the max_length.
     alpha: float, Zipf's Law Distribution parameter. Default = 1.5.
       Usually for modelling natural text distribution is in
       the range [1.1-1.6].

tensor2tensor/data_generators/algorithmic_math.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/algorithmic_math_test.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/algorithmic_test.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/all_problems.py (+2 -1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +34,7 @@
 # pylint: disable=g-import-not-at-top
 try:
   # Requires h5py
-  from tensor2tensor.data_generators import genetics
+  from tensor2tensor.data_generators import gene_expression
 except ImportError:
   pass
 # pylint: enable=g-import-not-at-top

tensor2tensor/data_generators/audio.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/audio_test.py (+1)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

tensor2tensor/data_generators/concatenate_examples.py (+11 -11)

@@ -1,3 +1,4 @@
+# coding=utf-8
 # Copyright 2017 The Tensor2Tensor Authors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -33,7 +34,7 @@
   + subtokenizer.encode("target French Je t'aime.") + [1])
 }
 
-We add a dummy feature "inputs"=[0] for compatability with seq-to-seq models.
+We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models.
 
 If FLAGS.combine_to_length is nonzero, then we combine multiple examples into
 examples of a constant length, possibly with some padding at the end.
@@ -52,34 +53,33 @@
 from tensor2tensor.data_generators import text_encoder
 import tensorflow as tf
 
-tf.app.flags.DEFINE_string("vocab_file", "",
-                           "SubwordTextEncoder vocabulary file")
+tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file")
 
-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "random_reverse", False,
     "If true, write half of the example with source/target reversed")
 
-tf.app.flags.DEFINE_boolean(
+tf.flags.DEFINE_boolean(
     "count_everything", False,
     "If true, assign positive weights to designators, source and target. "
     "If false, assign positive weights only to target.")
 
-tf.app.flags.DEFINE_string("source_domain_string", "English", "")
-tf.app.flags.DEFINE_string("target_domain_string", "French", "")
+tf.flags.DEFINE_string("source_domain_string", "English", "")
+tf.flags.DEFINE_string("target_domain_string", "French", "")
 
-tf.app.flags.DEFINE_integer(
+tf.flags.DEFINE_integer(
     "combine_to_length", 0,
     "If positive, concatenate examples to form examples with target length "
     " equal to this value. Targets are padded with subtoken id=0.")
 
-tf.app.flags.DEFINE_string("in_file", "", "input filename")
+tf.flags.DEFINE_string("in_file", "", "input filename")
 
-tf.app.flags.DEFINE_string(
+tf.flags.DEFINE_string(
     "out_prefix", "/usr/local/google/tmp/concat",
     "The output filename is equal to out_prefix plus "
     "the last 15 characters of in_file. (e.g. -00001-of-00100)")
 
-FLAGS = tf.app.flags.FLAGS
+FLAGS = tf.flags.FLAGS
 
 
 def _make_example(ids, weights, raw_num_bytes):
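For reference, a hypothetical invocation of this script using the flags defined above; the file paths and the combine_to_length value are placeholders, not values taken from the commit:

    # Concatenate examples into fixed-length records (paths are placeholders).
    python tensor2tensor/data_generators/concatenate_examples.py \
      --vocab_file=$DATA_DIR/my_vocab.subwords \
      --in_file=$DATA_DIR/my_data-00001-of-00100 \
      --out_prefix=/tmp/concat \
      --combine_to_length=256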
