|
# coding=utf-8
# Copyright 2017 The Tensor2Tensor Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Data generators for translation data-sets."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import tarfile

# Dependency imports

from tensor2tensor.data_generators import generator_utils
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import wsj_parsing
from tensor2tensor.utils import registry

import tensorflow as tf

FLAGS = tf.flags.FLAGS

class TranslateProblem(problem.Text2TextProblem):
  """Base class for translation problems."""

  @property
  def is_character_level(self):
    return False

  @property
  def num_shards(self):
    return 100

  @property
  def use_subword_tokenizer(self):
    return True

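# A minimal sketch of how a concrete translation problem might build on the
# base class above by overriding the defaults it defines. The class name and
# the overridden values are hypothetical illustrations, not part of this
# module; real problems also supply a data generator and vocabulary settings.
#
#   class TranslateExampleCharacters(TranslateProblem):
#
#     @property
#     def is_character_level(self):
#       return True
#
#     @property
#     def use_subword_tokenizer(self):
#       return False
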

# Generic generators used later for multiple problems.


def character_generator(source_path, target_path, character_vocab, eos=None):
  """Generator for sequence-to-sequence tasks that just uses characters.

  This generator assumes the files at source_path and target_path have
  the same number of lines and yields dictionaries of "inputs" and "targets"
  where inputs are characters from the source lines converted to integers,
  and targets are characters from the target lines, also converted to integers.

  Args:
    source_path: path to the file with source sentences.
    target_path: path to the file with target sentences.
    character_vocab: a TextEncoder to encode the characters.
    eos: integer to append at the end of each sequence (default: None).

  Yields:
    A dictionary {"inputs": source-line, "targets": target-line} where
    the lines are integer lists converted from characters in the file lines.
  """
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      while source and target:
        source_ints = character_vocab.encode(source.strip()) + eos_list
        target_ints = character_vocab.encode(target.strip()) + eos_list
        yield {"inputs": source_ints, "targets": target_ints}
        source, target = source_file.readline(), target_file.readline()

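# Illustrative usage of character_generator. This is a sketch only: the file
# names below are placeholders, and ByteTextEncoder / EOS_ID are assumed to
# come from text_encoder rather than being defined in this module.
#
#   char_vocab = text_encoder.ByteTextEncoder()
#   examples = character_generator("train.en", "train.de", char_vocab,
#                                  eos=text_encoder.EOS_ID)
#   for example in examples:
#     # example is {"inputs": [...character ids...],
#     #             "targets": [...character ids...]}
#     pass
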
def tabbed_generator(source_path, source_vocab, target_vocab, eos=None):
  r"""Generator for sequence-to-sequence tasks using tabbed files.

  Tokens are derived from text files where each line contains both
  a source and a target string. The two strings are separated by a tab
  character ('\t'). It yields dictionaries of "inputs" and "targets" where
  inputs are the source strings encoded to integers with source_vocab, and
  targets are the target strings encoded to integers with target_vocab.

  Args:
    source_path: path to the file with source and target sentences.
    source_vocab: a SubwordTextEncoder to encode the source string.
    target_vocab: a SubwordTextEncoder to encode the target string.
    eos: integer to append at the end of each sequence (default: None).

  Yields:
    A dictionary {"inputs": source-line, "targets": target-line} where
    the lines are integer lists produced by the vocab encoders.
  """
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    for line in source_file:
      if line and "\t" in line:
        parts = line.split("\t", 1)
        source, target = parts[0].strip(), parts[1].strip()
        source_ints = source_vocab.encode(source) + eos_list
        target_ints = target_vocab.encode(target) + eos_list
        yield {"inputs": source_ints, "targets": target_ints}

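# Illustrative usage of tabbed_generator. A sketch only: "corpus.tsv" and the
# vocab file paths are placeholders. Each line of the input file is expected
# to look like "source sentence<TAB>target sentence".
#
#   source_vocab = text_encoder.SubwordTextEncoder("vocab.src")  # assumed path
#   target_vocab = text_encoder.SubwordTextEncoder("vocab.trg")  # assumed path
#   for example in tabbed_generator("corpus.tsv", source_vocab, target_vocab,
#                                   eos=text_encoder.EOS_ID):
#     pass  # {"inputs": ..., "targets": ...} as above.
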
def token_generator(source_path, target_path, token_vocab, eos=None):
  """Generator for sequence-to-sequence tasks that uses tokens.

  This generator assumes the files at source_path and target_path have
  the same number of lines and yields dictionaries of "inputs" and "targets"
  where inputs are token ids from the " "-split source (and target, resp.)
  lines converted to integers using token_vocab.

  Args:
    source_path: path to the file with source sentences.
    target_path: path to the file with target sentences.
    token_vocab: text_encoder.TextEncoder object.
    eos: integer to append at the end of each sequence (default: None).

  Yields:
    A dictionary {"inputs": source-line, "targets": target-line} where
    the lines are integer lists converted from tokens in the file lines.
  """
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      while source and target:
        source_ints = token_vocab.encode(source.strip()) + eos_list
        target_ints = token_vocab.encode(target.strip()) + eos_list
        yield {"inputs": source_ints, "targets": target_ints}
        source, target = source_file.readline(), target_file.readline()

def bi_vocabs_token_generator(source_path,
                              target_path,
                              source_token_vocab,
                              target_token_vocab,
                              eos=None):
  """Generator for sequence-to-sequence tasks that uses two token vocabs.

  This generator assumes the files at source_path and target_path have
  the same number of lines and yields dictionaries of "inputs" and "targets"
  where inputs are token ids from the " "-split source (and target, resp.)
  lines converted to integers using source_token_vocab and target_token_vocab.

  Args:
    source_path: path to the file with source sentences.
    target_path: path to the file with target sentences.
    source_token_vocab: text_encoder.TextEncoder object.
    target_token_vocab: text_encoder.TextEncoder object.
    eos: integer to append at the end of each sequence (default: None).

  Yields:
    A dictionary {"inputs": source-line, "targets": target-line} where
    the lines are integer lists converted from tokens in the file lines.
  """
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      while source and target:
        source_ints = source_token_vocab.encode(source.strip()) + eos_list
        target_ints = target_token_vocab.encode(target.strip()) + eos_list
        yield {"inputs": source_ints, "targets": target_ints}
        source, target = source_file.readline(), target_file.readline()

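# Illustrative usage of bi_vocabs_token_generator, for the case where source
# and target languages keep separate vocabularies. A sketch only: the file and
# vocab paths below are placeholders.
#
#   src_vocab = text_encoder.SubwordTextEncoder("vocab.en")  # assumed path
#   trg_vocab = text_encoder.SubwordTextEncoder("vocab.zh")  # assumed path
#   for example in bi_vocabs_token_generator("train.en", "train.zh",
#                                            src_vocab, trg_vocab,
#                                            eos=text_encoder.EOS_ID):
#     pass
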
def _preprocess_sgm(line, is_sgm):
  """Preprocessing to strip tags in SGM files."""
  if not is_sgm:
    return line
  # In SGM files, remove <srcset ...>, <p>, <doc ...> lines.
  if line.startswith("<srcset") or line.startswith("</srcset"):
    return ""
  if line.startswith("<doc") or line.startswith("</doc"):
    return ""
  if line.startswith("<p>") or line.startswith("</p>"):
    return ""
  # Strip <seg> tags.
  line = line.strip()
  if line.startswith("<seg") and line.endswith("</seg>"):
    i = line.index(">")
    return line[i + 1:-6]  # Strip first <seg ...> and last </seg>.
  # Fall back to the stripped line so callers never receive None.
  return line

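# Illustrative behavior of _preprocess_sgm (the inputs below are made-up
# examples, not lines from a real corpus):
#
#   _preprocess_sgm("plain text", is_sgm=False)               -> "plain text"
#   _preprocess_sgm("<doc docid=1>", is_sgm=True)             -> ""
#   _preprocess_sgm('<seg id="1">Hello.</seg>', is_sgm=True)  -> "Hello."
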
def _compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(filename + ".lang1", mode="w") as lang1_resfile:
    with tf.gfile.GFile(filename + ".lang2", mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)

        generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              # Drop the ".gz" suffix; str.strip(".gz") would also remove
              # leading/trailing 'g', 'z' and '.' characters from the name.
              new_filename = tsv_filename[:-3]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.GFile(tsv_filename, mode="r") as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  lang1_resfile.write(source.strip() + "\n")
                  lang2_resfile.write(target.strip() + "\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))

          if not (os.path.exists(lang1_filepath) and
                  os.path.exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-3]  # Drop the ".gz" suffix.
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-3]  # Drop the ".gz" suffix.
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath
          with tf.gfile.GFile(lang1_filepath, mode="r") as lang1_file:
            with tf.gfile.GFile(lang2_filepath, mode="r") as lang2_file:
              line1, line2 = lang1_file.readline(), lang2_file.readline()
              while line1 or line2:
                line1res = _preprocess_sgm(line1, is_sgm)
                line2res = _preprocess_sgm(line2, is_sgm)
                if line1res or line2res:
                  lang1_resfile.write(line1res.strip() + "\n")
                  lang2_resfile.write(line2res.strip() + "\n")
                line1, line2 = lang1_file.readline(), lang2_file.readline()

  return filename

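# A note on the `datasets` structure consumed by _compile_data, as inferred
# from the code above: each entry is a pair [url, file_spec], where file_spec
# is either ("tsv", source_column, target_column, glob_pattern) for
# tab-separated data, or (lang1_filename, lang2_filename) for parallel text
# files inside the archive downloaded from `url`. A hypothetical example (the
# URL and file names are placeholders, not a real dataset):
#
#   _EXAMPLE_DATASETS = [[
#       "http://example.com/training-parallel.tgz",
#       ("training/corpus.en", "training/corpus.de"),
#   ]]
#   # _compile_data(tmp_dir, _EXAMPLE_DATASETS, "translate_example") would
#   # then write "translate_example.lang1" and "translate_example.lang2"
#   # in tmp_dir.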