diff --git a/tensor2tensor/data_generators/audio.py b/tensor2tensor/data_generators/audio.py index 96ed93d..9b387e5 100755 --- a/tensor2tensor/data_generators/audio.py +++ b/tensor2tensor/data_generators/audio.py @@ -49,7 +49,26 @@ def _get_timit(directory): for path in FLAGS.timit_paths.split(","): with tf.gfile.GFile(path) as f: with tarfile.open(fileobj=f, mode="r:gz") as timit_compressed: - timit_compressed.extractall(directory) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(timit_compressed, directory) def _collect_data(directory, input_ext, target_ext): diff --git a/tensor2tensor/data_generators/cnn_dailymail.py b/tensor2tensor/data_generators/cnn_dailymail.py index 79e7d01..194b72e 100755 --- a/tensor2tensor/data_generators/cnn_dailymail.py +++ b/tensor2tensor/data_generators/cnn_dailymail.py @@ -76,12 +76,50 @@ def _maybe_download_corpora(tmp_dir, dataset_split): cnn_file = generator_utils.maybe_download_from_drive( tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL) with tarfile.open(cnn_file, "r:gz") as cnn_tar: - cnn_tar.extractall(tmp_dir) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(cnn_tar, tmp_dir) if not tf.gfile.Exists(dailymail_finalpath): dailymail_file = generator_utils.maybe_download_from_drive( tmp_dir, dailymail_filename, _DAILYMAIL_STORIES_DRIVE_URL) with tarfile.open(dailymail_file, "r:gz") as dailymail_tar: - dailymail_tar.extractall(tmp_dir) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(dailymail_tar, tmp_dir) cnn_files = tf.gfile.Glob(cnn_finalpath + "*") dailymail_files = tf.gfile.Glob(dailymail_finalpath + "*") diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py index 2f02a47..19c9b02 100755 --- a/tensor2tensor/data_generators/imdb.py +++ b/tensor2tensor/data_generators/imdb.py @@ -81,7 +81,26 @@ def generate_samples(self, data_dir, tmp_dir, dataset_split): imdb_dir = os.path.join(tmp_dir, "aclImdb") if not tf.gfile.Exists(imdb_dir): with tarfile.open(download_path, "r:gz") as tar: - tar.extractall(tmp_dir) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(tar, tmp_dir) # Generate examples train = dataset_split == problem.DatasetSplit.TRAIN diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index 67f0cc6..f003b5f 100755 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -99,7 +99,26 @@ def _maybe_download_corpus(tmp_dir): if not os.path.exists(corpus_filepath): generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url) with tarfile.open(corpus_filepath, "r:gz") as corpus_tar: - corpus_tar.extractall(tmp_dir) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(corpus_tar, tmp_dir) @registry.register_problem diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index c40c135..6fb9af1 100755 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -100,7 +100,26 @@ def _maybe_download_corpus(tmp_dir, vocab_type): ptb_files += [m.name] files += [m] - tgz.extractall(tmp_dir, members=files) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(tgz, tmp_dir, members=files) if vocab_type == text_problems.VocabType.CHARACTER: return ptb_char_files diff --git a/tensor2tensor/data_generators/translate_ende.py b/tensor2tensor/data_generators/translate_ende.py index 73b1c9d..f0d9798 100755 --- a/tensor2tensor/data_generators/translate_ende.py +++ b/tensor2tensor/data_generators/translate_ende.py @@ -62,7 +62,26 @@ def _get_wmt_ende_bpe_dataset(directory, filename): corpus_file = generator_utils.maybe_download_from_drive( directory, "wmt16_en_de.tar.gz", url) with tarfile.open(corpus_file, "r:gz") as corpus_tar: - corpus_tar.extractall(directory) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(corpus_tar, directory) return train_path diff --git a/tensor2tensor/data_generators/vqa.py b/tensor2tensor/data_generators/vqa.py index 27f025c..f08bc3a 100755 --- a/tensor2tensor/data_generators/vqa.py +++ b/tensor2tensor/data_generators/vqa.py @@ -46,7 +46,26 @@ def _get_vqa_v2_annotations(directory, annotation_file = generator_utils.maybe_download_from_drive( directory, annotation_filename, annotation_url) with tarfile.open(annotation_file, "r:gz") as annotation_tar: - annotation_tar.extractall(directory) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(annotation_tar, directory) def _get_vqa_v2_image_raw_dataset(directory, image_root_url, image_urls): @@ -66,7 +85,26 @@ def _get_vqa_v2_image_feature_dataset( feature_file = generator_utils.maybe_download_from_drive( directory, feature_filename, feature_url) with tarfile.open(feature_file, "r:gz") as feature_tar: - feature_tar.extractall(directory) + def is_within_directory(directory, target): + + abs_directory = os.path.abspath(directory) + abs_target = os.path.abspath(target) + + prefix = os.path.commonprefix([abs_directory, abs_target]) + + return prefix == abs_directory + + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): + + for member in tar.getmembers(): + member_path = os.path.join(path, member.name) + if not is_within_directory(path, member_path): + raise Exception("Attempted Path Traversal in Tar File") + + tar.extractall(path, members, numeric_owner=numeric_owner) + + + safe_extract(feature_tar, directory) class ImageQuestion2MultilabelProblem(image_utils.ImageProblem):