diff --git a/examples/speech_synthesis/README.md b/examples/speech_synthesis/README.md deleted file mode 100644 index a31e7f68bd..0000000000 --- a/examples/speech_synthesis/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Speech Synthesis (S^2) -=== -[https://arxiv.org/abs/2109.06912](https://arxiv.org/abs/2109.06912) - -Speech synthesis with fairseq. - -## Features - -- Autoregressive and non-autoregressive models -- Multi-speaker synthesis -- Audio preprocessing (denoising, VAD, etc.) for less curated data -- Automatic metrics for model development -- Similar data configuration as [S2T](../speech_to_text/README.md) - - -## Examples -- [Single-speaker synthesis on LJSpeech](docs/ljspeech_example.md) -- [Multi-speaker synthesis on VCTK](docs/vctk_example.md) -- [Multi-speaker synthesis on Common Voice](docs/common_voice_example.md) - - -## Citation -Please cite as: -``` -@article{wang2021fairseqs2, - title={fairseq S\^{} 2: A Scalable and Integrable Speech Synthesis Toolkit}, - author={Wang, Changhan and Hsu, Wei-Ning and Adi, Yossi and Polyak, Adam and Lee, Ann and Chen, Peng-Jen and Gu, Jiatao and Pino, Juan}, - journal={arXiv preprint arXiv:2109.06912}, - year={2021} -} - -@inproceedings{ott2019fairseq, - title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, - author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, - booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, - year = {2019}, -} -``` diff --git a/examples/speech_synthesis/__init__.py b/examples/speech_synthesis/__init__.py deleted file mode 100644 index 6264236915..0000000000 --- a/examples/speech_synthesis/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/examples/speech_synthesis/data_utils.py b/examples/speech_synthesis/data_utils.py deleted file mode 100644 index 3b2d079a9a..0000000000 --- a/examples/speech_synthesis/data_utils.py +++ /dev/null @@ -1,344 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import io -import os -from pathlib import Path -from typing import Optional, List, Dict -import zipfile -import tempfile -from dataclasses import dataclass -from itertools import groupby - -import torch -import torch.nn.functional as F -import numpy as np -from tqdm import tqdm - -from examples.speech_to_text.data_utils import load_tsv_to_dicts -from fairseq.data.audio.audio_utils import ( - TTSSpectrogram, TTSMelScale, parse_path, read_from_stored_zip, is_npy_data -) - - -def trim_or_pad_to_target_length( - data_1d_or_2d: np.ndarray, target_length: int -) -> np.ndarray: - assert len(data_1d_or_2d.shape) in {1, 2} - delta = data_1d_or_2d.shape[0] - target_length - if delta >= 0: # trim if being longer - data_1d_or_2d = data_1d_or_2d[: target_length] - else: # pad if being shorter - if len(data_1d_or_2d.shape) == 1: - data_1d_or_2d = np.concatenate( - [data_1d_or_2d, np.zeros(-delta)], axis=0 - ) - else: - data_1d_or_2d = np.concatenate( - [data_1d_or_2d, np.zeros((-delta, data_1d_or_2d.shape[1]))], - axis=0 - ) - return data_1d_or_2d - - -def extract_logmel_spectrogram( - waveform: torch.Tensor, sample_rate: int, - output_path: Optional[Path] = None, win_length: int = 1024, - hop_length: int = 256, n_fft: int = 1024, - win_fn: callable = torch.hann_window, n_mels: int = 80, - f_min: float = 0., f_max: float = 8000, eps: float = 1e-5, - overwrite: bool = False, target_length: Optional[int] = None -): - if output_path is not None and output_path.is_file() and not overwrite: - return - - spectrogram_transform = TTSSpectrogram( - n_fft=n_fft, win_length=win_length, hop_length=hop_length, - window_fn=win_fn - ) - mel_scale_transform = TTSMelScale( - n_mels=n_mels, sample_rate=sample_rate, f_min=f_min, f_max=f_max, - n_stft=n_fft // 2 + 1 - ) - spectrogram = spectrogram_transform(waveform) - mel_spec = mel_scale_transform(spectrogram) - logmel_spec = torch.clamp(mel_spec, min=eps).log() - assert len(logmel_spec.shape) == 3 and logmel_spec.shape[0] == 1 - logmel_spec = logmel_spec.squeeze().t() # D x T -> T x D - if target_length is not None: - logmel_spec = trim_or_pad_to_target_length(logmel_spec, target_length) - - if output_path is not None: - np.save(output_path.as_posix(), logmel_spec) - else: - return logmel_spec - - -def extract_pitch( - waveform: torch.Tensor, sample_rate: int, - output_path: Optional[Path] = None, hop_length: int = 256, - log_scale: bool = True, phoneme_durations: Optional[List[int]] = None -): - if output_path is not None and output_path.is_file(): - return - - try: - import pyworld - except ImportError: - raise ImportError("Please install PyWORLD: pip install pyworld") - - _waveform = waveform.squeeze(0).double().numpy() - pitch, t = pyworld.dio( - _waveform, sample_rate, frame_period=hop_length / sample_rate * 1000 - ) - pitch = pyworld.stonemask(_waveform, pitch, t, sample_rate) - - if phoneme_durations is not None: - pitch = trim_or_pad_to_target_length(pitch, sum(phoneme_durations)) - try: - from scipy.interpolate import interp1d - except ImportError: - raise ImportError("Please install SciPy: pip install scipy") - nonzero_ids = np.where(pitch != 0)[0] - if len(nonzero_ids) == 0: - print((f"{output_path} has all empty values in the pitch contour")) - return - elif len(nonzero_ids) == 1: - print((f"{output_path} has only one non-zero values in the pitch contour")) - return - else: - interp_fn = interp1d( - nonzero_ids, - pitch[nonzero_ids], - fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]), - bounds_error=False, - ) - pitch = interp_fn(np.arange(0, 
len(pitch))) - d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations])) - pitch = np.array( - [ - np.mean(pitch[d_cumsum[i-1]: d_cumsum[i]]) - for i in range(1, len(d_cumsum)) - ] - ) - assert len(pitch) == len(phoneme_durations) - - if log_scale: - pitch = np.log(pitch + 1) - - if output_path is not None: - np.save(output_path.as_posix(), pitch) - else: - return pitch - - -def extract_energy( - waveform: torch.Tensor, output_path: Optional[Path] = None, - hop_length: int = 256, n_fft: int = 1024, log_scale: bool = True, - phoneme_durations: Optional[List[int]] = None -): - if output_path is not None and output_path.is_file(): - return - - assert len(waveform.shape) == 2 and waveform.shape[0] == 1 - waveform = waveform.view(1, 1, waveform.shape[1]) - waveform = F.pad( - waveform.unsqueeze(1), [n_fft // 2, n_fft // 2, 0, 0], - mode="reflect" - ) - waveform = waveform.squeeze(1) - - fourier_basis = np.fft.fft(np.eye(n_fft)) - cutoff = int((n_fft / 2 + 1)) - fourier_basis = np.vstack( - [np.real(fourier_basis[:cutoff, :]), - np.imag(fourier_basis[:cutoff, :])] - ) - - forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) - forward_transform = F.conv1d( - waveform, forward_basis, stride=hop_length, padding=0 - ) - - real_part = forward_transform[:, :cutoff, :] - imag_part = forward_transform[:, cutoff:, :] - magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) - energy = torch.norm(magnitude, dim=1).squeeze(0).numpy() - - if phoneme_durations is not None: - energy = trim_or_pad_to_target_length(energy, sum(phoneme_durations)) - d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations])) - energy = np.array( - [ - np.mean(energy[d_cumsum[i - 1]: d_cumsum[i]]) - for i in range(1, len(d_cumsum)) - ] - ) - assert len(energy) == len(phoneme_durations) - - if log_scale: - energy = np.log(energy + 1) - - if output_path is not None: - np.save(output_path.as_posix(), energy) - else: - return energy - - -def get_global_cmvn(feature_root: Path, output_path: Optional[Path] = None): - mean_x, mean_x2, n_frames = None, None, 0 - feature_paths = feature_root.glob("*.npy") - for p in tqdm(feature_paths): - with open(p, 'rb') as f: - frames = np.load(f).squeeze() - - n_frames += frames.shape[0] - - cur_mean_x = frames.sum(axis=0) - if mean_x is None: - mean_x = cur_mean_x - else: - mean_x += cur_mean_x - - cur_mean_x2 = (frames ** 2).sum(axis=0) - if mean_x2 is None: - mean_x2 = cur_mean_x2 - else: - mean_x2 += cur_mean_x2 - - mean_x /= n_frames - mean_x2 /= n_frames - var_x = mean_x2 - mean_x ** 2 - std_x = np.sqrt(np.maximum(var_x, 1e-10)) - - if output_path is not None: - with open(output_path, 'wb') as f: - np.savez(f, mean=mean_x, std=std_x) - else: - return {"mean": mean_x, "std": std_x} - - -def ipa_phonemize(text, lang="en-us", use_g2p=False): - if use_g2p: - assert lang == "en-us", "g2pE phonemizer only works for en-us" - try: - from g2p_en import G2p - g2p = G2p() - return " ".join("|" if p == " " else p for p in g2p(text)) - except ImportError: - raise ImportError( - "Please install phonemizer: pip install g2p_en" - ) - else: - try: - from phonemizer import phonemize - from phonemizer.separator import Separator - return phonemize( - text, backend='espeak', language=lang, - separator=Separator(word="| ", phone=" ") - ) - except ImportError: - raise ImportError( - "Please install phonemizer: pip install phonemizer" - ) - - -@dataclass -class ForceAlignmentInfo(object): - tokens: List[str] - frame_durations: List[int] - start_sec: Optional[float] - end_sec: 
Optional[float] - - -def get_mfa_alignment_by_sample_id( - textgrid_zip_path: str, sample_id: str, sample_rate: int, - hop_length: int, silence_phones: List[str] = ("sil", "sp", "spn") -) -> ForceAlignmentInfo: - try: - import tgt - except ImportError: - raise ImportError("Please install TextGridTools: pip install tgt") - - filename = f"{sample_id}.TextGrid" - out_root = Path(tempfile.gettempdir()) - tgt_path = out_root / filename - with zipfile.ZipFile(textgrid_zip_path) as f_zip: - f_zip.extract(filename, path=out_root) - textgrid = tgt.io.read_textgrid(tgt_path.as_posix()) - os.remove(tgt_path) - - phones, frame_durations = [], [] - start_sec, end_sec, end_idx = 0, 0, 0 - for t in textgrid.get_tier_by_name("phones")._objects: - s, e, p = t.start_time, t.end_time, t.text - # Trim leading silences - if len(phones) == 0: - if p in silence_phones: - continue - else: - start_sec = s - phones.append(p) - if p not in silence_phones: - end_sec = e - end_idx = len(phones) - r = sample_rate / hop_length - frame_durations.append(int(np.round(e * r) - np.round(s * r))) - # Trim tailing silences - phones = phones[:end_idx] - frame_durations = frame_durations[:end_idx] - - return ForceAlignmentInfo( - tokens=phones, frame_durations=frame_durations, start_sec=start_sec, - end_sec=end_sec - ) - - -def get_mfa_alignment( - textgrid_zip_path: str, sample_ids: List[str], sample_rate: int, - hop_length: int -) -> Dict[str, ForceAlignmentInfo]: - return { - i: get_mfa_alignment_by_sample_id( - textgrid_zip_path, i, sample_rate, hop_length - ) for i in tqdm(sample_ids) - } - - -def get_unit_alignment( - id_to_unit_tsv_path: str, sample_ids: List[str] -) -> Dict[str, ForceAlignmentInfo]: - id_to_units = { - e["id"]: e["units"] for e in load_tsv_to_dicts(id_to_unit_tsv_path) - } - id_to_units = {i: id_to_units[i].split() for i in sample_ids} - id_to_units_collapsed = { - i: [uu for uu, _ in groupby(u)] for i, u in id_to_units.items() - } - id_to_durations = { - i: [len(list(g)) for _, g in groupby(u)] for i, u in id_to_units.items() - } - - return { - i: ForceAlignmentInfo( - tokens=id_to_units_collapsed[i], frame_durations=id_to_durations[i], - start_sec=None, end_sec=None - ) - for i in sample_ids - } - - -def get_feature_value_min_max(feature_paths: List[str]): - v_min, v_max = 1e-8, -1e-8 - for p in tqdm(feature_paths): - _path, slice_ptr = parse_path(p) - assert len(slice_ptr) == 2 - byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1]) - assert is_npy_data(byte_data) - path_or_fp = io.BytesIO(byte_data) - features = np.load(path_or_fp).squeeze() - v_min = min(v_min, features.min().item()) - v_max = max(v_max, features.max().item()) - return v_min, v_max diff --git a/examples/speech_synthesis/docs/common_voice_example.md b/examples/speech_synthesis/docs/common_voice_example.md deleted file mode 100644 index 1c0eef69a0..0000000000 --- a/examples/speech_synthesis/docs/common_voice_example.md +++ /dev/null @@ -1,67 +0,0 @@ -[[Back]](..) - -# Common Voice - -[Common Voice](https://commonvoice.mozilla.org/en/datasets) is a public domain speech corpus with 11.2K hours of read -speech in 76 languages (the latest version 7.0). We provide examples for building -[Transformer](https://arxiv.org/abs/1809.08895) models on this dataset. - - -## Data preparation -[Download](https://commonvoice.mozilla.org/en/datasets) and unpack Common Voice v4 to a path `${DATA_ROOT}/${LANG_ID}`. 
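Before generating manifests, it may help to confirm the unpacked corpus looks as expected. The sketch below assumes the standard Common Voice release layout (a `clips/` folder of MP3s plus per-split TSV metadata such as `validated.tsv`) and reads `DATA_ROOT`/`LANG_ID` from the environment; it is only a sanity check, not part of the preprocessing scripts.

```python
# Optional sanity check (assumes the standard Common Voice layout:
# a clips/ directory of MP3s plus TSV metadata files such as validated.tsv).
import os
from pathlib import Path

lang_root = Path(os.environ["DATA_ROOT"]) / os.environ["LANG_ID"]
assert (lang_root / "clips").is_dir(), "expected a clips/ folder of audio"
print(sorted(p.name for p in lang_root.glob("*.tsv")))  # per-split metadata files
```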
-Create splits and generate audio manifests with -```bash -python -m examples.speech_synthesis.preprocessing.get_common_voice_audio_manifest \ - --data-root ${DATA_ROOT} \ - --lang ${LANG_ID} \ - --output-manifest-root ${AUDIO_MANIFEST_ROOT} --convert-to-wav -``` - -To denoise audio and trim leading/trailing silence using signal processing based VAD, run -```bash -for SPLIT in dev test train; do - python -m examples.speech_synthesis.preprocessing.denoise_and_vad_audio \ - --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ - --output-dir ${PROCESSED_DATA_ROOT} \ - --denoise --vad --vad-agg-level 2 -done -``` - -which generates a new audio TSV manifest under `${PROCESSED_DATA_ROOT}` with updated path to the processed audio and -a new column for SNR. - -To do filtering by CER, follow the [Automatic Evaluation](../docs/ljspeech_example.md#automatic-evaluation) section to -run ASR model (add `--eval-target` to `get_eval_manifest` for evaluation on the reference audio; add `--err-unit char` -to `eval_asr` to compute CER instead of WER). The example-level CER is saved to -`${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv`. - -Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with -```bash -python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ - --audio-manifest-root ${AUDIO_MANIFEST_ROOT} \ - --output-root ${FEATURE_MANIFEST_ROOT} \ - --ipa-vocab --lang ${LANG_ID} \ - --snr-threshold 15 \ - --cer-threshold 0.1 --cer-tsv-path ${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv -``` -where we use phoneme inputs (`--ipa-vocab`) as example. For sample filtering, we set the SNR and CER threshold -to 15 and 10%, respectively. - - -## Training -(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#transformer).) - - -## Inference -(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#inference).) - -## Automatic Evaluation -(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#automatic-evaluation).) - -## Results - -| Language | Speakers | --arch | Params | Test MCD | Model | -|---|---|---|---|---|---| -| English | 200 | tts_transformer | 54M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/cv4_en200_transformer_phn.tar) | - -[[Back]](..) diff --git a/examples/speech_synthesis/docs/ljspeech_example.md b/examples/speech_synthesis/docs/ljspeech_example.md deleted file mode 100644 index 836c30d6d5..0000000000 --- a/examples/speech_synthesis/docs/ljspeech_example.md +++ /dev/null @@ -1,137 +0,0 @@ -[[Back]](..) - -# LJSpeech - -[LJSpeech](https://keithito.com/LJ-Speech-Dataset) is a public domain TTS -corpus with around 24 hours of English speech sampled at 22.05kHz. We provide examples for building -[Transformer](https://arxiv.org/abs/1809.08895) and [FastSpeech 2](https://arxiv.org/abs/2006.04558) -models on this dataset. - - -## Data preparation - -Download data, create splits and generate audio manifests with -```bash -python -m examples.speech_synthesis.preprocessing.get_ljspeech_audio_manifest \ - --output-data-root ${AUDIO_DATA_ROOT} \ - --output-manifest-root ${AUDIO_MANIFEST_ROOT} -``` - -Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with -```bash -python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ - --audio-manifest-root ${AUDIO_MANIFEST_ROOT} \ - --output-root ${FEATURE_MANIFEST_ROOT} \ - --ipa-vocab --use-g2p -``` -where we use phoneme inputs (`--ipa-vocab --use-g2p`) as example. 
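For reference, the feature-manifest step above computes 80-dimensional log-Mel spectrograms with the helpers in `examples/speech_synthesis/data_utils.py`. A minimal sketch of calling `extract_logmel_spectrogram` directly on a single utterance is shown below; the WAV filename is a placeholder, and the keyword arguments simply restate the helper's defaults (1024-point FFT, 256-sample hop, 80 Mel bins, 8 kHz Mel cutoff).

```python
# Minimal sketch (not part of the official pipeline): extract a log-Mel
# spectrogram for one utterance using the helper from
# examples/speech_synthesis/data_utils.py. The WAV path is a placeholder.
import torchaudio
from examples.speech_synthesis.data_utils import extract_logmel_spectrogram

waveform, sample_rate = torchaudio.load("LJ001-0001.wav")  # hypothetical file
logmel = extract_logmel_spectrogram(
    waveform, sample_rate,
    n_fft=1024, win_length=1024, hop_length=256, n_mels=80,  # helper defaults
)
print(logmel.shape)  # T x D tensor: (num_frames, 80)
```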
- -FastSpeech 2 additionally requires frame durations, pitch and energy as auxiliary training targets. -Add `--add-fastspeech-targets` to include these fields in the feature manifests. We get frame durations either from -phoneme-level force-alignment or frame-level pseudo-text unit sequence. They should be pre-computed and specified via: -- `--textgrid-zip ${TEXT_GRID_ZIP_PATH}` for a ZIP file, inside which there is one - [TextGrid](https://www.fon.hum.uva.nl/praat/manual/TextGrid.html) file per sample to provide force-alignment info. -- `--id-to-units-tsv ${ID_TO_UNIT_TSV}` for a TSV file, where there are 2 columns for sample ID and - space-delimited pseudo-text unit sequence, respectively. - -For your convenience, we provide pre-computed -[force-alignment](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_mfa.zip) from -[Montreal Forced Aligner](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) and -[pseudo-text units](s3://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_hubert.tsv) from -[HuBERT](https://github.com/pytorch/fairseq/tree/main/examples/hubert). You can also generate them by yourself using -a different software or model. - - -## Training -#### Transformer -```bash -fairseq-train ${FEATURE_MANIFEST_ROOT} --save-dir ${SAVE_DIR} \ - --config-yaml config.yaml --train-subset train --valid-subset dev \ - --num-workers 4 --max-tokens 30000 --max-update 200000 \ - --task text_to_speech --criterion tacotron2 --arch tts_transformer \ - --clip-norm 5.0 --n-frames-per-step 4 --bce-pos-weight 5.0 \ - --dropout 0.1 --attention-dropout 0.1 --activation-dropout 0.1 \ - --encoder-normalize-before --decoder-normalize-before \ - --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ - --seed 1 --update-freq 8 --eval-inference --best-checkpoint-metric mcd_loss -``` -where `SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to -update it accordingly when using more than 1 GPU. - -#### FastSpeech2 -```bash -fairseq-train ${FEATURE_MANIFEST_ROOT} --save-dir ${SAVE_DIR} \ - --config-yaml config.yaml --train-subset train --valid-subset dev \ - --num-workers 4 --max-sentences 6 --max-update 200000 \ - --task text_to_speech --criterion fastspeech2 --arch fastspeech2 \ - --clip-norm 5.0 --n-frames-per-step 1 \ - --dropout 0.1 --attention-dropout 0.1 \ - --optimizer adam --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ - --seed 1 --update-freq 8 --eval-inference --best-checkpoint-metric mcd_loss -``` - - -## Inference -Average the last 5 checkpoints, generate the test split spectrogram and waveform using the default Griffin-Lim vocoder: -```bash -SPLIT=test -CHECKPOINT_NAME=avg_last_5 -CHECKPOINT_PATH=${SAVE_DIR}/checkpoint_${CHECKPOINT_NAME}.pt -python scripts/average_checkpoints.py --inputs ${SAVE_DIR} \ - --num-epoch-checkpoints 5 \ - --output ${CHECKPOINT_PATH} - -python -m examples.speech_synthesis.generate_waveform ${FEATURE_MANIFEST_ROOT} \ - --config-yaml config.yaml --gen-subset ${SPLIT} --task text_to_speech \ - --path ${CHECKPOINT_PATH} --max-tokens 50000 --spec-bwd-max-iter 32 \ - --dump-waveforms -``` -which dumps files (waveform, feature, attention plot, etc.) to `${SAVE_DIR}/generate-${CHECKPOINT_NAME}-${SPLIT}`. To -re-synthesize target waveforms for automatic evaluation, add `--dump-target`. - -## Automatic Evaluation -To start with, generate the manifest for synthetic speech, which will be taken as inputs by evaluation scripts. 
-```bash -python -m examples.speech_synthesis.evaluation.get_eval_manifest \ - --generation-root ${SAVE_DIR}/generate-${CHECKPOINT_NAME}-${SPLIT} \ - --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ - --output-path ${EVAL_OUTPUT_ROOT}/eval.tsv \ - --vocoder griffin_lim --sample-rate 22050 --audio-format flac \ - --use-resynthesized-target -``` -Speech recognition (ASR) models usually operate at lower sample rates (e.g. 16kHz). For the WER/CER metric, -you may need to resample the audios accordingly --- add `--output-sample-rate 16000` for `generate_waveform.py` and -use `--sample-rate 16000` for `get_eval_manifest.py`. - - -#### WER/CER metric -We use wav2vec 2.0 ASR model as example. [Download](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) -the model checkpoint and dictionary, then compute WER/CER with -```bash -python -m examples.speech_synthesis.evaluation.eval_asr \ - --audio-header syn --text-header text --err-unit char --split ${SPLIT} \ - --w2v-ckpt ${WAV2VEC2_CHECKPOINT_PATH} --w2v-dict-dir ${WAV2VEC2_DICT_DIR} \ - --raw-manifest ${EVAL_OUTPUT_ROOT}/eval_16khz.tsv --asr-dir ${EVAL_OUTPUT_ROOT}/asr -``` - -#### MCD/MSD metric -```bash -python -m examples.speech_synthesis.evaluation.eval_sp \ - ${EVAL_OUTPUT_ROOT}/eval.tsv --mcd --msd -``` - -#### F0 metrics -```bash -python -m examples.speech_synthesis.evaluation.eval_f0 \ - ${EVAL_OUTPUT_ROOT}/eval.tsv --gpe --vde --ffe -``` - - -## Results - -| --arch | Params | Test MCD | Model | -|---|---|---|---| -| tts_transformer | 54M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_transformer_phn.tar) | -| fastspeech2 | 41M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_fastspeech2_phn.tar) | - -[[Back]](..) diff --git a/examples/speech_synthesis/docs/vctk_example.md b/examples/speech_synthesis/docs/vctk_example.md deleted file mode 100644 index 6808256d44..0000000000 --- a/examples/speech_synthesis/docs/vctk_example.md +++ /dev/null @@ -1,61 +0,0 @@ -[[Back]](..) - -# VCTK - -[VCTK](https://datashare.ed.ac.uk/handle/10283/3443) is an open English speech corpus. We provide examples -for building [Transformer](https://arxiv.org/abs/1809.08895) models on this dataset. - - -## Data preparation -Download data, create splits and generate audio manifests with -```bash -python -m examples.speech_synthesis.preprocessing.get_vctk_audio_manifest \ - --output-data-root ${AUDIO_DATA_ROOT} \ - --output-manifest-root ${AUDIO_MANIFEST_ROOT} -``` - -To denoise audio and trim leading/trailing silence using signal processing based VAD, run -```bash -for SPLIT in dev test train; do - python -m examples.speech_synthesis.preprocessing.denoise_and_vad_audio \ - --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ - --output-dir ${PROCESSED_DATA_ROOT} \ - --denoise --vad --vad-agg-level 3 -done -``` -which generates a new audio TSV manifest under `${PROCESSED_DATA_ROOT}` with updated path to the processed audio and -a new column for SNR. - -To do filtering by CER, follow the [Automatic Evaluation](../docs/ljspeech_example.md#automatic-evaluation) section to -run ASR model (add `--eval-target` to `get_eval_manifest` for evaluation on the reference audio; add `--err-unit char` -to `eval_asr` to compute CER instead of WER). The example-level CER is saved to -`${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv`. 
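Before running `get_feature_manifest` with a CER threshold, it can be handy to see how many utterances the threshold would remove. The sketch below is not part of the toolkit; it just reads the three-column TSV (`id`, `audio`, `uer`) written by `eval_asr.py`, with a placeholder path and the 10% threshold used in the command that follows.

```python
# Rough sketch: inspect the per-example CER file produced by eval_asr.py
# before filtering. Columns follow the "id\taudio\tuer" header that script
# writes; the file path below is a placeholder.
import pandas as pd

cer = pd.read_csv("uer_cer.train.tsv", sep="\t")  # hypothetical path
threshold = 0.1  # matches --cer-threshold 0.1 in the command below
dropped = (cer["uer"] > threshold).sum()
print(f"{dropped}/{len(cer)} utterances above {threshold:.0%} CER")
```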
- -Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with -```bash -python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ - --audio-manifest-root ${PROCESSED_DATA_ROOT} \ - --output-root ${FEATURE_MANIFEST_ROOT} \ - --ipa-vocab --use-g2p \ - --snr-threshold 15 \ - --cer-threshold 0.1 --cer-tsv-path ${EVAL_OUTPUT_ROOT}/uer_cer.${SPLIT}.tsv -``` -where we use phoneme inputs (`--ipa-vocab --use-g2p`) as example. For sample filtering, we set the SNR and CER threshold -to 15 and 10%, respectively. - -## Training -(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#transformer).) - -## Inference -(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#inference).) - -## Automatic Evaluation -(Please refer to [the LJSpeech example](../docs/ljspeech_example.md#automatic-evaluation).) - -## Results - -| --arch | Params | Test MCD | Model | -|---|---|---|---| -| tts_transformer | 54M | 3.4 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/vctk_transformer_phn.tar) | - -[[Back]](..) diff --git a/examples/speech_synthesis/evaluation/__init__.py b/examples/speech_synthesis/evaluation/__init__.py deleted file mode 100644 index 6264236915..0000000000 --- a/examples/speech_synthesis/evaluation/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/examples/speech_synthesis/evaluation/eval_asr.py b/examples/speech_synthesis/evaluation/eval_asr.py deleted file mode 100644 index 005a11bfb3..0000000000 --- a/examples/speech_synthesis/evaluation/eval_asr.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import argparse -import editdistance -import re -import shutil -import soundfile as sf -import subprocess -from pathlib import Path - -from examples.speech_to_text.data_utils import load_tsv_to_dicts - - -def preprocess_text(text): - text = "|".join(re.sub(r"[^A-Z' ]", " ", text.upper()).split()) - text = " ".join(text) - return text - - -def prepare_w2v_data( - dict_dir, sample_rate, label, audio_paths, texts, split, data_dir -): - data_dir.mkdir(parents=True, exist_ok=True) - shutil.copyfile( - dict_dir / f"dict.{label}.txt", - data_dir / f"dict.{label}.txt" - ) - with open(data_dir / f"{split}.tsv", "w") as f: - f.write("/\n") - for audio_path in audio_paths: - wav, sr = sf.read(audio_path) - assert sr == sample_rate, f"{sr} != sample_rate" - nsample = len(wav) - f.write(f"{audio_path}\t{nsample}\n") - with open(data_dir / f"{split}.{label}", "w") as f: - for text in texts: - text = preprocess_text(text) - f.write(f"{text}\n") - - -def run_asr(asr_dir, split, w2v_ckpt, w2v_label, res_dir): - """ - results will be saved at - {res_dir}/{ref,hypo}.word-{w2v_ckpt.filename}-{split}.txt - """ - cmd = ["python", "-m", "examples.speech_recognition.infer"] - cmd += [str(asr_dir.resolve())] - cmd += ["--task", "audio_finetuning", "--nbest", "1", "--quiet"] - cmd += ["--w2l-decoder", "viterbi", "--criterion", "ctc"] - cmd += ["--post-process", "letter", "--max-tokens", "4000000"] - cmd += ["--path", str(w2v_ckpt.resolve()), "--labels", w2v_label] - cmd += ["--gen-subset", split, "--results-path", str(res_dir.resolve())] - - print(f"running cmd:\n{' '.join(cmd)}") - subprocess.run(cmd, check=True) - - -def compute_error_rate(hyp_wrd_path, ref_wrd_path, unit="word"): - """each line is " (None-)" """ - tokenize_line = { - "word": lambda x: re.sub(r" \(.*\)$", "", x.rstrip()).split(), - "char": lambda x: list(re.sub(r" \(.*\)$", "", x.rstrip())) - }.get(unit) - if tokenize_line is None: - raise ValueError(f"{unit} not supported") - - inds = [int(re.sub(r"\D*(\d*)\D*", r"\1", line)) - for line in open(hyp_wrd_path)] - hyps = [tokenize_line(line) for line in open(hyp_wrd_path)] - refs = [tokenize_line(line) for line in open(ref_wrd_path)] - assert(len(hyps) == len(refs)) - err_rates = [ - editdistance.eval(hyp, ref) / len(ref) for hyp, ref in zip(hyps, refs) - ] - ind_to_err_rates = {i: e for i, e in zip(inds, err_rates)} - return ind_to_err_rates - - -def main(args): - samples = load_tsv_to_dicts(args.raw_manifest) - ids = [ - sample[args.id_header] if args.id_header else "" for sample in samples - ] - audio_paths = [sample[args.audio_header] for sample in samples] - texts = [sample[args.text_header] for sample in samples] - - prepare_w2v_data( - args.w2v_dict_dir, - args.w2v_sample_rate, - args.w2v_label, - audio_paths, - texts, - args.split, - args.asr_dir - ) - run_asr(args.asr_dir, args.split, args.w2v_ckpt, args.w2v_label, args.asr_dir) - ind_to_err_rates = compute_error_rate( - args.asr_dir / f"hypo.word-{args.w2v_ckpt.name}-{args.split}.txt", - args.asr_dir / f"ref.word-{args.w2v_ckpt.name}-{args.split}.txt", - args.err_unit, - ) - - uer_path = args.asr_dir / f"uer_{args.err_unit}.{args.split}.tsv" - with open(uer_path, "w") as f: - f.write("id\taudio\tuer\n") - for ind, (id_, audio_path) in enumerate(zip(ids, audio_paths)): - f.write(f"{id_}\t{audio_path}\t{ind_to_err_rates[ind]:.4f}\n") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--raw-manifest", required=True, type=Path) - parser.add_argument("--asr-dir", required=True, type=Path) - 
parser.add_argument("--id-header", default="id", type=str) - parser.add_argument("--audio-header", default="audio", type=str) - parser.add_argument("--text-header", default="src_text", type=str) - parser.add_argument("--split", default="raw", type=str) - parser.add_argument("--w2v-ckpt", required=True, type=Path) - parser.add_argument("--w2v-dict-dir", required=True, type=Path) - parser.add_argument("--w2v-sample-rate", default=16000, type=int) - parser.add_argument("--w2v-label", default="ltr", type=str) - parser.add_argument("--err-unit", default="word", type=str) - args = parser.parse_args() - - main(args) diff --git a/examples/speech_synthesis/evaluation/eval_f0.py b/examples/speech_synthesis/evaluation/eval_f0.py deleted file mode 100644 index df721d6831..0000000000 --- a/examples/speech_synthesis/evaluation/eval_f0.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Signal processing-based evaluation using waveforms -""" -import numpy as np -import os.path as op - -import torchaudio -import tqdm -from tabulate import tabulate - -from examples.speech_synthesis.utils import ( - gross_pitch_error, voicing_decision_error, f0_frame_error -) -from examples.speech_synthesis.evaluation.eval_sp import load_eval_spec - - -def difference_function(x, n, tau_max): - """ - Compute difference function of data x. This solution is implemented directly - with Numpy fft. - - - :param x: audio data - :param n: length of data - :param tau_max: integration window size - :return: difference function - :rtype: list - """ - - x = np.array(x, np.float64) - w = x.size - tau_max = min(tau_max, w) - x_cumsum = np.concatenate((np.array([0.]), (x * x).cumsum())) - size = w + tau_max - p2 = (size // 32).bit_length() - nice_numbers = (16, 18, 20, 24, 25, 27, 30, 32) - size_pad = min(x * 2 ** p2 for x in nice_numbers if x * 2 ** p2 >= size) - fc = np.fft.rfft(x, size_pad) - conv = np.fft.irfft(fc * fc.conjugate())[:tau_max] - return x_cumsum[w:w - tau_max:-1] + x_cumsum[w] - x_cumsum[:tau_max] - \ - 2 * conv - - -def cumulative_mean_normalized_difference_function(df, n): - """ - Compute cumulative mean normalized difference function (CMND). - - :param df: Difference function - :param n: length of data - :return: cumulative mean normalized difference function - :rtype: list - """ - - # scipy method - cmn_df = df[1:] * range(1, n) / np.cumsum(df[1:]).astype(float) - return np.insert(cmn_df, 0, 1) - - -def get_pitch(cmdf, tau_min, tau_max, harmo_th=0.1): - """ - Return fundamental period of a frame based on CMND function. - - :param cmdf: Cumulative Mean Normalized Difference function - :param tau_min: minimum period for speech - :param tau_max: maximum period for speech - :param harmo_th: harmonicity threshold to determine if it is necessary to - compute pitch frequency - :return: fundamental period if there is values under threshold, 0 otherwise - :rtype: float - """ - tau = tau_min - while tau < tau_max: - if cmdf[tau] < harmo_th: - while tau + 1 < tau_max and cmdf[tau + 1] < cmdf[tau]: - tau += 1 - return tau - tau += 1 - - return 0 # if unvoiced - - -def compute_yin(sig, sr, w_len=512, w_step=256, f0_min=100, f0_max=500, - harmo_thresh=0.1): - """ - - Compute the Yin Algorithm. Return fundamental frequency and harmonic rate. 
- - https://github.com/NVIDIA/mellotron adaption of - https://github.com/patriceguyot/Yin - - :param sig: Audio signal (list of float) - :param sr: sampling rate (int) - :param w_len: size of the analysis window (samples) - :param w_step: size of the lag between two consecutives windows (samples) - :param f0_min: Minimum fundamental frequency that can be detected (hertz) - :param f0_max: Maximum fundamental frequency that can be detected (hertz) - :param harmo_thresh: Threshold of detection. The yalgorithmù return the - first minimum of the CMND function below this threshold. - - :returns: - - * pitches: list of fundamental frequencies, - * harmonic_rates: list of harmonic rate values for each fundamental - frequency value (= confidence value) - * argmins: minimums of the Cumulative Mean Normalized DifferenceFunction - * times: list of time of each estimation - :rtype: tuple - """ - - tau_min = int(sr / f0_max) - tau_max = int(sr / f0_min) - - # time values for each analysis window - time_scale = range(0, len(sig) - w_len, w_step) - times = [t/float(sr) for t in time_scale] - frames = [sig[t:t + w_len] for t in time_scale] - - pitches = [0.0] * len(time_scale) - harmonic_rates = [0.0] * len(time_scale) - argmins = [0.0] * len(time_scale) - - for i, frame in enumerate(frames): - # Compute YIN - df = difference_function(frame, w_len, tau_max) - cm_df = cumulative_mean_normalized_difference_function(df, tau_max) - p = get_pitch(cm_df, tau_min, tau_max, harmo_thresh) - - # Get results - if np.argmin(cm_df) > tau_min: - argmins[i] = float(sr / np.argmin(cm_df)) - if p != 0: # A pitch was found - pitches[i] = float(sr / p) - harmonic_rates[i] = cm_df[p] - else: # No pitch, but we compute a value of the harmonic rate - harmonic_rates[i] = min(cm_df) - - return pitches, harmonic_rates, argmins, times - - -def extract_f0(samples): - f0_samples = [] - for sample in tqdm.tqdm(samples): - if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]): - f0_samples.append(None) - continue - - # assume single channel - yref, sr = torchaudio.load(sample["ref"]) - ysyn, _sr = torchaudio.load(sample["syn"]) - yref, ysyn = yref[0], ysyn[0] - assert sr == _sr, f"{sr} != {_sr}" - - yref_f0 = compute_yin(yref, sr) - ysyn_f0 = compute_yin(ysyn, sr) - - f0_samples += [ - { - "ref": yref_f0, - "syn": ysyn_f0 - } - ] - - return f0_samples - - -def eval_f0_error(samples, distortion_fn): - results = [] - for sample in tqdm.tqdm(samples): - if sample is None: - results.append(None) - continue - # assume single channel - yref_f, _, _, yref_t = sample["ref"] - ysyn_f, _, _, ysyn_t = sample["syn"] - - yref_f = np.array(yref_f) - yref_t = np.array(yref_t) - ysyn_f = np.array(ysyn_f) - ysyn_t = np.array(ysyn_t) - - distortion = distortion_fn(yref_t, yref_f, ysyn_t, ysyn_f) - results.append((distortion.item(), - len(yref_f), - len(ysyn_f) - )) - return results - - -def eval_gross_pitch_error(samples): - return eval_f0_error(samples, gross_pitch_error) - - -def eval_voicing_decision_error(samples): - return eval_f0_error(samples, voicing_decision_error) - - -def eval_f0_frame_error(samples): - return eval_f0_error(samples, f0_frame_error) - - -def print_results(results, show_bin): - results = np.array(list(filter(lambda x: x is not None, results))) - - np.set_printoptions(precision=3) - - def _print_result(results): - res = { - "nutt": len(results), - "error": results[:, 0].mean(), - "std": results[:, 0].std(), - "dur_ref": int(results[:, 1].sum()), - "dur_syn": int(results[:, 2].sum()), - } - 
print(tabulate([res.values()], res.keys(), floatfmt=".4f")) - - print(">>>> ALL") - _print_result(results) - - if show_bin: - edges = [0, 200, 400, 600, 800, 1000, 2000, 4000] - for i in range(1, len(edges)): - mask = np.logical_and(results[:, 1] >= edges[i-1], - results[:, 1] < edges[i]) - if not mask.any(): - continue - bin_results = results[mask] - print(f">>>> ({edges[i-1]}, {edges[i]})") - _print_result(bin_results) - - -def main(eval_f0, gpe, vde, ffe, show_bin): - samples = load_eval_spec(eval_f0) - if gpe or vde or ffe: - f0_samples = extract_f0(samples) - - if gpe: - print("===== Evaluate Gross Pitch Error =====") - results = eval_gross_pitch_error(f0_samples) - print_results(results, show_bin) - if vde: - print("===== Evaluate Voicing Decision Error =====") - results = eval_voicing_decision_error(f0_samples) - print_results(results, show_bin) - if ffe: - print("===== Evaluate F0 Frame Error =====") - results = eval_f0_frame_error(f0_samples) - print_results(results, show_bin) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("eval_f0") - parser.add_argument("--gpe", action="store_true") - parser.add_argument("--vde", action="store_true") - parser.add_argument("--ffe", action="store_true") - parser.add_argument("--show-bin", action="store_true") - args = parser.parse_args() - - main(args.eval_f0, args.gpe, args.vde, args.ffe, args.show_bin) diff --git a/examples/speech_synthesis/evaluation/eval_sp.py b/examples/speech_synthesis/evaluation/eval_sp.py deleted file mode 100644 index 702c498038..0000000000 --- a/examples/speech_synthesis/evaluation/eval_sp.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- - -""" -Signal processing-based evaluation using waveforms -""" - -import csv -import numpy as np -import os.path as op - -import torch -import tqdm -from tabulate import tabulate -import torchaudio - -from examples.speech_synthesis.utils import batch_mel_spectral_distortion -from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion - - -def load_eval_spec(path): - with open(path) as f: - reader = csv.DictReader(f, delimiter='\t') - samples = list(reader) - return samples - - -def eval_distortion(samples, distortion_fn, device="cuda"): - nmiss = 0 - results = [] - for sample in tqdm.tqdm(samples): - if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]): - nmiss += 1 - results.append(None) - continue - # assume single channel - yref, sr = torchaudio.load(sample["ref"]) - ysyn, _sr = torchaudio.load(sample["syn"]) - yref, ysyn = yref[0].to(device), ysyn[0].to(device) - assert sr == _sr, f"{sr} != {_sr}" - - distortion, extra = distortion_fn([yref], [ysyn], sr, None)[0] - _, _, _, _, _, pathmap = extra - nins = torch.sum(pathmap.sum(dim=1) - 1) # extra frames in syn - ndel = torch.sum(pathmap.sum(dim=0) - 1) # missing frames from syn - results.append( - (distortion.item(), # path distortion - pathmap.size(0), # yref num frames - pathmap.size(1), # ysyn num frames - pathmap.sum().item(), # path length - nins.item(), # insertion - ndel.item(), # deletion - ) - ) - return results - - -def eval_mel_cepstral_distortion(samples, device="cuda"): - return eval_distortion(samples, batch_mel_cepstral_distortion, device) - - -def eval_mel_spectral_distortion(samples, device="cuda"): - return eval_distortion(samples, batch_mel_spectral_distortion, device) - - -def print_results(results, show_bin): - results = np.array(list(filter(lambda x: x is not None, results))) - - np.set_printoptions(precision=3) - - def _print_result(results): - dist, dur_ref, dur_syn, dur_ali, nins, ndel = results.sum(axis=0) - res = { - "nutt": len(results), - "dist": dist, - "dur_ref": int(dur_ref), - "dur_syn": int(dur_syn), - "dur_ali": int(dur_ali), - "dist_per_ref_frm": dist/dur_ref, - "dist_per_syn_frm": dist/dur_syn, - "dist_per_ali_frm": dist/dur_ali, - "ins": nins/dur_ref, - "del": ndel/dur_ref, - } - print(tabulate( - [res.values()], - res.keys(), - floatfmt=".4f" - )) - - print(">>>> ALL") - _print_result(results) - - if show_bin: - edges = [0, 200, 400, 600, 800, 1000, 2000, 4000] - for i in range(1, len(edges)): - mask = np.logical_and(results[:, 1] >= edges[i-1], - results[:, 1] < edges[i]) - if not mask.any(): - continue - bin_results = results[mask] - print(f">>>> ({edges[i-1]}, {edges[i]})") - _print_result(bin_results) - - -def main(eval_spec, mcd, msd, show_bin): - samples = load_eval_spec(eval_spec) - device = "cpu" - if mcd: - print("===== Evaluate Mean Cepstral Distortion =====") - results = eval_mel_cepstral_distortion(samples, device) - print_results(results, show_bin) - if msd: - print("===== Evaluate Mean Spectral Distortion =====") - results = eval_mel_spectral_distortion(samples, device) - print_results(results, show_bin) - - -if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument("eval_spec") - parser.add_argument("--mcd", action="store_true") - parser.add_argument("--msd", action="store_true") - parser.add_argument("--show-bin", action="store_true") - args = parser.parse_args() - - main(args.eval_spec, args.mcd, args.msd, args.show_bin) diff --git a/examples/speech_synthesis/evaluation/get_eval_manifest.py 
b/examples/speech_synthesis/evaluation/get_eval_manifest.py deleted file mode 100644 index 44b3685bb2..0000000000 --- a/examples/speech_synthesis/evaluation/get_eval_manifest.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import csv -from pathlib import Path - - -def main(args): - """ - `uid syn ref text` - """ - in_root = Path(args.generation_root).resolve() - ext = args.audio_format - with open(args.audio_manifest) as f, open(args.output_path, "w") as f_out: - reader = csv.DictReader( - f, delimiter="\t", quotechar=None, doublequote=False, - lineterminator="\n", quoting=csv.QUOTE_NONE - ) - header = ["id", "syn", "ref", "text", "speaker"] - f_out.write("\t".join(header) + "\n") - for row in reader: - dir_name = f"{ext}_{args.sample_rate}hz_{args.vocoder}" - id_ = row["id"] - syn = (in_root / dir_name / f"{id_}.{ext}").as_posix() - ref = row["audio"] - if args.use_resynthesized_target: - ref = (in_root / f"{dir_name}_tgt" / f"{id_}.{ext}").as_posix() - if args.eval_target: - syn = row["audio"] - sample = [id_, syn, ref, row["tgt_text"], row["speaker"]] - f_out.write("\t".join(sample) + "\n") - print(f"wrote evaluation file to {args.output_path}") - - -if __name__ == "__main__": - import argparse - parser = argparse.ArgumentParser() - parser.add_argument( - "--generation-root", help="output directory for generate_waveform.py" - ) - parser.add_argument( - "--audio-manifest", - help="used to determine the original utterance ID and text" - ) - parser.add_argument( - "--output-path", help="path to output evaluation spec file" - ) - parser.add_argument( - "--use-resynthesized-target", action="store_true", - help="use resynthesized reference instead of the original audio" - ) - parser.add_argument( - "--eval-target", action="store_true", - help="evaluate reference instead of model prediction" - ) - parser.add_argument("--vocoder", type=str, default="griffin_lim") - parser.add_argument("--sample-rate", type=int, default=22_050) - parser.add_argument("--audio-format", type=str, default="wav") - args = parser.parse_args() - - main(args) diff --git a/examples/speech_synthesis/generate_waveform.py b/examples/speech_synthesis/generate_waveform.py deleted file mode 100644 index 3b56190dbe..0000000000 --- a/examples/speech_synthesis/generate_waveform.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -import ast -import logging -import matplotlib.pyplot as plt -import numpy as np -from pathlib import Path -import soundfile as sf -import sys -import torch -import torchaudio - -from fairseq import checkpoint_utils, options, tasks, utils -from fairseq.logging import progress_bar -from fairseq.tasks.text_to_speech import plot_tts_output -from fairseq.data.audio.text_to_speech_dataset import TextToSpeechDataset - - -logging.basicConfig() -logging.root.setLevel(logging.INFO) -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def make_parser(): - parser = options.get_speech_generation_parser() - parser.add_argument("--dump-features", action="store_true") - parser.add_argument("--dump-waveforms", action="store_true") - parser.add_argument("--dump-attentions", action="store_true") - parser.add_argument("--dump-eos-probs", action="store_true") - parser.add_argument("--dump-plots", action="store_true") - parser.add_argument("--dump-target", action="store_true") - parser.add_argument("--output-sample-rate", default=22050, type=int) - parser.add_argument("--teacher-forcing", action="store_true") - parser.add_argument( - "--audio-format", type=str, default="wav", choices=["wav", "flac"] - ) - return parser - - -def postprocess_results( - dataset: TextToSpeechDataset, sample, hypos, resample_fn, dump_target -): - def to_np(x): - return None if x is None else x.detach().cpu().numpy() - - sample_ids = [dataset.ids[i] for i in sample["id"].tolist()] - texts = sample["src_texts"] if "src_texts" in sample else [""] * len(hypos) - attns = [to_np(hypo["attn"]) for hypo in hypos] - eos_probs = [to_np(hypo.get("eos_prob", None)) for hypo in hypos] - feat_preds = [to_np(hypo["feature"]) for hypo in hypos] - wave_preds = [to_np(resample_fn(h["waveform"])) for h in hypos] - if dump_target: - feat_targs = [to_np(hypo["targ_feature"]) for hypo in hypos] - wave_targs = [to_np(resample_fn(h["targ_waveform"])) for h in hypos] - else: - feat_targs = [None for _ in hypos] - wave_targs = [None for _ in hypos] - - return zip(sample_ids, texts, attns, eos_probs, feat_preds, wave_preds, - feat_targs, wave_targs) - - -def dump_result( - is_na_model, - args, - vocoder, - sample_id, - text, - attn, - eos_prob, - feat_pred, - wave_pred, - feat_targ, - wave_targ, -): - sample_rate = args.output_sample_rate - out_root = Path(args.results_path) - if args.dump_features: - feat_dir = out_root / "feat" - feat_dir.mkdir(exist_ok=True, parents=True) - np.save(feat_dir / f"{sample_id}.npy", feat_pred) - if args.dump_target: - feat_tgt_dir = out_root / "feat_tgt" - feat_tgt_dir.mkdir(exist_ok=True, parents=True) - np.save(feat_tgt_dir / f"{sample_id}.npy", feat_targ) - if args.dump_attentions: - attn_dir = out_root / "attn" - attn_dir.mkdir(exist_ok=True, parents=True) - np.save(attn_dir / f"{sample_id}.npy", attn.numpy()) - if args.dump_eos_probs and not is_na_model: - eos_dir = out_root / "eos" - eos_dir.mkdir(exist_ok=True, parents=True) - np.save(eos_dir / f"{sample_id}.npy", eos_prob) - - if args.dump_plots: - images = [feat_pred.T] if is_na_model else [feat_pred.T, attn] - names = ["output"] if is_na_model else ["output", "alignment"] - if feat_targ is not None: - images = [feat_targ.T] + images - names = [f"target (idx={sample_id})"] + names - if is_na_model: - plot_tts_output(images, names, attn, "alignment", suptitle=text) - else: - plot_tts_output(images, names, eos_prob, "eos prob", suptitle=text) - plot_dir = out_root / "plot" - plot_dir.mkdir(exist_ok=True, parents=True) - 
plt.savefig(plot_dir / f"{sample_id}.png") - plt.close() - - if args.dump_waveforms: - ext = args.audio_format - if wave_pred is not None: - wav_dir = out_root / f"{ext}_{sample_rate}hz_{vocoder}" - wav_dir.mkdir(exist_ok=True, parents=True) - sf.write(wav_dir / f"{sample_id}.{ext}", wave_pred, sample_rate) - if args.dump_target and wave_targ is not None: - wav_tgt_dir = out_root / f"{ext}_{sample_rate}hz_{vocoder}_tgt" - wav_tgt_dir.mkdir(exist_ok=True, parents=True) - sf.write(wav_tgt_dir / f"{sample_id}.{ext}", wave_targ, sample_rate) - - -def main(args): - assert(args.dump_features or args.dump_waveforms or args.dump_attentions - or args.dump_eos_probs or args.dump_plots) - if args.max_tokens is None and args.batch_size is None: - args.max_tokens = 8000 - logger.info(args) - - use_cuda = torch.cuda.is_available() and not args.cpu - task = tasks.setup_task(args) - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( - [args.path], - task=task, - arg_overrides=ast.literal_eval(args.model_overrides), - ) - model = models[0].cuda() if use_cuda else models[0] - # use the original n_frames_per_step - task.args.n_frames_per_step = saved_cfg.task.n_frames_per_step - task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task) - - data_cfg = task.data_cfg - sample_rate = data_cfg.config.get("features", {}).get("sample_rate", 22050) - resample_fn = { - False: lambda x: x, - True: lambda x: torchaudio.sox_effects.apply_effects_tensor( - x.detach().cpu().unsqueeze(0), sample_rate, - [['rate', str(args.output_sample_rate)]] - )[0].squeeze(0) - }.get(args.output_sample_rate != sample_rate) - if args.output_sample_rate != sample_rate: - logger.info(f"resampling to {args.output_sample_rate}Hz") - - generator = task.build_generator([model], args) - itr = task.get_batch_iterator( - dataset=task.dataset(args.gen_subset), - max_tokens=args.max_tokens, - max_sentences=args.batch_size, - max_positions=(sys.maxsize, sys.maxsize), - ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, - required_batch_size_multiple=args.required_batch_size_multiple, - num_shards=args.num_shards, - shard_id=args.shard_id, - num_workers=args.num_workers, - data_buffer_size=args.data_buffer_size, - ).next_epoch_itr(shuffle=False) - - Path(args.results_path).mkdir(exist_ok=True, parents=True) - is_na_model = getattr(model, "NON_AUTOREGRESSIVE", False) - dataset = task.dataset(args.gen_subset) - vocoder = task.args.vocoder - with progress_bar.build_progress_bar(args, itr) as t: - for sample in t: - sample = utils.move_to_cuda(sample) if use_cuda else sample - hypos = generator.generate(model, sample, has_targ=args.dump_target) - for result in postprocess_results( - dataset, sample, hypos, resample_fn, args.dump_target - ): - dump_result(is_na_model, args, vocoder, *result) - - -def cli_main(): - parser = make_parser() - args = options.parse_args_and_arch(parser) - main(args) - - -if __name__ == "__main__": - cli_main() diff --git a/examples/speech_synthesis/preprocessing/__init__.py b/examples/speech_synthesis/preprocessing/__init__.py deleted file mode 100644 index 6264236915..0000000000 --- a/examples/speech_synthesis/preprocessing/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
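One convention worth noting: `generate_waveform.py` writes synthesized audio under `--results-path` in a directory named `{audio_format}_{sample_rate}hz_{vocoder}`, and `get_eval_manifest.py` resolves files under `--generation-root` with the same pattern. A small illustration of the path it builds for one utterance (all concrete values below are placeholders):

```python
# Illustration only: the synthesized-audio path that get_eval_manifest.py
# expects, mirroring the directory naming used by generate_waveform.py
# (f"{ext}_{sample_rate}hz_{vocoder}").
from pathlib import Path

generation_root = Path("checkpoints/generate-avg_last_5-test")  # hypothetical
ext, sample_rate, vocoder, utt_id = "flac", 22050, "griffin_lim", "LJ001-0001"
syn_path = generation_root / f"{ext}_{sample_rate}hz_{vocoder}" / f"{utt_id}.{ext}"
print(syn_path)  # .../flac_22050hz_griffin_lim/LJ001-0001.flac
```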
diff --git a/examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py b/examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py deleted file mode 100644 index 4e13b38a5d..0000000000 --- a/examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import logging -import os -import csv -import tempfile -from collections import defaultdict -from pathlib import Path - -import torchaudio -try: - import webrtcvad -except ImportError: - raise ImportError("Please install py-webrtcvad: pip install webrtcvad") -import pandas as pd -from tqdm import tqdm - -from examples.speech_synthesis.preprocessing.denoiser.pretrained import master64 -import examples.speech_synthesis.preprocessing.denoiser.utils as utils -from examples.speech_synthesis.preprocessing.vad import ( - frame_generator, vad_collector, read_wave, write_wave, FS_MS, THRESHOLD, - SCALE -) -from examples.speech_to_text.data_utils import save_df_to_tsv - - -log = logging.getLogger(__name__) - -PATHS = ["after_denoise", "after_vad"] -MIN_T = 0.05 - - -def generate_tmp_filename(extension="txt"): - return tempfile._get_default_tempdir() + "/" + \ - next(tempfile._get_candidate_names()) + "." + extension - - -def convert_sr(inpath, sr, output_path=None): - if not output_path: - output_path = generate_tmp_filename("wav") - cmd = f"sox {inpath} -r {sr} {output_path}" - os.system(cmd) - return output_path - - -def apply_vad(vad, inpath): - audio, sample_rate = read_wave(inpath) - frames = frame_generator(FS_MS, audio, sample_rate) - frames = list(frames) - segments = vad_collector(sample_rate, FS_MS, 300, vad, frames) - merge_segments = list() - timestamp_start = 0.0 - timestamp_end = 0.0 - # removing start, end, and long sequences of sils - for i, segment in enumerate(segments): - merge_segments.append(segment[0]) - if i and timestamp_start: - sil_duration = segment[1] - timestamp_end - if sil_duration > THRESHOLD: - merge_segments.append(int(THRESHOLD / SCALE) * (b'\x00')) - else: - merge_segments.append(int((sil_duration / SCALE)) * (b'\x00')) - timestamp_start = segment[1] - timestamp_end = segment[2] - segment = b''.join(merge_segments) - return segment, sample_rate - - -def write(wav, filename, sr=16_000): - # Normalize audio if it prevents clipping - wav = wav / max(wav.abs().max().item(), 1) - torchaudio.save(filename, wav.cpu(), sr, encoding="PCM_S", - bits_per_sample=16) - - -def process(args): - # making sure we are requested either denoise or vad - if not args.denoise and not args.vad: - log.error("No denoise or vad is requested.") - return - - log.info("Creating out directories...") - if args.denoise: - out_denoise = Path(args.output_dir).absolute().joinpath(PATHS[0]) - out_denoise.mkdir(parents=True, exist_ok=True) - if args.vad: - out_vad = Path(args.output_dir).absolute().joinpath(PATHS[1]) - out_vad.mkdir(parents=True, exist_ok=True) - - log.info("Loading pre-trained speech enhancement model...") - model = master64().to(args.device) - - log.info("Building the VAD model...") - vad = webrtcvad.Vad(int(args.vad_agg_level)) - - # preparing the output dict - output_dict = defaultdict(list) - - log.info(f"Parsing input manifest: {args.audio_manifest}") - with open(args.audio_manifest, "r") as f: - manifest_dict = csv.DictReader(f, delimiter="\t") - for row in tqdm(manifest_dict): - 
filename = str(row["audio"]) - - final_output = filename - keep_sample = True - n_frames = row["n_frames"] - snr = -1 - if args.denoise: - output_path_denoise = out_denoise.joinpath(Path(filename).name) - # convert to 16khz in case we use a differet sr - tmp_path = convert_sr(final_output, 16000) - - # loading audio file and generating the enhanced version - out, sr = torchaudio.load(tmp_path) - out = out.to(args.device) - estimate = model(out) - estimate = (1 - args.dry_wet) * estimate + args.dry_wet * out - write(estimate[0], str(output_path_denoise), sr) - - snr = utils.cal_snr(out, estimate) - snr = snr.cpu().detach().numpy()[0][0] - final_output = str(output_path_denoise) - - if args.vad: - output_path_vad = out_vad.joinpath(Path(filename).name) - sr = torchaudio.info(final_output).sample_rate - if sr in [16000, 32000, 48000]: - tmp_path = final_output - elif sr < 16000: - tmp_path = convert_sr(final_output, 16000) - elif sr < 32000: - tmp_path = convert_sr(final_output, 32000) - else: - tmp_path = convert_sr(final_output, 48000) - # apply VAD - segment, sample_rate = apply_vad(vad, tmp_path) - if len(segment) < sample_rate * MIN_T: - keep_sample = False - print(( - f"WARNING: skip {filename} because it is too short " - f"after VAD ({len(segment) / sample_rate} < {MIN_T})" - )) - else: - if sample_rate != sr: - tmp_path = generate_tmp_filename("wav") - write_wave(tmp_path, segment, sample_rate) - convert_sr(tmp_path, sr, - output_path=str(output_path_vad)) - else: - write_wave(str(output_path_vad), segment, sample_rate) - final_output = str(output_path_vad) - segment, _ = torchaudio.load(final_output) - n_frames = segment.size(1) - - if keep_sample: - output_dict["id"].append(row["id"]) - output_dict["audio"].append(final_output) - output_dict["n_frames"].append(n_frames) - output_dict["tgt_text"].append(row["tgt_text"]) - output_dict["speaker"].append(row["speaker"]) - output_dict["src_text"].append(row["src_text"]) - output_dict["snr"].append(snr) - - out_tsv_path = Path(args.output_dir) / Path(args.audio_manifest).name - log.info(f"Saving manifest to {out_tsv_path.as_posix()}") - save_df_to_tsv(pd.DataFrame.from_dict(output_dict), out_tsv_path) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--audio-manifest", "-i", required=True, - type=str, help="path to the input manifest.") - parser.add_argument( - "--output-dir", "-o", required=True, type=str, - help="path to the output dir. it will contain files after denoising and" - " vad" - ) - parser.add_argument("--vad-agg-level", "-a", type=int, default=2, - help="the aggresive level of the vad [0-3].") - parser.add_argument( - "--dry-wet", "-dw", type=float, default=0.01, - help="the level of linear interpolation between noisy and enhanced " - "files." - ) - parser.add_argument( - "--device", "-d", type=str, default="cpu", - help="the device to be used for the speech enhancement model: " - "cpu | cuda." - ) - parser.add_argument("--denoise", action="store_true", - help="apply a denoising") - parser.add_argument("--vad", action="store_true", help="apply a VAD") - args = parser.parse_args() - - process(args) - - -if __name__ == "__main__": - main() diff --git a/examples/speech_synthesis/preprocessing/denoiser/__init__.py b/examples/speech_synthesis/preprocessing/denoiser/__init__.py deleted file mode 100644 index 6264236915..0000000000 --- a/examples/speech_synthesis/preprocessing/denoiser/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. diff --git a/examples/speech_synthesis/preprocessing/denoiser/demucs.py b/examples/speech_synthesis/preprocessing/denoiser/demucs.py deleted file mode 100644 index 3f70e73d6a..0000000000 --- a/examples/speech_synthesis/preprocessing/denoiser/demucs.py +++ /dev/null @@ -1,473 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# author: adefossez - -import math -import time - -import torch as th -from torch import nn -from torch.nn import functional as F - -from .resample import downsample2, upsample2 -from .utils import capture_init - - -class BLSTM(nn.Module): - def __init__(self, dim, layers=2, bi=True): - super().__init__() - klass = nn.LSTM - self.lstm = klass( - bidirectional=bi, num_layers=layers, hidden_size=dim, input_size=dim - ) - self.linear = None - if bi: - self.linear = nn.Linear(2 * dim, dim) - - def forward(self, x, hidden=None): - x, hidden = self.lstm(x, hidden) - if self.linear: - x = self.linear(x) - return x, hidden - - -def rescale_conv(conv, reference): - std = conv.weight.std().detach() - scale = (std / reference)**0.5 - conv.weight.data /= scale - if conv.bias is not None: - conv.bias.data /= scale - - -def rescale_module(module, reference): - for sub in module.modules(): - if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)): - rescale_conv(sub, reference) - - -class Demucs(nn.Module): - """ - Demucs speech enhancement model. - Args: - - chin (int): number of input channels. - - chout (int): number of output channels. - - hidden (int): number of initial hidden channels. - - depth (int): number of layers. - - kernel_size (int): kernel size for each layer. - - stride (int): stride for each layer. - - causal (bool): if false, uses BiLSTM instead of LSTM. - - resample (int): amount of resampling to apply to the input/output. - Can be one of 1, 2 or 4. - - growth (float): number of channels is multiplied by this for every layer. - - max_hidden (int): maximum number of channels. Can be useful to - control the size/speed of the model. - - normalize (bool): if true, normalize the input. - - glu (bool): if true uses GLU instead of ReLU in 1x1 convolutions. - - rescale (float): controls custom weight initialization. - See https://arxiv.org/abs/1911.13254. - - floor (float): stability flooring when normalizing. 
- - """ - @capture_init - def __init__(self, - chin=1, - chout=1, - hidden=48, - depth=5, - kernel_size=8, - stride=4, - causal=True, - resample=4, - growth=2, - max_hidden=10_000, - normalize=True, - glu=True, - rescale=0.1, - floor=1e-3): - - super().__init__() - if resample not in [1, 2, 4]: - raise ValueError("Resample should be 1, 2 or 4.") - - self.chin = chin - self.chout = chout - self.hidden = hidden - self.depth = depth - self.kernel_size = kernel_size - self.stride = stride - self.causal = causal - self.floor = floor - self.resample = resample - self.normalize = normalize - - self.encoder = nn.ModuleList() - self.decoder = nn.ModuleList() - activation = nn.GLU(1) if glu else nn.ReLU() - ch_scale = 2 if glu else 1 - - for index in range(depth): - encode = [] - encode += [ - nn.Conv1d(chin, hidden, kernel_size, stride), - nn.ReLU(), - nn.Conv1d(hidden, hidden * ch_scale, 1), activation, - ] - self.encoder.append(nn.Sequential(*encode)) - - decode = [] - decode += [ - nn.Conv1d(hidden, ch_scale * hidden, 1), activation, - nn.ConvTranspose1d(hidden, chout, kernel_size, stride), - ] - if index > 0: - decode.append(nn.ReLU()) - self.decoder.insert(0, nn.Sequential(*decode)) - chout = hidden - chin = hidden - hidden = min(int(growth * hidden), max_hidden) - - self.lstm = BLSTM(chin, bi=not causal) - if rescale: - rescale_module(self, reference=rescale) - - def valid_length(self, length): - """ - Return the nearest valid length to use with the model so that - there is no time steps left over in a convolutions, e.g. for all - layers, size of the input - kernel_size % stride = 0. - - If the mixture has a valid length, the estimated sources - will have exactly the same length. - """ - length = math.ceil(length * self.resample) - for _ in range(self.depth): - length = math.ceil((length - self.kernel_size) / self.stride) + 1 - length = max(length, 1) - for _ in range(self.depth): - length = (length - 1) * self.stride + self.kernel_size - length = int(math.ceil(length / self.resample)) - return int(length) - - @property - def total_stride(self): - return self.stride ** self.depth // self.resample - - def forward(self, mix): - if mix.dim() == 2: - mix = mix.unsqueeze(1) - - if self.normalize: - mono = mix.mean(dim=1, keepdim=True) - std = mono.std(dim=-1, keepdim=True) - mix = mix / (self.floor + std) - else: - std = 1 - length = mix.shape[-1] - x = mix - x = F.pad(x, (0, self.valid_length(length) - length)) - if self.resample == 2: - x = upsample2(x) - elif self.resample == 4: - x = upsample2(x) - x = upsample2(x) - skips = [] - for encode in self.encoder: - x = encode(x) - skips.append(x) - x = x.permute(2, 0, 1) - x, _ = self.lstm(x) - x = x.permute(1, 2, 0) - for decode in self.decoder: - skip = skips.pop(-1) - x = x + skip[..., :x.shape[-1]] - x = decode(x) - if self.resample == 2: - x = downsample2(x) - elif self.resample == 4: - x = downsample2(x) - x = downsample2(x) - - x = x[..., :length] - return std * x - - -def fast_conv(conv, x): - """ - Faster convolution evaluation if either kernel size is 1 - or length of sequence is 1. 
- """ - batch, chin, length = x.shape - chout, chin, kernel = conv.weight.shape - assert batch == 1 - if kernel == 1: - x = x.view(chin, length) - out = th.addmm(conv.bias.view(-1, 1), - conv.weight.view(chout, chin), x) - elif length == kernel: - x = x.view(chin * kernel, 1) - out = th.addmm(conv.bias.view(-1, 1), - conv.weight.view(chout, chin * kernel), x) - else: - out = conv(x) - return out.view(batch, chout, -1) - - -class DemucsStreamer: - """ - Streaming implementation for Demucs. It supports being fed with any amount - of audio at a time. You will get back as much audio as possible at that - point. - - Args: - - demucs (Demucs): Demucs model. - - dry (float): amount of dry (e.g. input) signal to keep. 0 is maximum - noise removal, 1 just returns the input signal. Small values > 0 - allows to limit distortions. - - num_frames (int): number of frames to process at once. Higher values - will increase overall latency but improve the real time factor. - - resample_lookahead (int): extra lookahead used for the resampling. - - resample_buffer (int): size of the buffer of previous inputs/outputs - kept for resampling. - """ - def __init__(self, demucs, - dry=0, - num_frames=1, - resample_lookahead=64, - resample_buffer=256): - device = next(iter(demucs.parameters())).device - self.demucs = demucs - self.lstm_state = None - self.conv_state = None - self.dry = dry - self.resample_lookahead = resample_lookahead - resample_buffer = min(demucs.total_stride, resample_buffer) - self.resample_buffer = resample_buffer - self.frame_length = demucs.valid_length(1) + \ - demucs.total_stride * (num_frames - 1) - self.total_length = self.frame_length + self.resample_lookahead - self.stride = demucs.total_stride * num_frames - self.resample_in = th.zeros(demucs.chin, resample_buffer, device=device) - self.resample_out = th.zeros( - demucs.chin, resample_buffer, device=device - ) - - self.frames = 0 - self.total_time = 0 - self.variance = 0 - self.pending = th.zeros(demucs.chin, 0, device=device) - - bias = demucs.decoder[0][2].bias - weight = demucs.decoder[0][2].weight - chin, chout, kernel = weight.shape - self._bias = bias.view(-1, 1).repeat(1, kernel).view(-1, 1) - self._weight = weight.permute(1, 2, 0).contiguous() - - def reset_time_per_frame(self): - self.total_time = 0 - self.frames = 0 - - @property - def time_per_frame(self): - return self.total_time / self.frames - - def flush(self): - """ - Flush remaining audio by padding it with zero. Call this - when you have no more input and want to get back the last chunk of audio. - """ - pending_length = self.pending.shape[1] - padding = th.zeros( - self.demucs.chin, self.total_length, device=self.pending.device - ) - out = self.feed(padding) - return out[:, :pending_length] - - def feed(self, wav): - """ - Apply the model to mix using true real time evaluation. - Normalization is done online as is the resampling. 
- """ - begin = time.time() - demucs = self.demucs - resample_buffer = self.resample_buffer - stride = self.stride - resample = demucs.resample - - if wav.dim() != 2: - raise ValueError("input wav should be two dimensional.") - chin, _ = wav.shape - if chin != demucs.chin: - raise ValueError(f"Expected {demucs.chin} channels, got {chin}") - - self.pending = th.cat([self.pending, wav], dim=1) - outs = [] - while self.pending.shape[1] >= self.total_length: - self.frames += 1 - frame = self.pending[:, :self.total_length] - dry_signal = frame[:, :stride] - if demucs.normalize: - mono = frame.mean(0) - variance = (mono**2).mean() - self.variance = variance / self.frames + \ - (1 - 1 / self.frames) * self.variance - frame = frame / (demucs.floor + math.sqrt(self.variance)) - frame = th.cat([self.resample_in, frame], dim=-1) - self.resample_in[:] = frame[:, stride - resample_buffer:stride] - - if resample == 4: - frame = upsample2(upsample2(frame)) - elif resample == 2: - frame = upsample2(frame) - # remove pre sampling buffer - frame = frame[:, resample * resample_buffer:] - # remove extra samples after window - frame = frame[:, :resample * self.frame_length] - - out, extra = self._separate_frame(frame) - padded_out = th.cat([self.resample_out, out, extra], 1) - self.resample_out[:] = out[:, -resample_buffer:] - if resample == 4: - out = downsample2(downsample2(padded_out)) - elif resample == 2: - out = downsample2(padded_out) - else: - out = padded_out - - out = out[:, resample_buffer // resample:] - out = out[:, :stride] - - if demucs.normalize: - out *= math.sqrt(self.variance) - out = self.dry * dry_signal + (1 - self.dry) * out - outs.append(out) - self.pending = self.pending[:, stride:] - - self.total_time += time.time() - begin - if outs: - out = th.cat(outs, 1) - else: - out = th.zeros(chin, 0, device=wav.device) - return out - - def _separate_frame(self, frame): - demucs = self.demucs - skips = [] - next_state = [] - first = self.conv_state is None - stride = self.stride * demucs.resample - x = frame[None] - for idx, encode in enumerate(demucs.encoder): - stride //= demucs.stride - length = x.shape[2] - if idx == demucs.depth - 1: - # This is sligthly faster for the last conv - x = fast_conv(encode[0], x) - x = encode[1](x) - x = fast_conv(encode[2], x) - x = encode[3](x) - else: - if not first: - prev = self.conv_state.pop(0) - prev = prev[..., stride:] - tgt = (length - demucs.kernel_size) // demucs.stride + 1 - missing = tgt - prev.shape[-1] - offset = length - demucs.kernel_size - \ - demucs.stride * (missing - 1) - x = x[..., offset:] - x = encode[1](encode[0](x)) - x = fast_conv(encode[2], x) - x = encode[3](x) - if not first: - x = th.cat([prev, x], -1) - next_state.append(x) - skips.append(x) - - x = x.permute(2, 0, 1) - x, self.lstm_state = demucs.lstm(x, self.lstm_state) - x = x.permute(1, 2, 0) - # In the following, x contains only correct samples, i.e. the one - # for which each time position is covered by two window of the upper - # layer. extra contains extra samples to the right, and is used only as - # a better padding for the online resampling. 
- extra = None - for idx, decode in enumerate(demucs.decoder): - skip = skips.pop(-1) - x += skip[..., :x.shape[-1]] - x = fast_conv(decode[0], x) - x = decode[1](x) - - if extra is not None: - skip = skip[..., x.shape[-1]:] - extra += skip[..., :extra.shape[-1]] - extra = decode[2](decode[1](decode[0](extra))) - x = decode[2](x) - next_state.append( - x[..., -demucs.stride:] - decode[2].bias.view(-1, 1) - ) - if extra is None: - extra = x[..., -demucs.stride:] - else: - extra[..., :demucs.stride] += next_state[-1] - x = x[..., :-demucs.stride] - - if not first: - prev = self.conv_state.pop(0) - x[..., :demucs.stride] += prev - if idx != demucs.depth - 1: - x = decode[3](x) - extra = decode[3](extra) - self.conv_state = next_state - return x[0], extra[0] - - -def test(): - import argparse - parser = argparse.ArgumentParser( - "denoiser.demucs", - description="Benchmark the streaming Demucs implementation, as well as " - "checking the delta with the offline implementation.") - parser.add_argument("--depth", default=5, type=int) - parser.add_argument("--resample", default=4, type=int) - parser.add_argument("--hidden", default=48, type=int) - parser.add_argument("--sample_rate", default=16000, type=float) - parser.add_argument("--device", default="cpu") - parser.add_argument("-t", "--num_threads", type=int) - parser.add_argument("-f", "--num_frames", type=int, default=1) - args = parser.parse_args() - if args.num_threads: - th.set_num_threads(args.num_threads) - sr = args.sample_rate - sr_ms = sr / 1000 - demucs = Demucs( - depth=args.depth, hidden=args.hidden, resample=args.resample - ).to(args.device) - x = th.randn(1, int(sr * 4)).to(args.device) - out = demucs(x[None])[0] - streamer = DemucsStreamer(demucs, num_frames=args.num_frames) - out_rt = [] - frame_size = streamer.total_length - with th.no_grad(): - while x.shape[1] > 0: - out_rt.append(streamer.feed(x[:, :frame_size])) - x = x[:, frame_size:] - frame_size = streamer.demucs.total_stride - out_rt.append(streamer.flush()) - out_rt = th.cat(out_rt, 1) - model_size = sum(p.numel() for p in demucs.parameters()) * 4 / 2**20 - initial_lag = streamer.total_length / sr_ms - tpf = 1000 * streamer.time_per_frame - print(f"model size: {model_size:.1f}MB, ", end='') - print(f"delta batch/streaming: {th.norm(out - out_rt) / th.norm(out):.2%}") - print(f"initial lag: {initial_lag:.1f}ms, ", end='') - print(f"stride: {streamer.stride * args.num_frames / sr_ms:.1f}ms") - print(f"time per frame: {tpf:.1f}ms, ", end='') - rtf = (1000 * streamer.time_per_frame) / (streamer.stride / sr_ms) - print(f"RTF: {rtf:.2f}") - print(f"Total lag with computation: {initial_lag + tpf:.1f}ms") - - -if __name__ == "__main__": - test() diff --git a/examples/speech_synthesis/preprocessing/denoiser/pretrained.py b/examples/speech_synthesis/preprocessing/denoiser/pretrained.py deleted file mode 100644 index 2fa846075b..0000000000 --- a/examples/speech_synthesis/preprocessing/denoiser/pretrained.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
-# author: adefossez - -import logging - -import torch.hub - -from .demucs import Demucs -from .utils import deserialize_model - -logger = logging.getLogger(__name__) -ROOT = "https://dl.fbaipublicfiles.com/adiyoss/denoiser/" -DNS_48_URL = ROOT + "dns48-11decc9d8e3f0998.th" -DNS_64_URL = ROOT + "dns64-a7761ff99a7d5bb6.th" -MASTER_64_URL = ROOT + "master64-8a5dfb4bb92753dd.th" - - -def _demucs(pretrained, url, **kwargs): - model = Demucs(**kwargs) - if pretrained: - state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu') - model.load_state_dict(state_dict) - return model - - -def dns48(pretrained=True): - return _demucs(pretrained, DNS_48_URL, hidden=48) - - -def dns64(pretrained=True): - return _demucs(pretrained, DNS_64_URL, hidden=64) - - -def master64(pretrained=True): - return _demucs(pretrained, MASTER_64_URL, hidden=64) - - -def add_model_flags(parser): - group = parser.add_mutually_exclusive_group(required=False) - group.add_argument( - "-m", "--model_path", help="Path to local trained model." - ) - group.add_argument( - "--dns48", action="store_true", - help="Use pre-trained real time H=48 model trained on DNS." - ) - group.add_argument( - "--dns64", action="store_true", - help="Use pre-trained real time H=64 model trained on DNS." - ) - group.add_argument( - "--master64", action="store_true", - help="Use pre-trained real time H=64 model trained on DNS and Valentini." - ) - - -def get_model(args): - """ - Load local model package or torchhub pre-trained model. - """ - if args.model_path: - logger.info("Loading model from %s", args.model_path) - pkg = torch.load(args.model_path) - model = deserialize_model(pkg) - elif args.dns64: - logger.info("Loading pre-trained real time H=64 model trained on DNS.") - model = dns64() - elif args.master64: - logger.info( - "Loading pre-trained real time H=64 model trained on DNS and Valentini." - ) - model = master64() - else: - logger.info("Loading pre-trained real time H=48 model trained on DNS.") - model = dns48() - logger.debug(model) - return model diff --git a/examples/speech_synthesis/preprocessing/denoiser/resample.py b/examples/speech_synthesis/preprocessing/denoiser/resample.py deleted file mode 100644 index 1222addc42..0000000000 --- a/examples/speech_synthesis/preprocessing/denoiser/resample.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# author: adefossez - -import math - -import torch as th -from torch.nn import functional as F - - -def sinc(t): - """sinc. - - :param t: the input tensor - """ - return th.where(t == 0, th.tensor(1., device=t.device, dtype=t.dtype), - th.sin(t) / t) - - -def kernel_upsample2(zeros=56): - """kernel_upsample2. - - """ - win = th.hann_window(4 * zeros + 1, periodic=False) - winodd = win[1::2] - t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros) - t *= math.pi - kernel = (sinc(t) * winodd).view(1, 1, -1) - return kernel - - -def upsample2(x, zeros=56): - """ - Upsampling the input by 2 using sinc interpolation. - Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method." - ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing. - Vol. 9. IEEE, 1984. 
- """ - *other, time = x.shape - kernel = kernel_upsample2(zeros).to(x) - out = F.conv1d(x.view(-1, 1, time), kernel, padding=zeros)[..., 1:].view( - *other, time - ) - y = th.stack([x, out], dim=-1) - return y.view(*other, -1) - - -def kernel_downsample2(zeros=56): - """kernel_downsample2. - - """ - win = th.hann_window(4 * zeros + 1, periodic=False) - winodd = win[1::2] - t = th.linspace(-zeros + 0.5, zeros - 0.5, 2 * zeros) - t.mul_(math.pi) - kernel = (sinc(t) * winodd).view(1, 1, -1) - return kernel - - -def downsample2(x, zeros=56): - """ - Downsampling the input by 2 using sinc interpolation. - Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method." - ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing. - Vol. 9. IEEE, 1984. - """ - if x.shape[-1] % 2 != 0: - x = F.pad(x, (0, 1)) - xeven = x[..., ::2] - xodd = x[..., 1::2] - *other, time = xodd.shape - kernel = kernel_downsample2(zeros).to(x) - out = xeven + F.conv1d( - xodd.view(-1, 1, time), kernel, padding=zeros - )[..., :-1].view(*other, time) - return out.view(*other, -1).mul(0.5) diff --git a/examples/speech_synthesis/preprocessing/denoiser/utils.py b/examples/speech_synthesis/preprocessing/denoiser/utils.py deleted file mode 100644 index 734d047f1b..0000000000 --- a/examples/speech_synthesis/preprocessing/denoiser/utils.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. -# -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. -# author: adefossez - -import functools -import logging -from contextlib import contextmanager -import inspect -import time - -logger = logging.getLogger(__name__) - -EPS = 1e-8 - - -def capture_init(init): - """capture_init. - - Decorate `__init__` with this, and you can then - recover the *args and **kwargs passed to it in `self._init_args_kwargs` - """ - @functools.wraps(init) - def __init__(self, *args, **kwargs): - self._init_args_kwargs = (args, kwargs) - init(self, *args, **kwargs) - - return __init__ - - -def deserialize_model(package, strict=False): - """deserialize_model. - - """ - klass = package['class'] - if strict: - model = klass(*package['args'], **package['kwargs']) - else: - sig = inspect.signature(klass) - kw = package['kwargs'] - for key in list(kw): - if key not in sig.parameters: - logger.warning("Dropping inexistant parameter %s", key) - del kw[key] - model = klass(*package['args'], **kw) - model.load_state_dict(package['state']) - return model - - -def copy_state(state): - return {k: v.cpu().clone() for k, v in state.items()} - - -def serialize_model(model): - args, kwargs = model._init_args_kwargs - state = copy_state(model.state_dict()) - return {"class": model.__class__, "args": args, "kwargs": kwargs, "state": state} - - -@contextmanager -def swap_state(model, state): - """ - Context manager that swaps the state of a model, e.g: - - # model is in old state - with swap_state(model, new_state): - # model in new state - # model back to old state - """ - old_state = copy_state(model.state_dict()) - model.load_state_dict(state) - try: - yield - finally: - model.load_state_dict(old_state) - - -def pull_metric(history, name): - out = [] - for metrics in history: - if name in metrics: - out.append(metrics[name]) - return out - - -class LogProgress: - """ - Sort of like tqdm but using log lines and not as real time. 
- Args: - - logger: logger obtained from `logging.getLogger`, - - iterable: iterable object to wrap - - updates (int): number of lines that will be printed, e.g. - if `updates=5`, log every 1/5th of the total length. - - total (int): length of the iterable, in case it does not support - `len`. - - name (str): prefix to use in the log. - - level: logging level (like `logging.INFO`). - """ - def __init__(self, - logger, - iterable, - updates=5, - total=None, - name="LogProgress", - level=logging.INFO): - self.iterable = iterable - self.total = total or len(iterable) - self.updates = updates - self.name = name - self.logger = logger - self.level = level - - def update(self, **infos): - self._infos = infos - - def __iter__(self): - self._iterator = iter(self.iterable) - self._index = -1 - self._infos = {} - self._begin = time.time() - return self - - def __next__(self): - self._index += 1 - try: - value = next(self._iterator) - except StopIteration: - raise - else: - return value - finally: - log_every = max(1, self.total // self.updates) - # logging is delayed by 1 it, in order to have the metrics from update - if self._index >= 1 and self._index % log_every == 0: - self._log() - - def _log(self): - self._speed = (1 + self._index) / (time.time() - self._begin) - infos = " | ".join(f"{k.capitalize()} {v}" for k, v in self._infos.items()) - if self._speed < 1e-4: - speed = "oo sec/it" - elif self._speed < 0.1: - speed = f"{1/self._speed:.1f} sec/it" - else: - speed = f"{self._speed:.1f} it/sec" - out = f"{self.name} | {self._index}/{self.total} | {speed}" - if infos: - out += " | " + infos - self.logger.log(self.level, out) - - -def colorize(text, color): - """ - Display text with some ANSI color in the terminal. - """ - code = f"\033[{color}m" - restore = "\033[0m" - return "".join([code, text, restore]) - - -def bold(text): - """ - Display text in bold in the terminal. - """ - return colorize(text, "1") - - -def cal_snr(lbl, est): - import torch - y = 10.0 * torch.log10( - torch.sum(lbl**2, dim=-1) / (torch.sum((est-lbl)**2, dim=-1) + EPS) + - EPS - ) - return y diff --git a/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py b/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py deleted file mode 100644 index a302546043..0000000000 --- a/examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
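For reference, the `cal_snr` helper removed from `denoiser/utils.py` above reduces to the usual signal-to-noise ratio in dB, 10 * log10(signal power / residual power), with a small epsilon for numerical stability. A minimal standalone sketch of that computation, assuming only `torch` is available (the toy sine and noise level are illustrative):

```
import math

import torch

EPS = 1e-8  # same stability constant as the removed denoiser utilities


def snr_db(reference: torch.Tensor, estimate: torch.Tensor) -> torch.Tensor:
    """10 * log10(signal power / residual power), reduced over the last dim."""
    signal_power = torch.sum(reference ** 2, dim=-1)
    residual_power = torch.sum((estimate - reference) ** 2, dim=-1) + EPS
    return 10.0 * torch.log10(signal_power / residual_power + EPS)


# Toy check: a clean 440 Hz sine plus a small amount of noise.
t = torch.linspace(0, 1, 16_000)
clean = torch.sin(2 * math.pi * 440 * t)
noisy = clean + 0.01 * torch.randn_like(clean)
print(f"SNR of the noisy copy: {snr_db(clean, noisy).item():.1f} dB")
```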
- -import argparse -import logging -from pathlib import Path -from collections import defaultdict -from typing import List, Dict, Tuple - -import pandas as pd -import numpy as np -import torchaudio -from tqdm import tqdm - -from examples.speech_to_text.data_utils import load_df_from_tsv, save_df_to_tsv - - -log = logging.getLogger(__name__) - -SPLITS = ["train", "dev", "test"] - - -def get_top_n( - root: Path, n_speakers: int = 10, min_n_tokens: int = 5 -) -> pd.DataFrame: - df = load_df_from_tsv(root / "validated.tsv") - df["n_tokens"] = [len(s.split()) for s in df["sentence"]] - df = df[df["n_tokens"] >= min_n_tokens] - df["n_frames"] = [ - torchaudio.info((root / "clips" / p).as_posix()).num_frames - for p in tqdm(df["path"]) - ] - df["id"] = [Path(p).stem for p in df["path"]] - total_duration_ms = df.groupby("client_id")["n_frames"].agg(["sum"]) - total_duration_ms = total_duration_ms.sort_values("sum", ascending=False) - - top_n_total_duration_ms = total_duration_ms.head(n_speakers) - top_n_client_ids = set(top_n_total_duration_ms.index.tolist()) - df_top_n = df[df["client_id"].isin(top_n_client_ids)] - return df_top_n - - -def get_splits( - df, train_split_ratio=0.99, speaker_in_all_splits=False, rand_seed=0 -) -> Tuple[Dict[str, str], List[str]]: - np.random.seed(rand_seed) - dev_split_ratio = (1. - train_split_ratio) / 3 - grouped = list(df.groupby("client_id")) - id_to_split = {} - for _, cur_df in tqdm(grouped): - cur_n_examples = len(cur_df) - if speaker_in_all_splits and cur_n_examples < 3: - continue - cur_n_train = int(cur_n_examples * train_split_ratio) - cur_n_dev = int(cur_n_examples * dev_split_ratio) - cur_n_test = cur_n_examples - cur_n_dev - cur_n_train - if speaker_in_all_splits and cur_n_dev * cur_n_test == 0: - cur_n_dev, cur_n_test = 1, 1 - cur_n_train = cur_n_examples - cur_n_dev - cur_n_test - cur_indices = cur_df.index.tolist() - cur_shuffled_indices = np.random.permutation(cur_n_examples) - cur_shuffled_indices = [cur_indices[i] for i in cur_shuffled_indices] - cur_indices_by_split = { - "train": cur_shuffled_indices[:cur_n_train], - "dev": cur_shuffled_indices[cur_n_train: cur_n_train + cur_n_dev], - "test": cur_shuffled_indices[cur_n_train + cur_n_dev:] - } - for split in SPLITS: - for i in cur_indices_by_split[split]: - id_ = df["id"].loc[i] - id_to_split[id_] = split - return id_to_split, sorted(df["client_id"].unique()) - - -def convert_to_wav(root: Path, filenames: List[str], target_sr=16_000): - out_root = root / "wav" - out_root.mkdir(exist_ok=True, parents=True) - print("Converting to WAV...") - for n in tqdm(filenames): - in_path = (root / "clips" / n).as_posix() - waveform, sr = torchaudio.load(in_path) - converted, converted_sr = torchaudio.sox_effects.apply_effects_tensor( - waveform, sr, [["rate", str(target_sr)], ["channels", "1"]] - ) - out_path = (out_root / Path(n).with_suffix(".wav").name).as_posix() - torchaudio.save(out_path, converted, converted_sr, encoding="PCM_S", - bits_per_sample=16) - - -def process(args): - data_root = Path(args.data_root).absolute() / args.lang - - # Generate TSV manifest - print("Generating manifest...") - - df_top_n = get_top_n(data_root) - id_to_split, speakers = get_splits(df_top_n) - - if args.convert_to_wav: - convert_to_wav(data_root, df_top_n["path"].tolist()) - - manifest_by_split = {split: defaultdict(list) for split in SPLITS} - for sample in tqdm(df_top_n.to_dict(orient="index").values()): - sample_id = sample["id"] - split = id_to_split[sample_id] - manifest_by_split[split]["id"].append(sample_id) - if 
args.convert_to_wav: - audio_path = data_root / "wav" / f"{sample_id}.wav" - else: - audio_path = data_root / "clips" / f"{sample_id}.mp3" - manifest_by_split[split]["audio"].append(audio_path.as_posix()) - manifest_by_split[split]["n_frames"].append(sample["n_frames"]) - manifest_by_split[split]["tgt_text"].append(sample["sentence"]) - manifest_by_split[split]["speaker"].append(sample["client_id"]) - manifest_by_split[split]["src_text"].append(sample["sentence"]) - - output_root = Path(args.output_manifest_root).absolute() - output_root.mkdir(parents=True, exist_ok=True) - for split in SPLITS: - save_df_to_tsv( - pd.DataFrame.from_dict(manifest_by_split[split]), - output_root / f"{split}.audio.tsv" - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--data-root", "-d", required=True, type=str) - parser.add_argument("--output-manifest-root", "-m", required=True, type=str) - parser.add_argument("--lang", "-l", required=True, type=str) - parser.add_argument("--convert-to-wav", action="store_true") - args = parser.parse_args() - - process(args) - - -if __name__ == "__main__": - main() diff --git a/examples/speech_synthesis/preprocessing/get_feature_manifest.py b/examples/speech_synthesis/preprocessing/get_feature_manifest.py deleted file mode 100644 index 4a1e119b32..0000000000 --- a/examples/speech_synthesis/preprocessing/get_feature_manifest.py +++ /dev/null @@ -1,262 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import logging -from pathlib import Path -import shutil -from tempfile import NamedTemporaryFile -from collections import Counter, defaultdict - -import pandas as pd -import torchaudio -from tqdm import tqdm - -from fairseq.data.audio.audio_utils import convert_waveform -from examples.speech_to_text.data_utils import ( - create_zip, - gen_config_yaml, - gen_vocab, - get_zip_manifest, - load_tsv_to_dicts, - save_df_to_tsv -) -from examples.speech_synthesis.data_utils import ( - extract_logmel_spectrogram, extract_pitch, extract_energy, get_global_cmvn, - ipa_phonemize, get_mfa_alignment, get_unit_alignment, - get_feature_value_min_max -) - - -log = logging.getLogger(__name__) - - -def process(args): - assert "train" in args.splits - out_root = Path(args.output_root).absolute() - out_root.mkdir(exist_ok=True) - - print("Fetching data...") - audio_manifest_root = Path(args.audio_manifest_root).absolute() - samples = [] - for s in args.splits: - for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"): - e["split"] = s - samples.append(e) - sample_ids = [s["id"] for s in samples] - - # Get alignment info - id_to_alignment = None - if args.textgrid_zip is not None: - assert args.id_to_units_tsv is None - id_to_alignment = get_mfa_alignment( - args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length - ) - elif args.id_to_units_tsv is not None: - # assume identical hop length on the unit sequence - id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids) - - # Extract features and pack features into ZIP - feature_name = "logmelspec80" - zip_path = out_root / f"{feature_name}.zip" - pitch_zip_path = out_root / "pitch.zip" - energy_zip_path = out_root / "energy.zip" - gcmvn_npz_path = out_root / "gcmvn_stats.npz" - if zip_path.exists() and gcmvn_npz_path.exists(): - print(f"{zip_path} and {gcmvn_npz_path} exist.") - else: - feature_root = out_root / feature_name - 
feature_root.mkdir(exist_ok=True) - pitch_root = out_root / "pitch" - energy_root = out_root / "energy" - if args.add_fastspeech_targets: - pitch_root.mkdir(exist_ok=True) - energy_root.mkdir(exist_ok=True) - print("Extracting Mel spectrogram features...") - for sample in tqdm(samples): - waveform, sample_rate = torchaudio.load(sample["audio"]) - waveform, sample_rate = convert_waveform( - waveform, sample_rate, normalize_volume=args.normalize_volume, - to_sample_rate=args.sample_rate - ) - sample_id = sample["id"] - target_length = None - if id_to_alignment is not None: - a = id_to_alignment[sample_id] - target_length = sum(a.frame_durations) - if a.start_sec is not None and a.end_sec is not None: - start_frame = int(a.start_sec * sample_rate) - end_frame = int(a.end_sec * sample_rate) - waveform = waveform[:, start_frame: end_frame] - extract_logmel_spectrogram( - waveform, sample_rate, feature_root / f"{sample_id}.npy", - win_length=args.win_length, hop_length=args.hop_length, - n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min, - f_max=args.f_max, target_length=target_length - ) - if args.add_fastspeech_targets: - assert id_to_alignment is not None - extract_pitch( - waveform, sample_rate, pitch_root / f"{sample_id}.npy", - hop_length=args.hop_length, log_scale=True, - phoneme_durations=id_to_alignment[sample_id].frame_durations - ) - extract_energy( - waveform, energy_root / f"{sample_id}.npy", - hop_length=args.hop_length, n_fft=args.n_fft, - log_scale=True, - phoneme_durations=id_to_alignment[sample_id].frame_durations - ) - print("ZIPing features...") - create_zip(feature_root, zip_path) - get_global_cmvn(feature_root, gcmvn_npz_path) - shutil.rmtree(feature_root) - if args.add_fastspeech_targets: - create_zip(pitch_root, pitch_zip_path) - shutil.rmtree(pitch_root) - create_zip(energy_root, energy_zip_path) - shutil.rmtree(energy_root) - - print("Fetching ZIP manifest...") - audio_paths, audio_lengths = get_zip_manifest(zip_path) - pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4 - if args.add_fastspeech_targets: - pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path) - energy_paths, energy_lengths = get_zip_manifest(energy_zip_path) - # Generate TSV manifest - print("Generating manifest...") - id_to_cer = None - if args.cer_threshold is not None: - assert Path(args.cer_tsv_path).is_file() - id_to_cer = { - x["id"]: x["uer"] for x in load_tsv_to_dicts(args.cer_tsv_path) - } - manifest_by_split = {split: defaultdict(list) for split in args.splits} - for sample in tqdm(samples): - sample_id, split = sample["id"], sample["split"] - - if args.snr_threshold is not None and "snr" in sample \ - and sample["snr"] < args.snr_threshold: - continue - if args.cer_threshold is not None \ - and id_to_cer[sample_id] > args.cer_threhold: - continue - - normalized_utt = sample["tgt_text"] - if id_to_alignment is not None: - normalized_utt = " ".join(id_to_alignment[sample_id].tokens) - elif args.ipa_vocab: - normalized_utt = ipa_phonemize( - normalized_utt, lang=args.lang, use_g2p=args.use_g2p - ) - manifest_by_split[split]["id"].append(sample_id) - manifest_by_split[split]["audio"].append(audio_paths[sample_id]) - manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id]) - manifest_by_split[split]["tgt_text"].append(normalized_utt) - manifest_by_split[split]["speaker"].append(sample["speaker"]) - manifest_by_split[split]["src_text"].append(sample["src_text"]) - if args.add_fastspeech_targets: - assert id_to_alignment is not None - duration = " ".join( 
- str(d) for d in id_to_alignment[sample_id].frame_durations - ) - manifest_by_split[split]["duration"].append(duration) - manifest_by_split[split]["pitch"].append(pitch_paths[sample_id]) - manifest_by_split[split]["energy"].append(energy_paths[sample_id]) - for split in args.splits: - save_df_to_tsv( - pd.DataFrame.from_dict(manifest_by_split[split]), - out_root / f"{split}.tsv" - ) - # Generate vocab - vocab_name, spm_filename = None, None - if id_to_alignment is not None or args.ipa_vocab: - vocab = Counter() - for t in manifest_by_split["train"]["tgt_text"]: - vocab.update(t.split(" ")) - vocab_name = "vocab.txt" - with open(out_root / vocab_name, "w") as f: - for s, c in vocab.most_common(): - f.write(f"{s} {c}\n") - else: - spm_filename_prefix = "spm_char" - spm_filename = f"{spm_filename_prefix}.model" - with NamedTemporaryFile(mode="w") as f: - for t in manifest_by_split["train"]["tgt_text"]: - f.write(t + "\n") - f.flush() # needed to ensure gen_vocab sees dumped text - gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char") - # Generate speaker list - speakers = sorted({sample["speaker"] for sample in samples}) - speakers_path = out_root / "speakers.txt" - with open(speakers_path, "w") as f: - for speaker in speakers: - f.write(f"{speaker}\n") - # Generate config YAML - win_len_t = args.win_length / args.sample_rate - hop_len_t = args.hop_length / args.sample_rate - extra = { - "sample_rate": args.sample_rate, - "features": { - "type": "spectrogram+melscale+log", - "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft, - "window_fn": "hann", "win_length": args.win_length, - "hop_length": args.hop_length, "sample_rate": args.sample_rate, - "win_len_t": win_len_t, "hop_len_t": hop_len_t, - "f_min": args.f_min, "f_max": args.f_max, - "n_stft": args.n_fft // 2 + 1 - } - } - if len(speakers) > 1: - extra["speaker_set_filename"] = "speakers.txt" - if args.add_fastspeech_targets: - pitch_min, pitch_max = get_feature_value_min_max( - [(out_root / n).as_posix() for n in pitch_paths.values()] - ) - energy_min, energy_max = get_feature_value_min_max( - [(out_root / n).as_posix() for n in energy_paths.values()] - ) - extra["features"]["pitch_min"] = pitch_min - extra["features"]["pitch_max"] = pitch_max - extra["features"]["energy_min"] = energy_min - extra["features"]["energy_max"] = energy_max - gen_config_yaml( - out_root, spm_filename=spm_filename, vocab_name=vocab_name, - audio_root=out_root.as_posix(), input_channels=None, - input_feat_per_channel=None, specaugment_policy=None, - cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--audio-manifest-root", "-m", required=True, type=str) - parser.add_argument("--output-root", "-o", required=True, type=str) - parser.add_argument("--splits", "-s", type=str, nargs="+", - default=["train", "dev", "test"]) - parser.add_argument("--ipa-vocab", action="store_true") - parser.add_argument("--use-g2p", action="store_true") - parser.add_argument("--lang", type=str, default="en-us") - parser.add_argument("--win-length", type=int, default=1024) - parser.add_argument("--hop-length", type=int, default=256) - parser.add_argument("--n-fft", type=int, default=1024) - parser.add_argument("--n-mels", type=int, default=80) - parser.add_argument("--f-min", type=int, default=20) - parser.add_argument("--f-max", type=int, default=8000) - parser.add_argument("--sample-rate", type=int, default=22050) - parser.add_argument("--normalize-volume", "-n", 
action="store_true") - parser.add_argument("--textgrid-zip", type=str, default=None) - parser.add_argument("--id-to-units-tsv", type=str, default=None) - parser.add_argument("--add-fastspeech-targets", action="store_true") - parser.add_argument("--snr-threshold", type=float, default=None) - parser.add_argument("--cer-threshold", type=float, default=None) - parser.add_argument("--cer-tsv-path", type=str, default="") - args = parser.parse_args() - - process(args) - - -if __name__ == "__main__": - main() diff --git a/examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py b/examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py deleted file mode 100644 index 7ec1fb7521..0000000000 --- a/examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import argparse -import logging -from pathlib import Path -from collections import defaultdict - -import pandas as pd -from torchaudio.datasets import LJSPEECH -from tqdm import tqdm - -from examples.speech_to_text.data_utils import save_df_to_tsv - - -log = logging.getLogger(__name__) - -SPLITS = ["train", "dev", "test"] - - -def process(args): - out_root = Path(args.output_data_root).absolute() - out_root.mkdir(parents=True, exist_ok=True) - - # Generate TSV manifest - print("Generating manifest...") - # following FastSpeech's splits - dataset = LJSPEECH(out_root.as_posix(), download=True) - id_to_split = {} - for x in dataset._flist: - id_ = x[0] - speaker = id_.split("-")[0] - id_to_split[id_] = { - "LJ001": "test", "LJ002": "test", "LJ003": "dev" - }.get(speaker, "train") - manifest_by_split = {split: defaultdict(list) for split in SPLITS} - progress = tqdm(enumerate(dataset), total=len(dataset)) - for i, (waveform, _, utt, normalized_utt) in progress: - sample_id = dataset._flist[i][0] - split = id_to_split[sample_id] - manifest_by_split[split]["id"].append(sample_id) - audio_path = f"{dataset._path}/{sample_id}.wav" - manifest_by_split[split]["audio"].append(audio_path) - manifest_by_split[split]["n_frames"].append(len(waveform[0])) - manifest_by_split[split]["tgt_text"].append(normalized_utt) - manifest_by_split[split]["speaker"].append("ljspeech") - manifest_by_split[split]["src_text"].append(utt) - - manifest_root = Path(args.output_manifest_root).absolute() - manifest_root.mkdir(parents=True, exist_ok=True) - for split in SPLITS: - save_df_to_tsv( - pd.DataFrame.from_dict(manifest_by_split[split]), - manifest_root / f"{split}.audio.tsv" - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--output-data-root", "-d", required=True, type=str) - parser.add_argument("--output-manifest-root", "-m", required=True, type=str) - args = parser.parse_args() - - process(args) - - -if __name__ == "__main__": - main() diff --git a/examples/speech_synthesis/preprocessing/get_speaker_embedding.py b/examples/speech_synthesis/preprocessing/get_speaker_embedding.py deleted file mode 100644 index 0e3e4c5cd7..0000000000 --- a/examples/speech_synthesis/preprocessing/get_speaker_embedding.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- - -import argparse -from collections import defaultdict -from itertools import chain -from pathlib import Path - -import numpy as np -import torchaudio -import torchaudio.sox_effects as ta_sox -import yaml -from tqdm import tqdm - -from examples.speech_to_text.data_utils import load_tsv_to_dicts -from examples.speech_synthesis.preprocessing.speaker_embedder import SpkrEmbedder - - -def extract_embedding(audio_path, embedder): - wav, sr = torchaudio.load(audio_path) # 2D - if sr != embedder.RATE: - wav, sr = ta_sox.apply_effects_tensor( - wav, sr, [["rate", str(embedder.RATE)]] - ) - try: - emb = embedder([wav[0].cuda().float()]).cpu().numpy() - except RuntimeError: - emb = None - return emb - - -def process(args): - print("Fetching data...") - raw_manifest_root = Path(args.raw_manifest_root).absolute() - samples = [load_tsv_to_dicts(raw_manifest_root / (s + ".tsv")) - for s in args.splits] - samples = list(chain(*samples)) - with open(args.config, "r") as f: - config = yaml.load(f, Loader=yaml.FullLoader) - with open(f"{config['audio_root']}/{config['speaker_set_filename']}") as f: - speaker_to_id = {r.strip(): i for i, r in enumerate(f)} - - embedder = SpkrEmbedder(args.ckpt).cuda() - speaker_to_cnt = defaultdict(float) - speaker_to_emb = defaultdict(float) - for sample in tqdm(samples, desc="extract emb"): - emb = extract_embedding(sample["audio"], embedder) - if emb is not None: - speaker_to_cnt[sample["speaker"]] += 1 - speaker_to_emb[sample["speaker"]] += emb - if len(speaker_to_emb) != len(speaker_to_id): - missed = set(speaker_to_id) - set(speaker_to_emb.keys()) - print( - f"WARNING: missing embeddings for {len(missed)} speaker:\n{missed}" - ) - speaker_emb_mat = np.zeros((len(speaker_to_id), len(emb)), float) - for speaker in speaker_to_emb: - idx = speaker_to_id[speaker] - emb = speaker_to_emb[speaker] - cnt = speaker_to_cnt[speaker] - speaker_emb_mat[idx, :] = emb / cnt - speaker_emb_name = "speaker_emb.npy" - speaker_emb_path = f"{config['audio_root']}/{speaker_emb_name}" - np.save(speaker_emb_path, speaker_emb_mat) - config["speaker_emb_filename"] = speaker_emb_name - - with open(args.new_config, "w") as f: - yaml.dump(config, f) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--raw-manifest-root", "-m", required=True, type=str) - parser.add_argument("--splits", "-s", type=str, nargs="+", - default=["train"]) - parser.add_argument("--config", "-c", required=True, type=str) - parser.add_argument("--new-config", "-n", required=True, type=str) - parser.add_argument("--ckpt", required=True, type=str, - help="speaker embedder checkpoint") - args = parser.parse_args() - - process(args) - - -if __name__ == "__main__": - main() diff --git a/examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py b/examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py deleted file mode 100644 index 7afa40fcd1..0000000000 --- a/examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
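The `get_speaker_embedding.py` script removed above collapses one d-vector per utterance into a single averaged row per speaker, ordered by the speaker list from the data config. A small numpy-only sketch of that aggregation, with hypothetical toy embeddings standing in for the `SpkrEmbedder` outputs:

```
from collections import defaultdict

import numpy as np

# Hypothetical inputs: (speaker, utterance embedding) pairs and the speaker order.
utterance_embs = [
    ("spk_a", np.array([1.0, 0.0])),
    ("spk_a", np.array([0.0, 1.0])),
    ("spk_b", np.array([2.0, 2.0])),
]
speaker_to_id = {"spk_a": 0, "spk_b": 1}

emb_sum = defaultdict(float)   # accumulates embeddings per speaker
emb_cnt = defaultdict(float)   # counts utterances per speaker
for speaker, emb in utterance_embs:
    emb_sum[speaker] += emb
    emb_cnt[speaker] += 1

emb_dim = len(next(iter(emb_sum.values())))
speaker_emb_mat = np.zeros((len(speaker_to_id), emb_dim), float)
for speaker, total in emb_sum.items():
    speaker_emb_mat[speaker_to_id[speaker]] = total / emb_cnt[speaker]

print(speaker_emb_mat)  # row 0 -> mean embedding of spk_a, row 1 -> spk_b
```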
- -import argparse -import logging -import numpy as np -import re -from pathlib import Path -from collections import defaultdict - -import pandas as pd -from torchaudio.datasets import VCTK -from tqdm import tqdm - -from examples.speech_to_text.data_utils import save_df_to_tsv - - -log = logging.getLogger(__name__) - -SPLITS = ["train", "dev", "test"] - - -def normalize_text(text): - return re.sub(r"[^a-zA-Z.?!,'\- ]", '', text) - - -def process(args): - out_root = Path(args.output_data_root).absolute() - out_root.mkdir(parents=True, exist_ok=True) - - # Generate TSV manifest - print("Generating manifest...") - dataset = VCTK(out_root.as_posix(), download=False) - ids = list(dataset._walker) - np.random.seed(args.seed) - np.random.shuffle(ids) - n_train = len(ids) - args.n_dev - args.n_test - _split = ["train"] * n_train + ["dev"] * args.n_dev + ["test"] * args.n_test - id_to_split = dict(zip(ids, _split)) - manifest_by_split = {split: defaultdict(list) for split in SPLITS} - progress = tqdm(enumerate(dataset), total=len(dataset)) - for i, (waveform, _, text, speaker_id, _) in progress: - sample_id = dataset._walker[i] - _split = id_to_split[sample_id] - audio_dir = Path(dataset._path) / dataset._folder_audio / speaker_id - audio_path = audio_dir / f"{sample_id}.wav" - text = normalize_text(text) - manifest_by_split[_split]["id"].append(sample_id) - manifest_by_split[_split]["audio"].append(audio_path.as_posix()) - manifest_by_split[_split]["n_frames"].append(len(waveform[0])) - manifest_by_split[_split]["tgt_text"].append(text) - manifest_by_split[_split]["speaker"].append(speaker_id) - manifest_by_split[_split]["src_text"].append(text) - - manifest_root = Path(args.output_manifest_root).absolute() - manifest_root.mkdir(parents=True, exist_ok=True) - for _split in SPLITS: - save_df_to_tsv( - pd.DataFrame.from_dict(manifest_by_split[_split]), - manifest_root / f"{_split}.audio.tsv" - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--output-data-root", "-d", required=True, type=str) - parser.add_argument("--output-manifest-root", "-m", required=True, type=str) - parser.add_argument("--n-dev", default=50, type=int) - parser.add_argument("--n-test", default=100, type=int) - parser.add_argument("--seed", "-s", default=1234, type=int) - args = parser.parse_args() - - process(args) - - -if __name__ == "__main__": - main() diff --git a/examples/speech_synthesis/preprocessing/speaker_embedder/__init__.py b/examples/speech_synthesis/preprocessing/speaker_embedder/__init__.py deleted file mode 100644 index 3b178676ba..0000000000 --- a/examples/speech_synthesis/preprocessing/speaker_embedder/__init__.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
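The `get_vctk_audio_manifest.py` script removed above normalizes transcripts by dropping every character outside a small whitelist (letters, `.?!,'`, hyphens and spaces). A quick illustration of that regex on a made-up transcript:

```
import re


def normalize_text(text: str) -> str:
    # Keep only letters, ".?!,'", hyphens and spaces; drop everything else.
    return re.sub(r"[^a-zA-Z.?!,'\- ]", '', text)


print(normalize_text('He said: "it costs 5 dollars, right?"'))
# -> He said it costs  dollars, right?
```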
- - -import librosa -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.utils.data -import torchaudio - - -EMBEDDER_PARAMS = { - 'num_mels': 40, - 'n_fft': 512, - 'emb_dim': 256, - 'lstm_hidden': 768, - 'lstm_layers': 3, - 'window': 80, - 'stride': 40, -} - - -def set_requires_grad(nets, requires_grad=False): - """Set requies_grad=Fasle for all the networks to avoid unnecessary - computations - Parameters: - nets (network list) -- a list of networks - requires_grad (bool) -- whether the networks require gradients or not - """ - if not isinstance(nets, list): - nets = [nets] - for net in nets: - if net is not None: - for param in net.parameters(): - param.requires_grad = requires_grad - - -class LinearNorm(nn.Module): - def __init__(self, hp): - super(LinearNorm, self).__init__() - self.linear_layer = nn.Linear(hp["lstm_hidden"], hp["emb_dim"]) - - def forward(self, x): - return self.linear_layer(x) - - -class SpeechEmbedder(nn.Module): - def __init__(self, hp): - super(SpeechEmbedder, self).__init__() - self.lstm = nn.LSTM(hp["num_mels"], - hp["lstm_hidden"], - num_layers=hp["lstm_layers"], - batch_first=True) - self.proj = LinearNorm(hp) - self.hp = hp - - def forward(self, mel): - # (num_mels, T) -> (num_mels, T', window) - mels = mel.unfold(1, self.hp["window"], self.hp["stride"]) - mels = mels.permute(1, 2, 0) # (T', window, num_mels) - x, _ = self.lstm(mels) # (T', window, lstm_hidden) - x = x[:, -1, :] # (T', lstm_hidden), use last frame only - x = self.proj(x) # (T', emb_dim) - x = x / torch.norm(x, p=2, dim=1, keepdim=True) # (T', emb_dim) - - x = x.mean(dim=0) - if x.norm(p=2) != 0: - x = x / x.norm(p=2) - return x - - -class SpkrEmbedder(nn.Module): - RATE = 16000 - - def __init__( - self, - embedder_path, - embedder_params=EMBEDDER_PARAMS, - rate=16000, - hop_length=160, - win_length=400, - pad=False, - ): - super(SpkrEmbedder, self).__init__() - embedder_pt = torch.load(embedder_path, map_location="cpu") - self.embedder = SpeechEmbedder(embedder_params) - self.embedder.load_state_dict(embedder_pt) - self.embedder.eval() - set_requires_grad(self.embedder, requires_grad=False) - self.embedder_params = embedder_params - - self.register_buffer('mel_basis', torch.from_numpy( - librosa.filters.mel( - sr=self.RATE, - n_fft=self.embedder_params["n_fft"], - n_mels=self.embedder_params["num_mels"]) - ) - ) - - self.resample = None - if rate != self.RATE: - self.resample = torchaudio.transforms.Resample(rate, self.RATE) - self.hop_length = hop_length - self.win_length = win_length - self.pad = pad - - def get_mel(self, y): - if self.pad and y.shape[-1] < 14000: - y = F.pad(y, (0, 14000 - y.shape[-1])) - - window = torch.hann_window(self.win_length).to(y) - y = torch.stft(y, n_fft=self.embedder_params["n_fft"], - hop_length=self.hop_length, - win_length=self.win_length, - window=window) - magnitudes = torch.norm(y, dim=-1, p=2) ** 2 - mel = torch.log10(self.mel_basis @ magnitudes + 1e-6) - return mel - - def forward(self, inputs): - dvecs = [] - for wav in inputs: - mel = self.get_mel(wav) - if mel.dim() == 3: - mel = mel.squeeze(0) - dvecs += [self.embedder(mel)] - dvecs = torch.stack(dvecs) - - dvec = torch.mean(dvecs, dim=0) - dvec = dvec / torch.norm(dvec) - - return dvec diff --git a/examples/speech_synthesis/preprocessing/vad/__init__.py b/examples/speech_synthesis/preprocessing/vad/__init__.py deleted file mode 100644 index 9cf121081f..0000000000 --- a/examples/speech_synthesis/preprocessing/vad/__init__.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright 
(c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - - -import collections -import contextlib -import wave - -try: - import webrtcvad -except ImportError: - raise ImportError("Please install py-webrtcvad: pip install webrtcvad") -import argparse -import os -import logging -from tqdm import tqdm - -AUDIO_SUFFIX = '.wav' -FS_MS = 30 -SCALE = 6e-5 -THRESHOLD = 0.3 - - -def read_wave(path): - """Reads a .wav file. - Takes the path, and returns (PCM audio data, sample rate). - """ - with contextlib.closing(wave.open(path, 'rb')) as wf: - num_channels = wf.getnchannels() - assert num_channels == 1 - sample_width = wf.getsampwidth() - assert sample_width == 2 - sample_rate = wf.getframerate() - assert sample_rate in (8000, 16000, 32000, 48000) - pcm_data = wf.readframes(wf.getnframes()) - return pcm_data, sample_rate - - -def write_wave(path, audio, sample_rate): - """Writes a .wav file. - Takes path, PCM audio data, and sample rate. - """ - with contextlib.closing(wave.open(path, 'wb')) as wf: - wf.setnchannels(1) - wf.setsampwidth(2) - wf.setframerate(sample_rate) - wf.writeframes(audio) - - -class Frame(object): - """Represents a "frame" of audio data.""" - def __init__(self, bytes, timestamp, duration): - self.bytes = bytes - self.timestamp = timestamp - self.duration = duration - - -def frame_generator(frame_duration_ms, audio, sample_rate): - """Generates audio frames from PCM audio data. - Takes the desired frame duration in milliseconds, the PCM data, and - the sample rate. - Yields Frames of the requested duration. - """ - n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) - offset = 0 - timestamp = 0.0 - duration = (float(n) / sample_rate) / 2.0 - while offset + n < len(audio): - yield Frame(audio[offset:offset + n], timestamp, duration) - timestamp += duration - offset += n - - -def vad_collector(sample_rate, frame_duration_ms, - padding_duration_ms, vad, frames): - """Filters out non-voiced audio frames. - Given a webrtcvad.Vad and a source of audio frames, yields only - the voiced audio. - Uses a padded, sliding window algorithm over the audio frames. - When more than 90% of the frames in the window are voiced (as - reported by the VAD), the collector triggers and begins yielding - audio frames. Then the collector waits until 90% of the frames in - the window are unvoiced to detrigger. - The window is padded at the front and back to provide a small - amount of silence or the beginnings/endings of speech around the - voiced frames. - Arguments: - sample_rate - The audio sample rate, in Hz. - frame_duration_ms - The frame duration in milliseconds. - padding_duration_ms - The amount to pad the window, in milliseconds. - vad - An instance of webrtcvad.Vad. - frames - a source of audio frames (sequence or generator). - Returns: A generator that yields PCM audio data. - """ - num_padding_frames = int(padding_duration_ms / frame_duration_ms) - # We use a deque for our sliding window/ring buffer. - ring_buffer = collections.deque(maxlen=num_padding_frames) - # We have two states: TRIGGERED and NOTTRIGGERED. We start in the - # NOTTRIGGERED state. 
- triggered = False - - voiced_frames = [] - for frame in frames: - is_speech = vad.is_speech(frame.bytes, sample_rate) - - # sys.stdout.write('1' if is_speech else '0') - if not triggered: - ring_buffer.append((frame, is_speech)) - num_voiced = len([f for f, speech in ring_buffer if speech]) - # If we're NOTTRIGGERED and more than 90% of the frames in - # the ring buffer are voiced frames, then enter the - # TRIGGERED state. - if num_voiced > 0.9 * ring_buffer.maxlen: - triggered = True - # We want to yield all the audio we see from now until - # we are NOTTRIGGERED, but we have to start with the - # audio that's already in the ring buffer. - for f, _ in ring_buffer: - voiced_frames.append(f) - ring_buffer.clear() - else: - # We're in the TRIGGERED state, so collect the audio data - # and add it to the ring buffer. - voiced_frames.append(frame) - ring_buffer.append((frame, is_speech)) - num_unvoiced = len([f for f, speech in ring_buffer if not speech]) - # If more than 90% of the frames in the ring buffer are - # unvoiced, then enter NOTTRIGGERED and yield whatever - # audio we've collected. - if num_unvoiced > 0.9 * ring_buffer.maxlen: - triggered = False - yield [b''.join([f.bytes for f in voiced_frames]), - voiced_frames[0].timestamp, voiced_frames[-1].timestamp] - ring_buffer.clear() - voiced_frames = [] - # If we have any leftover voiced audio when we run out of input, - # yield it. - if voiced_frames: - yield [b''.join([f.bytes for f in voiced_frames]), - voiced_frames[0].timestamp, voiced_frames[-1].timestamp] - - -def main(args): - # create output folder - try: - cmd = f"mkdir -p {args.out_path}" - os.system(cmd) - except Exception: - logging.error("Can not create output folder") - exit(-1) - - # build vad object - vad = webrtcvad.Vad(int(args.agg)) - # iterating over wavs in dir - for file in tqdm(os.listdir(args.in_path)): - if file.endswith(AUDIO_SUFFIX): - audio_inpath = os.path.join(args.in_path, file) - audio_outpath = os.path.join(args.out_path, file) - audio, sample_rate = read_wave(audio_inpath) - frames = frame_generator(FS_MS, audio, sample_rate) - frames = list(frames) - segments = vad_collector(sample_rate, FS_MS, 300, vad, frames) - merge_segments = list() - timestamp_start = 0.0 - timestamp_end = 0.0 - # removing start, end, and long sequences of sils - for i, segment in enumerate(segments): - merge_segments.append(segment[0]) - if i and timestamp_start: - sil_duration = segment[1] - timestamp_end - if sil_duration > THRESHOLD: - merge_segments.append(int(THRESHOLD / SCALE)*(b'\x00')) - else: - merge_segments.append(int((sil_duration / SCALE))*(b'\x00')) - timestamp_start = segment[1] - timestamp_end = segment[2] - segment = b''.join(merge_segments) - write_wave(audio_outpath, segment, sample_rate) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Apply vad to a file of fils.') - parser.add_argument('in_path', type=str, help='Path to the input files') - parser.add_argument('out_path', type=str, - help='Path to save the processed files') - parser.add_argument('--agg', type=int, default=3, - help='The level of aggressiveness of the VAD: [0-3]') - args = parser.parse_args() - - main(args) diff --git a/examples/speech_synthesis/utils.py b/examples/speech_synthesis/utils.py deleted file mode 100644 index 2c7b03733d..0000000000 --- a/examples/speech_synthesis/utils.py +++ /dev/null @@ -1,101 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
-# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -import numpy as np -import torch -from scipy.interpolate import interp1d -import torchaudio - -from fairseq.tasks.text_to_speech import ( - batch_compute_distortion, compute_rms_dist -) - - -def batch_mel_spectral_distortion( - y1, y2, sr, normalize_type="path", mel_fn=None -): - """ - https://arxiv.org/pdf/2011.03568.pdf - - Same as Mel Cepstral Distortion, but computed on log-mel spectrograms. - """ - if mel_fn is None or mel_fn.sample_rate != sr: - mel_fn = torchaudio.transforms.MelSpectrogram( - sr, n_fft=int(0.05 * sr), win_length=int(0.05 * sr), - hop_length=int(0.0125 * sr), f_min=20, n_mels=80, - window_fn=torch.hann_window - ).to(y1[0].device) - offset = 1e-6 - return batch_compute_distortion( - y1, y2, sr, lambda y: torch.log(mel_fn(y) + offset).transpose(-1, -2), - compute_rms_dist, normalize_type - ) - - -# This code is based on -# "https://github.com/bastibe/MAPS-Scripts/blob/master/helper.py" -def _same_t_in_true_and_est(func): - def new_func(true_t, true_f, est_t, est_f): - assert type(true_t) is np.ndarray - assert type(true_f) is np.ndarray - assert type(est_t) is np.ndarray - assert type(est_f) is np.ndarray - - interpolated_f = interp1d( - est_t, est_f, bounds_error=False, kind='nearest', fill_value=0 - )(true_t) - return func(true_t, true_f, true_t, interpolated_f) - - return new_func - - -@_same_t_in_true_and_est -def gross_pitch_error(true_t, true_f, est_t, est_f): - """The relative frequency in percent of pitch estimates that are - outside a threshold around the true pitch. Only frames that are - considered pitched by both the ground truth and the estimator (if - applicable) are considered. - """ - - correct_frames = _true_voiced_frames(true_t, true_f, est_t, est_f) - gross_pitch_error_frames = _gross_pitch_error_frames( - true_t, true_f, est_t, est_f - ) - return np.sum(gross_pitch_error_frames) / np.sum(correct_frames) - - -def _gross_pitch_error_frames(true_t, true_f, est_t, est_f, eps=1e-8): - voiced_frames = _true_voiced_frames(true_t, true_f, est_t, est_f) - true_f_p_eps = [x + eps for x in true_f] - pitch_error_frames = np.abs(est_f / true_f_p_eps - 1) > 0.2 - return voiced_frames & pitch_error_frames - - -def _true_voiced_frames(true_t, true_f, est_t, est_f): - return (est_f != 0) & (true_f != 0) - - -def _voicing_decision_error_frames(true_t, true_f, est_t, est_f): - return (est_f != 0) != (true_f != 0) - - -@_same_t_in_true_and_est -def f0_frame_error(true_t, true_f, est_t, est_f): - gross_pitch_error_frames = _gross_pitch_error_frames( - true_t, true_f, est_t, est_f - ) - voicing_decision_error_frames = _voicing_decision_error_frames( - true_t, true_f, est_t, est_f - ) - return (np.sum(gross_pitch_error_frames) + - np.sum(voicing_decision_error_frames)) / (len(true_t)) - - -@_same_t_in_true_and_est -def voicing_decision_error(true_t, true_f, est_t, est_f): - voicing_decision_error_frames = _voicing_decision_error_frames( - true_t, true_f, est_t, est_f - ) - return np.sum(voicing_decision_error_frames) / (len(true_t))
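To make the pitch metrics removed from `examples/speech_synthesis/utils.py` concrete: gross pitch error (GPE) is the fraction of frames voiced in both reference and estimate whose F0 deviates by more than 20%, voicing decision error (VDE) is the fraction of frames whose voiced/unvoiced decision disagrees, and F0 frame error (FFE) combines both, normalized by the total frame count. A small self-contained numpy sketch under those definitions (the toy contours are made up):

```
import numpy as np

# Toy frame-level F0 contours (0 = unvoiced), already on a shared time axis.
true_f = np.array([0.0, 100.0, 100.0, 100.0, 0.0, 120.0])
est_f = np.array([0.0, 100.0, 130.0, 100.0, 90.0, 0.0])

voiced_both = (true_f != 0) & (est_f != 0)
gpe_frames = voiced_both & (np.abs(est_f / (true_f + 1e-8) - 1) > 0.2)
vde_frames = (true_f != 0) != (est_f != 0)

gpe = gpe_frames.sum() / voiced_both.sum()           # errors among co-voiced frames
vde = vde_frames.sum() / len(true_f)                 # voicing mismatches over all frames
ffe = (gpe_frames.sum() + vde_frames.sum()) / len(true_f)

print(f"GPE={gpe:.2f}  VDE={vde:.2f}  FFE={ffe:.2f}")
```

On these toy contours, frame 2 counts as a gross pitch error and frames 4 and 5 as voicing errors, giving GPE of about 0.33, VDE of about 0.33 and FFE of 0.50.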