Add script to generate deterministic samples and hashes

synesthesiam · synesthesiam · commit b446579e4f3a · 2022-04-20T11:43:15.000-04:00
diff --git a/.isort.cfg b/.isort.cfg
@@ -0,0 +1,6 @@
+[settings]
+multi_line_output=3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
diff --git a/check.sh b/check.sh
@@ -23,6 +23,26 @@ set -eo pipefail
 # Directory of *this* script
 this_dir="$( cd "$( dirname "$0" )" && pwd )"
 
+# Path to virtual environment
+: "${venv:=${base_dir}/.venv}"
+
+if [ -d "${venv}" ]; then
+    # Activate virtual environment if available
+    source "${venv}/bin/activate"
+fi
+
+python_files=("${this_dir}/tests"/*.py)
+
+# Format code
+black "${python_files[@]}"
+isort "${python_files[@]}"
+
+# Check
+flake8 "${python_files[@]}"
+pylint "${python_files[@]}"
+mypy "${python_files[@]}"
+
+# Check submodules
 script_name='check.sh'
 
 find "${this_dir}" -mindepth 2 -maxdepth 2 -name "${script_name}" -type f | \
diff --git a/mimic3-tts/mimic3_tts/voices.json b/mimic3-tts/mimic3_tts/voices.json
@@ -51,6 +51,49 @@
         ],
         "properties": {}
     },
+    "de_DE/thorsten-emotion_low": {
+        "files": {
+            "LICENSE": {
+                "size_bytes": 31,
+                "sha256_sum": "19b4afb2e1c2c9f2cb07da14acd2d34098a4e11c13fccb09356845c4748fb50c"
+            },
+            "SOURCE": {
+                "size_bytes": 28,
+                "sha256_sum": "f351b9304a8fa1e6e58db9b3411f92387e192bb4b57fcbe3e8f839424b13f3c8"
+            },
+            "config.json": {
+                "size_bytes": 3756,
+                "sha256_sum": "e0ae97001a4a9b69040f4531de804b08ec865223bab807310652852fafdad909"
+            },
+            "generator.onnx": {
+                "size_bytes": 76346209,
+                "sha256_sum": "164defc5bffcf739996b04855a6a0c76b600b44f73dfe4801d24e838eea5e112"
+            },
+            "phonemes.txt": {
+                "size_bytes": 339,
+                "sha256_sum": "4820e701ece71e34c5d32f9640db6b103391093943ac071c001222d27b891e5f"
+            },
+            "speaker_map.csv": {
+                "size_bytes": 222,
+                "sha256_sum": "371546cc49cefe755562da2e8404ae857206e6593fa777f70860bf1fff070b89"
+            },
+            "speakers.txt": {
+                "size_bytes": 62,
+                "sha256_sum": "b12e946f3867f124f3d07c48a13362edda6564fa91b13e46aba6dee41193f4aa"
+            }
+        },
+        "speakers": [
+            "amused",
+            "angry",
+            "disgusted",
+            "drunk",
+            "neutral",
+            "sleepy",
+            "surprised",
+            "whisper"
+        ],
+        "properties": {}
+    },
     "de_DE/thorsten_low": {
         "files": {
             "ALIASES": {
@@ -556,6 +599,10 @@
     },
     "fa/haaniye_low": {
         "files": {
+            "ALIASES": {
+                "size_bytes": 11,
+                "sha256_sum": "9d5c7cf903a22b8d0d3f4768583664b2533ba4252367d8d6892ca4e398a811bc"
+            },
             "LICENSE": {
                 "size_bytes": 5,
                 "sha256_sum": "958246282e394a96727515ee6d834073ea09314be61a5730d03bf6d540ae0c26"
diff --git a/opentts-abc/opentts_abc/VERSION b/opentts-abc/opentts_abc/VERSION
@@ -1 +1 @@
-0.1.0
+0.1.1
diff --git a/opentts-abc/opentts_abc/__init__.py b/opentts-abc/opentts_abc/__init__.py
@@ -243,7 +243,7 @@ def begin_utterance(self):
         """Begins a new utterance"""
 
     @abstractmethod
-    def speak_text(self, text: str):
+    def speak_text(self, text: str, text_language: typing.Optional[str] = None):
         """Speaks text using the underlying system's tokenization mechanism.
 
         Becomes an AudioResult in end_utterance()
@@ -276,3 +276,40 @@ def end_utterance(self) -> typing.Iterable[BaseResult]:
 
         Returns an iterable of results (audio, marks, etc.)
         """
+
+    def text_to_wav(
+        self, text: str, text_language: typing.Optional[str] = None
+    ) -> bytes:
+        """Synthesize text with current voice settings and return WAV audio"""
+        with io.BytesIO() as wav_io:
+            wav_file: wave.Wave_write = wave.open(wav_io, "wb")
+            wav_params_set = False
+
+            with wav_file:
+                try:
+                    self.begin_utterance()
+                    self.speak_text(text, text_language=text_language)
+                    results = self.end_utterance()
+
+                    for result in results:
+                        # Add audio to existing WAV file
+                        if isinstance(result, AudioResult):
+                            if not wav_params_set:
+                                wav_file.setframerate(result.sample_rate_hz)
+                                wav_file.setsampwidth(result.sample_width_bytes)
+                                wav_file.setnchannels(result.num_channels)
+                                wav_params_set = True
+
+                            wav_file.writeframes(result.audio_bytes)
+                except Exception as e:
+                    if not wav_params_set:
+                        # Set default parameters so exception can propagate
+                        wav_file.setframerate(22050)
+                        wav_file.setsampwidth(2)
+                        wav_file.setnchannels(1)
+
+                    raise e
+
+            wav_bytes = wav_io.getvalue()
+
+            return wav_bytes
diff --git a/pylintrc b/pylintrc
@@ -0,0 +1,40 @@
+[MESSAGES CONTROL]
+disable=
+  format,
+  abstract-class-little-used,
+  abstract-method,
+  cyclic-import,
+  duplicate-code,
+  global-statement,
+  import-outside-toplevel,
+  inconsistent-return-statements,
+  locally-disabled,
+  not-context-manager,
+  redefined-variable-type,
+  too-few-public-methods,
+  too-many-arguments,
+  too-many-branches,
+  too-many-instance-attributes,
+  too-many-lines,
+  too-many-locals,
+  too-many-public-methods,
+  too-many-return-statements,
+  too-many-statements,
+  too-many-boolean-expressions,
+  unnecessary-pass,
+  unused-argument,
+  broad-except,
+  too-many-nested-blocks,
+  invalid-name,
+  unused-import,
+  no-self-use,
+  fixme,
+  useless-super-delegation,
+  missing-module-docstring,
+  missing-class-docstring,
+  missing-function-docstring,
+  import-error,
+  relative-beyond-top-level
+
+[FORMAT]
+expected-line-ending-format=LF
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,26 @@
+[flake8]
+# To work with Black
+max-line-length = 88
+# E501: line too long
+# W503: Line break occurred before a binary operator
+# E203: Whitespace before ':'
+# D202 No blank lines allowed after function docstring
+# W504 line break after binary operator
+ignore =
+    E501,
+    W503,
+    E203,
+    D202,
+    W504
+
+# F401 import unused
+per-file-ignores =
+    mimic3_tts/__init__.py:F401
+
+[isort]
+multi_line_output = 3
+include_trailing_comma=True
+force_grid_wrap=0
+use_parentheses=True
+line_length=88
+indent = "    "
diff --git a/tests/get_sample_hashes.py b/tests/get_sample_hashes.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+# Copyright 2022 Mycroft AI Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+"""
+Generates sha256 hashes of audio samples for all languages/voices.
+"""
+import argparse
+import contextlib
+import functools
+import hashlib
+import logging
+import re
+import tempfile
+import typing
+from multiprocessing import Pool
+from pathlib import Path
+
+from mimic3_tts import Mimic3Settings, Mimic3TextToSpeechSystem, Voice
+
+# -----------------------------------------------------------------------------
+
+_TEST_SENTENCES = {
+    "de": """Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als
+        kreisbogenförmiges farbiges Lichtband in einer von der Sonne
+        beschienenen Regenwand oder -wolke wahrgenommen wird.""",
+    "en": """A rainbow is a meteorological phenomenon that is caused by
+        reflection, refraction and dispersion of light in water droplets
+        resulting in a spectrum of light appearing in the sky.""",
+    "el": """Οι επιστήμονες μελετούν ακόμη το ουράνιο τόξο.""",
+    "es": """Un arcoíris​ o arco iris es un fenómeno óptico y meteorológico que
+        consiste en la aparición en el cielo de un arco de luz multicolor,
+        originado por la descomposición de la luz solar en el espectro visible,
+        la cual se produce por refracción, cuando los rayos del sol atraviesan
+        pequeñas gotas de agua contenidas en la atmósfera terrestre.""",
+    "fa": """برای دیگر کاربردها رنگین‌کمان (ابهام‌زدایی) را ببینید.""",
+    "fi": """Sateenkaari on spektrin väreissä esiintyvä ilmakehän optinen ilmiö.""",
+    "fr": """Un arc-en-ciel est un photométéore, un phénomène optique se
+        produisant dans le ciel, visible dans la direction opposée au Soleil
+        quand il brille pendant la pluie.""",
+    "hu": """A szivárvány olyan optikai jelenség, melyet eső- vagy páracseppek
+        okoznak, mikor a fény prizmaszerűen megtörik rajtuk és színeire bomlik,
+        kialakul a színképe, más néven spektruma.""",
+    "it": """In fisica dell'atmosfera e meteorologia l'arcobaleno è un fenomeno
+        ottico atmosferico che produce uno spettro quasi continuo di luce nel
+        cielo quando la luce del Sole attraversa le gocce d'acqua rimaste in
+        sospensione dopo un temporale, o presso una cascata o una fontana.""",
+    "ko": """무지개(문화어: 색동다리)는 하늘에 보이는 호(弧)를 이루는 색 띠를 말한다.""",
+    "nl": """Een regenboog is een gekleurde cirkelboog die aan de hemel
+        waargenomen kan worden als de, laagstaande, zon tegen een nevel van
+        waterdruppeltjes aan schijnt en de zon zich achter de waarnemer bevindt.""",
+    "pt": """Um arco-íris, também popularmente denominado arco-da-velha, é um
+        fenômeno óptico e meteorológico que separa a luz do sol em seu espectro
+        contínuo quando o sol brilha sobre gotículas de água suspensas no ar.""",
+    "ru": """Ра́дуга, атмосферное, оптическое и метеорологическое явление,
+        наблюдаемое при освещении ярким источником света множества водяных
+        капель.""",
+    "sv": """En regnbåge är ett optiskt, meteorologiskt fenomen som uppträder som
+        ett fullständigt ljusspektrum i form av en båge på himlen då solen lyser
+        på nedfallande regn.""",
+    "sw": """Upinde wa mvua ni tao la rangi mbalimbali angani ambalo linaweza
+        kuonekana wakati Jua huangaza kupitia matone ya mvua inayoanguka.""",
+    "te": """ఇంద్ర ధనుస్సు దృష్టి విద్యా సంబంధమయిన వాతావరణ శాస్త్ర సంబంధమయిన దృగ్విషయం.""",
+    "uk": """Весе́лка, також ра́йдуга оптичне явище в атмосфері, що являє собою
+        одну, дві чи декілька різнокольорових дуг (або кіл, якщо дивитися з
+        повітря), що спостерігаються на тлі хмари, якщо вона розташована проти
+        Сонця. Червоний колір ми бачимо з зовнішнього боку первинної веселки, а
+        фіолетовий — із внутрішнього.""",
+    "vi": """Cầu vồng hay mống cũng như quang phổ là hiện tượng tán sắc của các
+    ánh sáng từ Mặt Trời khi khúc xạ và phản xạ qua các giọt nước mưa.""",
+    "yo": """E̟nì kò̟ò̟kan ló ní è̟tó̟ láti kó̟ è̟kó̟.""",
+}
+
+_LOGGER = logging.getLogger("get_samples")
+
+# -----------------------------------------------------------------------------
+
+
+def synthesize(output_dir: Path, voice: Voice) -> typing.Iterable[str]:
+    """Generate samples for voice in a separate process"""
+    tts = Mimic3TextToSpeechSystem(
+        Mimic3Settings(
+            length_scale=1.0,
+            noise_scale=0.0,
+            noise_w=0.0,
+            use_deterministic_compute=True,
+        )
+    )
+
+    key = voice.key
+    language = voice.language
+    speakers = voice.speakers or [""]
+
+    # Try en_US and en
+    text = _TEST_SENTENCES.get(language, _TEST_SENTENCES.get(language.split("_")[0]))
+
+    assert text, f"No sentences for {language}"
+
+    # Normalize whitespace
+    text = re.sub(r"\s+", " ", text)
+
+    voice_dir = output_dir / key
+    voice_dir.mkdir(parents=True, exist_ok=True)
+
+    results = []
+    for speaker in sorted(speakers):
+        if speaker:
+            # Multi-speaker
+            voice_key = f"{key}#{speaker}"
+            sample_path = voice_dir / f"sample_{speaker}.wav"
+        else:
+            # Single speaker
+            voice_key = key
+            sample_path = voice_dir / "sample.wav"
+
+        if not sample_path.is_file():
+            tts.voice = voice_key
+            wav_bytes = tts.text_to_wav(text)
+            sample_path.write_bytes(wav_bytes)
+
+        wav_hash = hashlib.sha256(sample_path.read_bytes()).hexdigest()
+        _LOGGER.info(sample_path)
+
+        results.append(f"{voice_key} {wav_hash}")
+
+    return results
+
+
+# -----------------------------------------------------------------------------
+
+
+def main():
+    """Generate WAV samples from Mimic 3 in deterministic mode for testing"""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--output-dir", help="Directory to write samples")
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO)
+
+    if args.output_dir:
+        output_dir = Path(args.output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+        temp_dir = contextlib.nullcontext()
+    else:
+        # Output to temp directory
+        temp_dir = tempfile.TemporaryDirectory()
+        output_dir = Path(temp_dir.name)
+
+    tts = Mimic3TextToSpeechSystem(Mimic3Settings())
+
+    # -------------------------------------------------------------------------
+    # Generate samples
+    # -------------------------------------------------------------------------
+
+    with temp_dir, Pool() as pool:
+        voices = sorted(tts.get_voices(), key=lambda v: v.key)
+        for results in pool.map(functools.partial(synthesize, output_dir), voices):
+            for result in results:
+                print(result)
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/sample_hashs.txt b/tests/sample_hashs.txt