Skip to content

Commit b446579

Browse files
committed Apr 20, 2022
Add script to generate deterministic samples and hashes
1 parent 503126d commit b446579

File tree

9 files changed

+595
-2
lines changed

9 files changed

+595
-2
lines changed
 

‎.isort.cfg

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[settings]
2+
multi_line_output=3
3+
include_trailing_comma=True
4+
force_grid_wrap=0
5+
use_parentheses=True
6+
line_length=88

‎check.sh

+20
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,26 @@ set -eo pipefail
2323
# Directory of *this* script
2424
this_dir="$( cd "$( dirname "$0" )" && pwd )"
2525

26+
# Path to virtual environment
27+
: "${venv:=${base_dir}/.venv}"
28+
29+
if [ -d "${venv}" ]; then
30+
# Activate virtual environment if available
31+
source "${venv}/bin/activate"
32+
fi
33+
34+
python_files=("${this_dir}/tests"/*.py)
35+
36+
# Format code
37+
black "${python_files[@]}"
38+
isort "${python_files[@]}"
39+
40+
# Check
41+
flake8 "${python_files[@]}"
42+
pylint "${python_files[@]}"
43+
mypy "${python_files[@]}"
44+
45+
# Check submodules
2646
script_name='check.sh'
2747

2848
find "${this_dir}" -mindepth 2 -maxdepth 2 -name "${script_name}" -type f | \

‎mimic3-tts/mimic3_tts/voices.json

+47
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,49 @@
5151
],
5252
"properties": {}
5353
},
54+
"de_DE/thorsten-emotion_low": {
55+
"files": {
56+
"LICENSE": {
57+
"size_bytes": 31,
58+
"sha256_sum": "19b4afb2e1c2c9f2cb07da14acd2d34098a4e11c13fccb09356845c4748fb50c"
59+
},
60+
"SOURCE": {
61+
"size_bytes": 28,
62+
"sha256_sum": "f351b9304a8fa1e6e58db9b3411f92387e192bb4b57fcbe3e8f839424b13f3c8"
63+
},
64+
"config.json": {
65+
"size_bytes": 3756,
66+
"sha256_sum": "e0ae97001a4a9b69040f4531de804b08ec865223bab807310652852fafdad909"
67+
},
68+
"generator.onnx": {
69+
"size_bytes": 76346209,
70+
"sha256_sum": "164defc5bffcf739996b04855a6a0c76b600b44f73dfe4801d24e838eea5e112"
71+
},
72+
"phonemes.txt": {
73+
"size_bytes": 339,
74+
"sha256_sum": "4820e701ece71e34c5d32f9640db6b103391093943ac071c001222d27b891e5f"
75+
},
76+
"speaker_map.csv": {
77+
"size_bytes": 222,
78+
"sha256_sum": "371546cc49cefe755562da2e8404ae857206e6593fa777f70860bf1fff070b89"
79+
},
80+
"speakers.txt": {
81+
"size_bytes": 62,
82+
"sha256_sum": "b12e946f3867f124f3d07c48a13362edda6564fa91b13e46aba6dee41193f4aa"
83+
}
84+
},
85+
"speakers": [
86+
"amused",
87+
"angry",
88+
"disgusted",
89+
"drunk",
90+
"neutral",
91+
"sleepy",
92+
"surprised",
93+
"whisper"
94+
],
95+
"properties": {}
96+
},
5497
"de_DE/thorsten_low": {
5598
"files": {
5699
"ALIASES": {
@@ -556,6 +599,10 @@
556599
},
557600
"fa/haaniye_low": {
558601
"files": {
602+
"ALIASES": {
603+
"size_bytes": 11,
604+
"sha256_sum": "9d5c7cf903a22b8d0d3f4768583664b2533ba4252367d8d6892ca4e398a811bc"
605+
},
559606
"LICENSE": {
560607
"size_bytes": 5,
561608
"sha256_sum": "958246282e394a96727515ee6d834073ea09314be61a5730d03bf6d540ae0c26"

‎opentts-abc/opentts_abc/VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.1.0
1+
0.1.1

‎opentts-abc/opentts_abc/__init__.py

+38-1
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ def begin_utterance(self):
243243
"""Begins a new utterance"""
244244

245245
@abstractmethod
246-
def speak_text(self, text: str):
246+
def speak_text(self, text: str, text_language: typing.Optional[str] = None):
247247
"""Speaks text using the underlying system's tokenization mechanism.
248248
249249
Becomes an AudioResult in end_utterance()
@@ -276,3 +276,40 @@ def end_utterance(self) -> typing.Iterable[BaseResult]:
276276
277277
Returns an iterable of results (audio, marks, etc.)
278278
"""
279+
280+
def text_to_wav(
    self, text: str, text_language: typing.Optional[str] = None
) -> bytes:
    """Synthesize text with current voice settings and return WAV audio.

    Runs a full utterance (begin_utterance / speak_text / end_utterance) and
    writes the audio of every AudioResult into a single in-memory WAV file.
    The WAV parameters (sample rate, sample width, channels) are taken from
    the first AudioResult.

    If no AudioResult is produced (for example, because synthesis raised or
    the utterance yielded no audio), default parameters of 22050 Hz, 16-bit,
    mono are used so that the WAV writer can still be closed cleanly. In the
    no-audio case a valid WAV with zero frames is returned.

    Exceptions raised during synthesis propagate unchanged to the caller.
    """
    with io.BytesIO() as wav_io:
        wav_file: wave.Wave_write = wave.open(wav_io, "wb")
        wav_params_set = False

        with wav_file:
            try:
                self.begin_utterance()
                self.speak_text(text, text_language=text_language)

                for result in self.end_utterance():
                    # Only audio results contribute to the WAV file
                    if not isinstance(result, AudioResult):
                        continue

                    if not wav_params_set:
                        wav_file.setframerate(result.sample_rate_hz)
                        wav_file.setsampwidth(result.sample_width_bytes)
                        wav_file.setnchannels(result.num_channels)
                        wav_params_set = True

                    wav_file.writeframes(result.audio_bytes)
            finally:
                if not wav_params_set:
                    # Closing a WAV writer without parameters raises
                    # wave.Error, which would mask the original exception or
                    # turn an utterance without audio into a confusing failure.
                    wav_file.setframerate(22050)
                    wav_file.setsampwidth(2)
                    wav_file.setnchannels(1)

        # Header and frames are only complete once the writer is closed
        return wav_io.getvalue()

‎pylintrc

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
[MESSAGES CONTROL]
2+
disable=
3+
format,
4+
abstract-class-little-used,
5+
abstract-method,
6+
cyclic-import,
7+
duplicate-code,
8+
global-statement,
9+
import-outside-toplevel,
10+
inconsistent-return-statements,
11+
locally-disabled,
12+
not-context-manager,
13+
redefined-variable-type,
14+
too-few-public-methods,
15+
too-many-arguments,
16+
too-many-branches,
17+
too-many-instance-attributes,
18+
too-many-lines,
19+
too-many-locals,
20+
too-many-public-methods,
21+
too-many-return-statements,
22+
too-many-statements,
23+
too-many-boolean-expressions,
24+
unnecessary-pass,
25+
unused-argument,
26+
broad-except,
27+
too-many-nested-blocks,
28+
invalid-name,
29+
unused-import,
30+
no-self-use,
31+
fixme,
32+
useless-super-delegation,
33+
missing-module-docstring,
34+
missing-class-docstring,
35+
missing-function-docstring,
36+
import-error,
37+
relative-beyond-top-level
38+
39+
[FORMAT]
40+
expected-line-ending-format=LF

‎setup.cfg

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
[flake8]
2+
# To work with Black
3+
max-line-length = 88
4+
# E501: line too long
5+
# W503: Line break occurred before a binary operator
6+
# E203: Whitespace before ':'
7+
# D202 No blank lines allowed after function docstring
8+
# W504 line break after binary operator
9+
ignore =
10+
E501,
11+
W503,
12+
E203,
13+
D202,
14+
W504
15+
16+
# F401 import unused
17+
per-file-ignores =
18+
mimic3_tts/__init__.py:F401
19+
20+
[isort]
21+
multi_line_output = 3
22+
include_trailing_comma=True
23+
force_grid_wrap=0
24+
use_parentheses=True
25+
line_length=88
26+
indent = " "

‎tests/get_sample_hashes.py

+177
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/env python3
2+
# Copyright 2022 Mycroft AI Inc.
3+
#
4+
# This program is free software: you can redistribute it and/or modify
5+
# it under the terms of the GNU Affero General Public License as published by
6+
# the Free Software Foundation, either version 3 of the License, or
7+
# (at your option) any later version.
8+
#
9+
# This program is distributed in the hope that it will be useful,
10+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12+
# GNU Affero General Public License for more details.
13+
#
14+
# You should have received a copy of the GNU Affero General Public License
15+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
16+
#
17+
"""
18+
Generates sha256 hashes of audio samples for all languages/voices.
19+
"""
20+
import argparse
21+
import contextlib
22+
import functools
23+
import hashlib
24+
import logging
25+
import re
26+
import tempfile
27+
import typing
28+
from multiprocessing import Pool
29+
from pathlib import Path
30+
31+
from mimic3_tts import Mimic3Settings, Mimic3TextToSpeechSystem, Voice
32+
33+
# -----------------------------------------------------------------------------
34+
35+
_TEST_SENTENCES = {
36+
"de": """Der Regenbogen ist ein atmosphärisch-optisches Phänomen, das als
37+
kreisbogenförmiges farbiges Lichtband in einer von der Sonne
38+
beschienenen Regenwand oder -wolke wahrgenommen wird.""",
39+
"en": """A rainbow is a meteorological phenomenon that is caused by
40+
reflection, refraction and dispersion of light in water droplets
41+
resulting in a spectrum of light appearing in the sky.""",
42+
"el": """Οι επιστήμονες μελετούν ακόμη το ουράνιο τόξο.""",
43+
"es": """Un arcoíris​ o arco iris es un fenómeno óptico y meteorológico que
44+
consiste en la aparición en el cielo de un arco de luz multicolor,
45+
originado por la descomposición de la luz solar en el espectro visible,
46+
la cual se produce por refracción, cuando los rayos del sol atraviesan
47+
pequeñas gotas de agua contenidas en la atmósfera terrestre.""",
48+
"fa": """برای دیگر کاربردها رنگین‌کمان (ابهام‌زدایی) را ببینید.""",
49+
"fi": """Sateenkaari on spektrin väreissä esiintyvä ilmakehän optinen ilmiö.""",
50+
"fr": """Un arc-en-ciel est un photométéore, un phénomène optique se
51+
produisant dans le ciel, visible dans la direction opposée au Soleil
52+
quand il brille pendant la pluie.""",
53+
"hu": """A szivárvány olyan optikai jelenség, melyet eső- vagy páracseppek
54+
okoznak, mikor a fény prizmaszerűen megtörik rajtuk és színeire bomlik,
55+
kialakul a színképe, más néven spektruma.""",
56+
"it": """In fisica dell'atmosfera e meteorologia l'arcobaleno è un fenomeno
57+
ottico atmosferico che produce uno spettro quasi continuo di luce nel
58+
cielo quando la luce del Sole attraversa le gocce d'acqua rimaste in
59+
sospensione dopo un temporale, o presso una cascata o una fontana.""",
60+
"ko": """무지개(문화어: 색동다리)는 하늘에 보이는 호(弧)를 이루는 색 띠를 말한다.""",
61+
"nl": """Een regenboog is een gekleurde cirkelboog die aan de hemel
62+
waargenomen kan worden als de, laagstaande, zon tegen een nevel van
63+
waterdruppeltjes aan schijnt en de zon zich achter de waarnemer bevindt.""",
64+
"pt": """Um arco-íris, também popularmente denominado arco-da-velha, é um
65+
fenômeno óptico e meteorológico que separa a luz do sol em seu espectro
66+
contínuo quando o sol brilha sobre gotículas de água suspensas no ar.""",
67+
"ru": """Ра́дуга, атмосферное, оптическое и метеорологическое явление,
68+
наблюдаемое при освещении ярким источником света множества водяных
69+
капель.""",
70+
"sv": """En regnbåge är ett optiskt, meteorologiskt fenomen som uppträder som
71+
ett fullständigt ljusspektrum i form av en båge på himlen då solen lyser
72+
på nedfallande regn.""",
73+
"sw": """Upinde wa mvua ni tao la rangi mbalimbali angani ambalo linaweza
74+
kuonekana wakati Jua huangaza kupitia matone ya mvua inayoanguka.""",
75+
"te": """ఇంద్ర ధనుస్సు దృష్టి విద్యా సంబంధమయిన వాతావరణ శాస్త్ర సంబంధమయిన దృగ్విషయం.""",
76+
"uk": """Весе́лка, також ра́йдуга оптичне явище в атмосфері, що являє собою
77+
одну, дві чи декілька різнокольорових дуг (або кіл, якщо дивитися з
78+
повітря), що спостерігаються на тлі хмари, якщо вона розташована проти
79+
Сонця. Червоний колір ми бачимо з зовнішнього боку первинної веселки, а
80+
фіолетовий — із внутрішнього.""",
81+
"vi": """Cầu vồng hay mống cũng như quang phổ là hiện tượng tán sắc của các
82+
ánh sáng từ Mặt Trời khi khúc xạ và phản xạ qua các giọt nước mưa.""",
83+
"yo": """E̟nì kò̟ò̟kan ló ní è̟tó̟ láti kó̟ è̟kó̟.""",
84+
}
85+
86+
_LOGGER = logging.getLogger("get_samples")
87+
88+
# -----------------------------------------------------------------------------
89+
90+
91+
def synthesize(output_dir: Path, voice: Voice) -> typing.Iterable[str]:
    """Write one WAV sample per speaker of a voice and report its sha256 hash.

    Intended to run as a worker in a separate process. Samples are stored
    under ``output_dir / voice.key``; a sample file that already exists is
    reused instead of being synthesized again.

    Returns a list of "<voice key> <sha256 hex digest>" lines, one per
    speaker (a voice without speakers yields a single line).
    """
    settings = Mimic3Settings(
        length_scale=1.0,
        noise_scale=0.0,
        noise_w=0.0,
        use_deterministic_compute=True,
    )
    tts = Mimic3TextToSpeechSystem(settings)

    key = voice.key
    language = voice.language
    # A voice without named speakers is handled as one unnamed speaker
    speakers = voice.speakers or [""]

    # Look up the full language code first (en_US), then its prefix (en)
    short_language = language.split("_")[0]
    fallback_text = _TEST_SENTENCES.get(short_language)
    text = _TEST_SENTENCES.get(language, fallback_text)

    assert text, f"No sentences for {language}"

    # Collapse line breaks and indentation from the multi-line literals
    text = re.sub(r"\s+", " ", text)

    voice_dir = output_dir / key
    voice_dir.mkdir(parents=True, exist_ok=True)

    hash_lines = []
    for speaker in sorted(speakers):
        # Multi-speaker voices get "<key>#<speaker>" and a per-speaker file
        voice_key = f"{key}#{speaker}" if speaker else key
        file_name = f"sample_{speaker}.wav" if speaker else "sample.wav"
        sample_path = voice_dir / file_name

        if not sample_path.is_file():
            tts.voice = voice_key
            sample_path.write_bytes(tts.text_to_wav(text))

        # Hash the bytes on disk so reused samples are covered as well
        digest = hashlib.sha256(sample_path.read_bytes()).hexdigest()
        _LOGGER.info(sample_path)

        hash_lines.append(f"{voice_key} {digest}")

    return hash_lines
139+
140+
141+
# -----------------------------------------------------------------------------
142+
143+
144+
def main():
    """Generate WAV samples from Mimic 3 in deterministic mode for testing.

    Samples are written to --output-dir when given, otherwise to a temporary
    directory that is removed once all voices have been processed. One
    "<voice key> <sha256>" line per sample is printed to standard output.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--output-dir", help="Directory to write samples")
    cli_args = arg_parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    if not cli_args.output_dir:
        # No directory requested: use a throwaway one
        temp_dir = tempfile.TemporaryDirectory()
        output_dir = Path(temp_dir.name)
    else:
        output_dir = Path(cli_args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        # Nothing to clean up for a user-supplied directory
        temp_dir = contextlib.nullcontext()

    tts = Mimic3TextToSpeechSystem(Mimic3Settings())

    # -------------------------------------------------------------------------
    # Generate samples
    # -------------------------------------------------------------------------

    with temp_dir, Pool() as pool:
        # Sort by key so the output order is stable between runs
        voices = sorted(tts.get_voices(), key=lambda v: v.key)
        worker = functools.partial(synthesize, output_dir)
        for voice_lines in pool.map(worker, voices):
            for line in voice_lines:
                print(line)
172+
173+
174+
# -----------------------------------------------------------------------------
175+
176+
if __name__ == "__main__":
177+
main()

‎tests/sample_hashs.txt

+240
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)
Please sign in to comment.