
Commit be759f3

Author: matatonic
Commit message: 0.15.0
Parent: c957ad8

7 files changed: +24, -34 lines

Dockerfile (+3, -1)

@@ -1,7 +1,9 @@
 FROM python:3.11-slim
 
+RUN --mount=type=cache,target=/root/.cache/pip pip install -U pip
+
 ARG TARGETPLATFORM
-RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg
+RUN apt-get update && apt-get install --no-install-recommends -y curl ffmpeg libaio-dev
 RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi
 RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
 ENV PATH="/root/.cargo/bin:${PATH}"

Dockerfile.min (+1, -1)

@@ -12,7 +12,7 @@ RUN mkdir -p voices config
 
 COPY requirements*.txt /app/
 RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
-COPY speech.py openedai.py say.py *.sh *.default.yaml README.md LICENSE /app/
+COPY *.py *.sh *.default.yaml README.md LICENSE /app/
 
 ENV TTS_HOME=voices
 ENV HF_HOME=voices

README.md (+6, -1)

@@ -29,6 +29,11 @@ If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know s
 
 ## Recent Changes
 
+Version 0.15.0, 2024-06-26
+
+* Switch to [coqui-tts](https://github.com/idiap/coqui-ai-TTS) (updated fork), updated simpler dependencies, torch 2.3, etc.
+* Resolve cuda threading issues
+
 Version 0.14.1, 2024-06-26
 
 * Make deepspeed possible (`--use-deepspeed`), but not enabled in pre-built docker images (too large). Requires the cuda-toolkit installed, see the Dockerfile comment for details
@@ -127,7 +132,7 @@ source .venv/bin/activate
 # Install the Python requirements
 # - use requirements-rocm.txt for AMD GPU (ROCm support)
 # - use requirements-min.txt for piper only (CPU only)
-pip install -r requirements.txt
+pip install -U -r requirements.txt
 # run the server
 bash startup.sh
 ```
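The changelog above swaps the pinned `TTS==0.22.0` dependency for the `coqui-tts` fork. Below is a minimal sketch of exercising that package after `pip install coqui-tts`, assuming the fork keeps the upstream `TTS` import namespace and the standard XTTS v2 model id; neither is shown in this commit, and the reference wav path is hypothetical.

```
# Sketch only, not part of this repository. Assumes the coqui-tts fork keeps
# the upstream `TTS` package name and the standard XTTS v2 model id.
from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")  # downloads weights on first use
tts.tts_to_file(
    text="Hello from the coqui-tts fork.",
    speaker_wav="voices/example.wav",  # hypothetical reference clip for voice cloning
    language="en",
    file_path="out.wav",
)
```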

requirements-min.txt (+1, -2)

@@ -1,6 +1,5 @@
-pyyaml
 fastapi
 uvicorn
 loguru
 numpy<2
-piper-tts==1.2.0
+piper-tts

requirements-rocm.txt (+5, -14)

@@ -1,17 +1,8 @@
 fastapi
 uvicorn
 loguru
-# piper-tts
-piper-tts==1.2.0
-# xtts
-TTS==0.22.0
-# https://github.com/huggingface/transformers/issues/31040
-transformers<4.41.0
-deepspeed<0.14.0
-# XXX, 3.8+ has some issue for now
-spacy==3.7.4
-
-# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9
-# Re: https://github.com/pytorch/pytorch/issues/121834
-torch==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
-torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
+piper-tts
+coqui-tts
+deepspeed
+torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
+torchaudio; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"

requirements.txt (+6, -14)

@@ -1,22 +1,14 @@
 fastapi
 uvicorn
 loguru
-# piper-tts
-piper-tts==1.2.0
-# xtts
-TTS==0.22.0
-# https://github.com/huggingface/transformers/issues/31040
-transformers<4.41.0
-deepspeed<0.14.0
-# XXX, 3.8+ has some issue for now
-spacy==3.7.4
+piper-tts
+coqui-tts[languages]
+deepspeed
 
-# torch==2.2.2 Fixes: https://github.com/matatonic/openedai-speech/issues/9
-# Re: https://github.com/pytorch/pytorch/issues/121834
-torch==2.2.2; sys_platform != "darwin"
+torch; sys_platform != "darwin"
 torchaudio; sys_platform != "darwin"
 # for MPS accelerated torch on Mac - doesn't work yet, incomplete support in torch and torchaudio
-torch==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
-torchaudio==2.2.2; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
+torch; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
+torchaudio; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
 
 # ROCM (Linux only) - use requirements.amd.txt

speech.py (+2, -1)

@@ -92,7 +92,8 @@ def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
         self.not_idle()
         try:
             with torch.no_grad():
-                gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # XXX TODO: allow multiple wav
+                with self.lock: # this doesn't seem threadsafe, but it's quick enough
+                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # XXX TODO: allow multiple wav
 
                 for wav in self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs):
                     yield wav.cpu().numpy().tobytes() # assumes wav data is f32le
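The `with self.lock:` added above serializes `get_conditioning_latents()` across requests; the inline comment notes the call does not appear to be threadsafe. Below is a minimal sketch of that pattern, assuming `self.lock` is a `threading.Lock` created in the class constructor (the constructor is not part of this diff, and the class and method names are illustrative).

```
import threading

class ConditioningLatents:
    """Illustrative stand-in for the xtts wrapper; only the locking pattern mirrors the diff."""

    def __init__(self):
        self.lock = threading.Lock()  # assumed to be created like this in the real __init__

    def get(self, speaker_wav):
        # Serialize the non-threadsafe call so overlapping requests don't interleave;
        # the lock is held only for this quick step, not for the streaming inference.
        with self.lock:
            return self._compute(speaker_wav)

    def _compute(self, speaker_wav):
        # Placeholder for self.xtts.get_conditioning_latents(audio_path=[speaker_wav]).
        return (None, None)
```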
