Skip to content

Commit

Permalink
0.10.0
Browse files Browse the repository at this point in the history
  • Loading branch information
matatonic committed Apr 27, 2024
1 parent a2a3d2b commit 6864cf0
Show file tree
Hide file tree
Showing 16 changed files with 260 additions and 70 deletions.
127 changes: 127 additions & 0 deletions .github/workflows/build-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  # Builds the full (xtts-capable) image from ./Dockerfile and pushes it to GHCR.
  build-and-push-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): steps.meta outputs (tags/labels) are never consumed by the
      # build steps below — either wire them in or drop this step. Kept for parity
      # with the original workflow.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

  # Builds the minimal (piper-only) image from ./Dockerfile.min and pushes it to
  # GHCR under a hard-coded alternate image name.
  build-and-push-alt-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: matatonic/openedai-speech-min
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): steps.meta outputs are unused here as well — see the note in
      # the build-and-push-image job above.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
voices/
.env
speech.env
config/pre_process_map.yaml
config/voice_to_speaker.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
13 changes: 3 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,17 @@
FROM python:3.11-slim

ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1

RUN apt-get update && \
apt-get install --no-install-recommends -y curl git ffmpeg

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
COPY *.txt /app/
RUN pip install --no-cache -r requirements.txt
COPY *.sh /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
COPY *.sh *.py *.yaml *.md LICENSE config /app/

RUN apt-get clean && rm -rf /var/lib/apt/lists/*

CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
ENV CLI_COMMAND="python speech.py"
CMD $CLI_COMMAND
8 changes: 3 additions & 5 deletions Dockerfile.min
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@ FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl

RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
WORKDIR /app

RUN ./download_voices_tts-1.sh

RUN apt-get clean && rm -rf /var/lib/apt/lists/*

CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none
ENV CLI_COMMAND="python speech.py --xtts_device none"
CMD $CLI_COMMAND
68 changes: 49 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.


Version: 0.10.0, 2024-04-26

* Better upgrades: Reorganize config files under config, voice models under voices
  * **If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml`, you need to move them to the `config/` folder.**
* default listen host to 0.0.0.0

Version: 0.9.0, 2024-04-23

* Fix bug with yaml and loading UTF-8
Expand Down Expand Up @@ -54,45 +60,47 @@ API Documentation
Installation instructions
-------------------------

You can run the server via docker like so (**recommended**):
1) Download the models & voices
```shell
# for tts-1 / piper
bash download_voices_tts-1.sh
# and for tts-1-hd / xtts
bash download_voices_tts-1-hd.sh
```

2a) Docker (**recommended**): You can run the server via docker like so:
```shell
cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.
If you want a minimal docker image with piper support only (~1GB vs. ~10GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.

Manual instructions:
2b) Manual instructions:
```shell
# Install the Python requirements
pip install -r requirements.txt
# install ffmpeg and curl
sudo apt install ffmpeg curl
# Download the voice models:
# for tts-1
bash download_voices_tts-1.sh
# and for tts-1-hd
bash download_voices_tts-1-hd.sh
python speech.py
```

Usage
-----

```
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me,
but cpu is fast enough (default: False)
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use
piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
```

Sample API Usage
Expand Down Expand Up @@ -141,10 +149,32 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API.

```
$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
$ python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
```

```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
```

Custom Voices Howto
-------------------
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
some_other_voice_name_you_want:
model: voices/choose your own model.onnx
speaker: set your own speaker
alloy:
alloy:
model: voices/en_US-libritts_r-medium.onnx
speaker: 79 # 64, 79, 80, 101, 130
echo:
Expand All @@ -24,7 +24,7 @@
model: voices/en_US-libritts_r-medium.onnx
speaker: 163
tts-1-hd:
alloy:
alloy:
model: xtts
speaker: voices/alloy-alt.wav
alloy-orig:
Expand Down
13 changes: 7 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
services:
server:
build:
context: .
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"]
command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
image: ghcr.io/matatonic/openedai-speech
#image: ghcr.io/matatonic/openedai-speech-min
env_file: speech.env
ports:
- "8000:8000"
# volumes:
# - .:/app/
volumes:
- ./voices:/app/voices
- ./config:/app/config
#restart: unless-stopped # install as a service
# Below can be removed if not using GPU
runtime: nvidia
deploy:
Expand Down
2 changes: 1 addition & 1 deletion download_samples.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/sh
for i in alloy echo fable onyx nova shimmer; do
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
[ ! -e "voices/$i.wav" ] && curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
done
7 changes: 6 additions & 1 deletion download_voices_tts-1-hd.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
export TTS_HOME=voices

MODELS=${*:-xtts}
for model in $MODELS; do
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
done
./download_samples.sh
2 changes: 1 addition & 1 deletion download_voices_tts-1.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/sh
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high
models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
Expand Down
4 changes: 2 additions & 2 deletions openedai.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from fastapi.responses import PlainTextResponse

class OpenAIStub(FastAPI):
def __init__(self) -> None:
super().__init__()
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.models = {}

self.add_middleware(
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
fastapi
uvicorn
# piper-tts
piper-tts
piper-tts==1.2.0
onnxruntime-gpu
# xtts
TTS
Expand Down
6 changes: 6 additions & 0 deletions sample.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
#CLI_COMMAND="python speech.py --xtts_device none" # for piper only
Loading

0 comments on commit 6864cf0

Please sign in to comment.