Skip to content

Commit

Permalink
0.10.0
Browse files Browse the repository at this point in the history
  • Loading branch information
matatonic committed Apr 27, 2024
1 parent a2a3d2b commit 6864cf0
Show file tree
Hide file tree
Showing 16 changed files with 260 additions and 70 deletions.
127 changes: 127 additions & 0 deletions .github/workflows/build-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
name: Build and Publish Docker Image

on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
  release:
    types: [published]

jobs:
  # Builds the full (xtts-capable) image from ./Dockerfile and pushes it to GHCR.
  build-and-push-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: ${{ github.repository }}
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): steps.meta outputs (tags/labels) are never consumed by the
      # build steps below — either wire them in or drop this step. Kept for parity
      # with the original workflow.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

  # Builds the minimal (piper-only) image from ./Dockerfile.min and pushes it to
  # GHCR under a hard-coded alternate image name.
  build-and-push-alt-image:
    runs-on: ubuntu-latest

    permissions:
      contents: read
      packages: write

    env:
      # Set up environment variables for the job
      DOCKER_REGISTRY: ghcr.io
      IMAGE_NAME: matatonic/openedai-speech-min
      TAG: ${{ github.sha }}

    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true

      # Log in to the GitHub Container Registry only when not running on a pull request event
      - name: Login to Docker Registry
        uses: docker/login-action@v2
        with:
          registry: ${{ env.DOCKER_REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): steps.meta outputs are unused here as well — see the note in
      # the build-and-push-image job above.
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push the Docker image to GHCR for the main branch or specific tags
      - name: Build and Push Docker Image
        if: github.ref == 'refs/heads/main'
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
          labels: version=${{ github.run_id }}

      # For tagged releases, build and push the Docker image with the corresponding tag
      - name: Build and Push Docker Image (Tagged)
        if: startsWith(github.ref, 'refs/tags/')
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.min
          push: true
          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
          labels: version=${{ github.run_id }}

4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
voices/
.env
speech.env
config/pre_process_map.yaml
config/voice_to_speaker.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
13 changes: 3 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,24 +1,17 @@
FROM python:3.11-slim

ENV COQUI_TOS_AGREED=1
ENV PRELOAD_MODEL=xtts
# or PRELOAD_MODEL=parler-tts/parler_tts_mini_v0.1

RUN apt-get update && \
apt-get install --no-install-recommends -y curl git ffmpeg

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
# default clone of the default voice is really bad, use a better default
COPY voices/alloy-alt.wav /app/voices/
WORKDIR /app
COPY *.txt /app/
RUN pip install --no-cache -r requirements.txt
COPY *.sh /app/
RUN ./download_voices_tts-1.sh
RUN ./download_voices_tts-1-hd.sh
COPY *.py *.yaml *.md LICENSE /app/
COPY *.sh *.py *.yaml *.md LICENSE config /app/

RUN apt-get clean && rm -rf /var/lib/apt/lists/*

CMD python speech.py --host 0.0.0.0 --port 8000 --preload $PRELOAD_MODEL
ENV CLI_COMMAND="python speech.py"
CMD $CLI_COMMAND
8 changes: 3 additions & 5 deletions Dockerfile.min
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@ FROM python:3.11-slim
RUN apt-get update && \
apt-get install --no-install-recommends -y ffmpeg curl

RUN pip install --no-cache piper-tts pyyaml fastapi uvicorn
RUN pip install --no-cache piper-tts==1.2.0 pyyaml fastapi uvicorn

#RUN git clone https://github.com/matatonic/openedai-speech /app
RUN mkdir -p /app/voices
COPY *.py *.yaml *.txt *.md *.sh LICENSE /app/
WORKDIR /app

RUN ./download_voices_tts-1.sh

RUN apt-get clean && rm -rf /var/lib/apt/lists/*

CMD python speech.py --host 0.0.0.0 --port 8000 --xtts_device none
ENV CLI_COMMAND="python speech.py --xtts_device none"
CMD $CLI_COMMAND
68 changes: 49 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ Details:
If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.


Version: 0.10.0, 2024-04-26

* Better upgrades: Reorganize config files under config, voice models under voices
  * **If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml`, you need to move them to the `config/` folder.**
* default listen host to 0.0.0.0

Version: 0.9.0, 2024-04-23

* Fix bug with yaml and loading UTF-8
Expand Down Expand Up @@ -54,45 +60,47 @@ API Documentation
Installation instructions
-------------------------

You can run the server via docker like so (**recommended**):
1) Download the models & voices
```shell
# for tts-1 / piper
bash download_voices_tts-1.sh
# and for tts-1-hd / xtts
bash download_voices_tts-1-hd.sh
```

2a) Docker (**recommended**): You can run the server via docker like so:
```shell
cp sample.env speech.env # edit to suit your environment as needed, you can preload a model on startup
docker compose up
```
If you want a minimal docker image with piper support only (900MB vs. 13.5GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.
If you want a minimal docker image with piper support only (~1GB vs. ~10GB, see: Dockerfile.min). You can edit the `docker-compose.yml` to easily change this.

Manual instructions:
2b) Manual instructions:
```shell
# Install the Python requirements
pip install -r requirements.txt
# install ffmpeg and curl
sudo apt install ffmpeg curl
# Download the voice models:
# for tts-1
bash download_voices_tts-1.sh
# and for tts-1-hd
bash download_voices_tts-1-hd.sh
python speech.py
```

Usage
-----

```
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT]
[-H HOST]
usage: speech.py [-h] [--piper_cuda] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [-P PORT] [-H HOST]
OpenedAI Speech API Server
options:
-h, --help show this help message and exit
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me,
but cpu is fast enough (default: False)
--piper_cuda Enable cuda for piper. Note: --cuda/onnxruntime-gpu is not working for me, but cpu is fast enough (default: False)
--xtts_device XTTS_DEVICE
Set the device for the xtts model. The special value of 'none' will use
piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on
first use. (default: None)
Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
--preload PRELOAD Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
-P PORT, --port PORT Server tcp port (default: 8000)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: localhost)
-H HOST, --host HOST Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
```

Sample API Usage
Expand Down Expand Up @@ -141,10 +149,32 @@ with client.audio.speech.with_streaming_response.create(
Also see the `say.py` sample application for an example of how to use the openai-python API.

```
$ python say.py -i "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -i "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
$ python say.py -t "The quick brown fox jumped over the lazy dog." -p # play the audio, requires 'pip install playsound'
$ python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac # save to a file.
```

```
usage: say.py [-h] [-m MODEL] [-v VOICE] [-f {mp3,aac,opus,flac}] [-s SPEED] [-t TEXT] [-i INPUT] [-o OUTPUT] [-p]
Text to speech using the OpenAI API
options:
-h, --help show this help message and exit
-m MODEL, --model MODEL
The model to use (default: tts-1)
-v VOICE, --voice VOICE
The voice of the speaker (default: alloy)
-f {mp3,aac,opus,flac}, --format {mp3,aac,opus,flac}
The output audio format (default: mp3)
-s SPEED, --speed SPEED
playback speed, 0.25-4.0 (default: 1.0)
-t TEXT, --text TEXT Provide text to read on the command line (default: None)
-i INPUT, --input INPUT
Read text from a file (default is to read from stdin) (default: None)
-o OUTPUT, --output OUTPUT
The filename to save the output to (default: None)
-p, --playsound Play the audio (default: False)
```

Custom Voices Howto
-------------------
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
some_other_voice_name_you_want:
model: voices/choose your own model.onnx
speaker: set your own speaker
alloy:
alloy:
model: voices/en_US-libritts_r-medium.onnx
speaker: 79 # 64, 79, 80, 101, 130
echo:
Expand All @@ -24,7 +24,7 @@
model: voices/en_US-libritts_r-medium.onnx
speaker: 163
tts-1-hd:
alloy:
alloy:
model: xtts
speaker: voices/alloy-alt.wav
alloy-orig:
Expand Down
13 changes: 7 additions & 6 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
services:
server:
build:
context: .
dockerfile: Dockerfile # for tts-1-hd support via xtts_v2, ~4GB VRAM required, ~10GB
#dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "parler-tts/parler_tts_mini_v0.1"]
command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--preload", "xtts"]
#command: ["python", "speech.py", "--host", "0.0.0.0", "--port", "8000", "--xtts_device", "none"] # min
image: ghcr.io/matatonic/openedai-speech
#image: ghcr.io/matatonic/openedai-speech-min
env_file: speech.env
ports:
- "8000:8000"
# volumes:
# - .:/app/
volumes:
- ./voices:/app/voices
- ./config:/app/config
#restart: unless-stopped # install as a service
# Below can be removed if not using GPU
runtime: nvidia
deploy:
Expand Down
2 changes: 1 addition & 1 deletion download_samples.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/sh
for i in alloy echo fable onyx nova shimmer; do
curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
[ ! -e "voices/$i.wav" ] && curl -s https://cdn.openai.com/API/docs/audio/$i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices/$i.wav
done
7 changes: 6 additions & 1 deletion download_voices_tts-1-hd.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
#!/bin/sh
export COQUI_TOS_AGREED=1
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$PRELOAD_MODEL')"
export TTS_HOME=voices

MODELS=${*:-xtts}
for model in $MODELS; do
python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('$model')"
done
./download_samples.sh
2 changes: 1 addition & 1 deletion download_voices_tts-1.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/sh
models="en_GB-northern_english_male-medium en_US-libritts_r-medium" # en_US-ryan-high
models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
for i in $models ; do
piper --data-dir voices --download-dir voices --model $i < /dev/null > /dev/null
Expand Down
4 changes: 2 additions & 2 deletions openedai.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
from fastapi.responses import PlainTextResponse

class OpenAIStub(FastAPI):
def __init__(self) -> None:
super().__init__()
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.models = {}

self.add_middleware(
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
fastapi
uvicorn
# piper-tts
piper-tts
piper-tts==1.2.0
onnxruntime-gpu
# xtts
TTS
Expand Down
6 changes: 6 additions & 0 deletions sample.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
TTS_HOME=voices
HF_HOME=voices
#PRELOAD_MODEL=xtts
#PRELOAD_MODEL=xtts_v2.0.2
#CLI_COMMAND="python speech.py --preload $PRELOAD_MODEL"
#CLI_COMMAND="python speech.py --xtts_device none" # for piper only
Loading

0 comments on commit 6864cf0

Please sign in to comment.