Spaces:

imseldrith
/

TTS-OPENAI-FREE

Running

App Files Files Community

Upload folder using huggingface_hub

by imseldrith - opened Jul 24

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+2683

-11

Files changed (31) hide show

.github/workflows/build-docker.yml +196 -0
.gitignore +166 -0
Dockerfile +35 -0
Dockerfile.min +20 -0
LICENSE +661 -0
README.md +397 -11
add_voice.py +63 -0
audio_reader.py +127 -0
config/config_files_will_go_here.txt +0 -0
docker-compose.min.yml +13 -0
docker-compose.rocm.yml +27 -0
docker-compose.yml +21 -0
download_samples.bat +6 -0
download_samples.sh +4 -0
download_voices_tts-1-hd.bat +8 -0
download_voices_tts-1-hd.sh +8 -0
download_voices_tts-1.bat +8 -0
download_voices_tts-1.sh +6 -0
openedai.py +181 -0
pre_process_map.default.yaml +37 -0
requirements-min.txt +5 -0
requirements-rocm.txt +10 -0
requirements.txt +16 -0
sample.env +6 -0
say.py +96 -0
speech.py +415 -0
startup.bat +8 -0
startup.min.sh +7 -0
startup.sh +10 -0
test_voices.sh +67 -0
voice_to_speaker.default.yaml +59 -0

.github/workflows/build-docker.yml ADDED Viewed

	@@ -0,0 +1,196 @@

+# Builds the project's container images and publishes them to the registry
+# configured in each job below.
+name: Build and Publish Docker Image
+on:
+  # Manual trigger.
+  workflow_dispatch:
+  # Every push to the main branch.
+  push:
+    branches:
+      - main
+  # Every published release.
+  release:
+    types:
+      - published
+jobs:
+  # Builds the default image from `Dockerfile` and pushes it to the registry as
+  # `<repository>:latest` when the ref is main, or `<repository>:<tag>` when the
+  # ref is a tag.
+  build-and-push-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets GITHUB_TOKEN push packages to the registry.
+      packages: write
+    env:
+      # Set up environment variables for the job
+      DOCKER_REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}
+      # NOTE(review): TAG is not referenced by any step in this job.
+      TAG: ${{ github.sha }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+      # Register QEMU emulators before Buildx so the linux/arm64 half of the
+      # multi-platform build can execute its Dockerfile RUN steps.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          install: true
+      # Log in to the registry. This step is unconditional; the workflow has no
+      # pull_request trigger, so it never runs for pull requests.
+      - name: Login to Docker Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.DOCKER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not
+      # consumed below; the build steps set their tags and labels explicitly.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
+      # On the main branch, build and push the image tagged `latest`
+      - name: Build and Push Docker Image
+        if: github.ref == 'refs/heads/main'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
+      # For tagged releases, build and push the Docker image with the corresponding tag
+      - name: Build and Push Docker Image (Tagged)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+          labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
+  # Builds the minimal image from `Dockerfile.min` and pushes it to the registry
+  # as `<repository>-min:latest` when the ref is main, or `<repository>-min:<tag>`
+  # when the ref is a tag.
+  build-and-push-min-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets GITHUB_TOKEN push packages to the registry.
+      packages: write
+    env:
+      # Set up environment variables for the job
+      DOCKER_REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}-min
+      # NOTE(review): TAG is not referenced by any step in this job.
+      TAG: ${{ github.sha }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+      # Register QEMU emulators before Buildx so the linux/arm64 half of the
+      # multi-platform build can execute its Dockerfile RUN steps.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          install: true
+      # Log in to the registry. This step is unconditional; the workflow has no
+      # pull_request trigger, so it never runs for pull requests.
+      - name: Login to Docker Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.DOCKER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not
+      # consumed below; the build steps set their tags and labels explicitly.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
+      # On the main branch, build and push the image tagged `latest`
+      - name: Build and Push Docker Image
+        if: github.ref == 'refs/heads/main'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile.min
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
+      # For tagged releases, build and push the Docker image with the corresponding tag
+      - name: Build and Push Docker Image (Tagged)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile.min
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+          labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
+  # Builds the ROCm variant from `Dockerfile` with the USE_ROCM=1 build argument
+  # and pushes it to the registry as `<repository>-rocm:latest` when the ref is
+  # main, or `<repository>-rocm:<tag>` when the ref is a tag.
+  build-and-push-rocm-image:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      # Lets GITHUB_TOKEN push packages to the registry.
+      packages: write
+    env:
+      # Set up environment variables for the job
+      # NOTE(review): the image build receives USE_ROCM through `build-args`
+      # below, not through this job-level variable.
+      USE_ROCM: 1
+      DOCKER_REGISTRY: ghcr.io
+      IMAGE_NAME: ${{ github.repository }}-rocm
+      # NOTE(review): TAG is not referenced by any step in this job.
+      TAG: ${{ github.sha }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+      # Register QEMU emulators before Buildx so the linux/arm64 half of the
+      # multi-platform build can execute its Dockerfile RUN steps.
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          install: true
+      # Log in to the registry. This step is unconditional; the workflow has no
+      # pull_request trigger, so it never runs for pull requests.
+      - name: Login to Docker Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.DOCKER_REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      # NOTE(review): the outputs of this step (steps.meta.outputs.*) are not
+      # consumed below; the build steps set their tags and labels explicitly.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}
+      # On the main branch, build and push the image tagged `latest`
+      # NOTE(review): confirm the ROCm requirements actually resolve on
+      # linux/arm64 before relying on that platform.
+      - name: Build and Push Docker Image
+        if: github.ref == 'refs/heads/main'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:latest
+          labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
+          build-args: |
+            USE_ROCM=1
+      # For tagged releases, build and push the Docker image with the corresponding tag
+      - name: Build and Push Docker Image (Tagged)
+        if: startsWith(github.ref, 'refs/tags/')
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: Dockerfile
+          push: true
+          tags: ${{ env.DOCKER_REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+          labels: version=${{ github.run_id }}
+          platforms: linux/amd64,linux/arm64
+          build-args: |
+            USE_ROCM=1

.gitignore ADDED Viewed

	@@ -0,0 +1,166 @@

+# Project-specific entries: the voices/ directory, local env files, and two files under config/
+# (presumably user-edited copies of the shipped *.default.yaml files; verify).
+# Note: .env is listed a second time under "Environments" further down.
+voices/
+.env
+speech.env
+config/pre_process_map.yaml
+config/voice_to_speaker.yaml
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

Dockerfile ADDED Viewed

	@@ -0,0 +1,35 @@

+# Full image: Python 3.11 with ffmpeg plus everything in requirements.txt
+# (requirements-rocm.txt is used instead when built with USE_ROCM=1).
+FROM python:3.11-slim
+RUN --mount=type=cache,target=/root/.cache/pip pip install -U pip
+# Platform of the image being built, e.g. linux/amd64 or linux/arm64.
+ARG TARGETPLATFORM
+# Install system packages and drop the apt caches in the SAME layer: a cleanup in a later RUN
+# cannot shrink layers that are already committed.
+# Non-amd64 targets also get a C toolchain (and rustup below), presumably to compile Python
+# dependencies that have no prebuilt wheels for those platforms.
+RUN apt-get update \
+    && apt-get install --no-install-recommends -y curl ffmpeg \
+    && if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
+# rustup installs into /root/.cargo/bin; on amd64 the directory simply does not exist.
+ENV PATH="/root/.cargo/bin:${PATH}"
+# for deepspeed support - doesn't seem worth it, image +7.5GB, over the 10GB ghcr.io limit, and no noticeable gain in speed or VRAM usage?
+# (if re-enabled, append the same apt cleanup to the apt-get RUN below)
+#RUN curl -O https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.1-1_all.deb
+#RUN dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb
+#RUN apt-get update && apt-get install --no-install-recommends -y libaio-dev build-essential cuda-toolkit
+#ENV CUDA_HOME=/usr/local/cuda
+WORKDIR /app
+RUN mkdir -p voices config
+# USE_ROCM=1 swaps in the ROCm requirements file before dependencies are installed.
+ARG USE_ROCM
+ENV USE_ROCM=${USE_ROCM}
+COPY requirements*.txt /app/
+RUN if [ "${USE_ROCM}" = "1" ]; then mv /app/requirements-rocm.txt /app/requirements.txt; fi
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements.txt
+# Application code is copied after the dependency install so code-only changes reuse the cached layers above.
+COPY *.py *.sh *.default.yaml README.md LICENSE /app/
+# Build-time value exported into the runtime environment.
+ARG PRELOAD_MODEL
+ENV PRELOAD_MODEL=${PRELOAD_MODEL}
+# Both variables point at the voices directory (relative to /app).
+ENV TTS_HOME=voices
+ENV HF_HOME=voices
+ENV COQUI_TOS_AGREED=1
+# Exec form: bash is the container's main process, with no intermediate `/bin/sh -c` wrapper.
+CMD ["bash", "startup.sh"]

Dockerfile.min ADDED Viewed

	@@ -0,0 +1,20 @@

+# Minimal image: Python 3.11 with ffmpeg plus only the packages in requirements-min.txt.
+FROM python:3.11-slim
+# Platform of the image being built, e.g. linux/amd64 or linux/arm64.
+ARG TARGETPLATFORM
+# Install system packages and drop the apt caches in the SAME layer: a cleanup in a later RUN
+# cannot shrink layers that are already committed.
+# Non-amd64 targets also get a C toolchain (and rustup below), presumably to compile Python
+# dependencies that have no prebuilt wheels for those platforms.
+RUN apt-get update \
+    && apt-get install --no-install-recommends -y curl ffmpeg \
+    && if [ "$TARGETPLATFORM" != "linux/amd64" ]; then apt-get install --no-install-recommends -y build-essential ; fi \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+RUN if [ "$TARGETPLATFORM" != "linux/amd64" ]; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y ; fi
+# rustup installs into /root/.cargo/bin; on amd64 the directory simply does not exist.
+ENV PATH="/root/.cargo/bin:${PATH}"
+WORKDIR /app
+RUN mkdir -p voices config
+COPY requirements*.txt /app/
+RUN --mount=type=cache,target=/root/.cache/pip pip install -r requirements-min.txt
+# Application code is copied after the dependency install so code-only changes reuse the cached layers above.
+COPY *.py *.sh *.default.yaml README.md LICENSE /app/
+# Both variables point at the voices directory (relative to /app).
+ENV TTS_HOME=voices
+ENV HF_HOME=voices
+# Exec form: bash is the container's main process, with no intermediate `/bin/sh -c` wrapper.
+CMD ["bash", "startup.min.sh"]

LICENSE ADDED Viewed

	@@ -0,0 +1,661 @@

+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate.  Many developers of free software are heartened and
+encouraged by the resulting cooperation.  However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community.  It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server.  Therefore, public use of a modified version, on
+a publicly accessible server, gives the public access to the source
+code of the modified version.
+  An older license, called the Affero General Public License and
+published by Affero, was designed to accomplish similar goals.  This is
+a different license, not a version of the Affero GPL, but Affero has
+released a new version of the Affero GPL which permits relicensing under
+this license.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                       TERMS AND CONDITIONS
+  0. Definitions.
+  "This License" refers to version 3 of the GNU Affero General Public License.
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+  1. Source Code.
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+  The Corresponding Source for a work in source code form is that
+same work.
+  2. Basic Permissions.
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+  4. Conveying Verbatim Copies.
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+  5. Conveying Modified Source Versions.
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+  6. Conveying Non-Source Forms.
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+  7. Additional Terms.
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+  8. Termination.
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+  9. Acceptance Not Required for Having Copies.
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+  10. Automatic Licensing of Downstream Recipients.
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+  11. Patents.
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+  12. No Surrender of Others' Freedom.
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+  13. Remote Network Interaction; Use with the GNU General Public License.
+  Notwithstanding any other provision of this License, if you modify the
+Program, your modified version must prominently offer all users
+interacting with it remotely through a computer network (if your version
+supports such interaction) an opportunity to receive the Corresponding
+Source of your version by providing access to the Corresponding Source
+from a network server at no charge, through some standard or customary
+means of facilitating copying of software.  This Corresponding Source
+shall include the Corresponding Source for any work covered by version 3
+of the GNU General Public License that is incorporated pursuant to the
+following paragraph.
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the work with which it is combined will remain governed by version
+3 of the GNU General Public License.
+  14. Revised Versions of this License.
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU Affero General Public License from time to time.  Such new versions
+will be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU Affero General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU Affero General Public License, you may choose any version ever published
+by the Free Software Foundation.
+  If the Program specifies that a proxy can decide which future
+versions of the GNU Affero General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+  15. Disclaimer of Warranty.
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+  16. Limitation of Liability.
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+  17. Interpretation of Sections 15 and 16.
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published
+    by the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+Also add information on how to contact you by electronic and paper mail.
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source.  For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code.  There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.

README.md CHANGED Viewed

@@ -1,11 +1,397 @@
----
-title: TTS OPENAI FREE
-emoji: 👀
-colorFrom: indigo
-colorTo: blue
-sdk: docker
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# OpenedAI Speech
+An OpenAI API compatible text to speech server.
+* Compatible with the OpenAI audio/speech API
+* Serves the [/v1/audio/speech endpoint](https://platform.openai.com/docs/api-reference/audio/createSpeech)
+* Not affiliated with OpenAI in any way, does not require an OpenAI API Key
+* A free, private, text-to-speech server with custom voice cloning
+Full Compatibility:
+* `tts-1`: `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable)
+* `tts-1-hd`:  `alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer` (configurable, uses OpenAI samples by default)
+* response_format: `mp3`, `opus`, `aac`, `flac`, `wav` and `pcm`
+* speed 0.25-4.0 (and more)
+Details:
+* Model `tts-1` via [piper tts](https://github.com/rhasspy/piper) (very fast, runs on cpu)
+  * You can map your own [piper voices](https://rhasspy.github.io/piper-samples/) via the `voice_to_speaker.yaml` configuration file
+* Model `tts-1-hd` via [coqui-ai/TTS](https://github.com/coqui-ai/TTS) xtts_v2 voice cloning (fast, but requires around 4GB GPU VRAM)
+  * Custom cloned voices can be used for `tts-1-hd`, see: [Custom Voices Howto](#custom-voices-howto)
+  * 🌐 [Multilingual](#multilingual) support with XTTS voices, the language is automatically detected if not set
+  * [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
+  * Configurable [generation parameters](#generation-parameters)
+  * Streamed output while generating
+* Occasionally, certain words or symbols may sound incorrect; you can fix them with regex via `pre_process_map.yaml`
+* Tested with python 3.9-3.11, piper does not install on python 3.12 yet
+If you find a better voice match for `tts-1` or `tts-1-hd`, please let me know so I can update the defaults.
+## Recent Changes
+Version 0.17.2, 2024-07-01
+* fix -min image (re: langdetect)
+Version 0.17.1, 2024-07-01
+* fix ROCm (add langdetect to requirements-rocm.txt)
+* Fix zh-cn for xtts
+Version 0.17.0, 2024-07-01
+* Automatic language detection, thanks [@RodolfoCastanheira](https://github.com/RodolfoCastanheira)
+Version 0.16.0, 2024-06-29
+* Multi-client safe version. Audio generation is synchronized in a single process. The estimated 'realtime' factor of XTTS on a GPU is roughly 1/3, this means that multiple streams simultaneously, or `speed` over 2, may experience audio underrun (delays or pauses in playback). This makes multiple clients possible and safe, but in practice 2 or 3 simultaneous streams is the maximum without audio underrun.
+Version 0.15.1, 2024-06-27
+* Remove deepspeed from requirements.txt, it's too complex for typical users. A more detailed deepspeed install document will be required.
+Version 0.15.0, 2024-06-26
+* Switch to [coqui-tts](https://github.com/idiap/coqui-ai-TTS) (updated fork), updated simpler dependencies, torch 2.3, etc.
+* Resolve cuda threading issues
+Version 0.14.1, 2024-06-26
+* Make deepspeed possible (`--use-deepspeed`), but not enabled in pre-built docker images (too large). Requires the cuda-toolkit installed, see the Dockerfile comment for details
+Version 0.14.0, 2024-06-26
+* Added `response_format`: `wav` and `pcm` support
+* Output streaming (while generating) for `tts-1` and `tts-1-hd`
+* Enhanced [generation parameters](#generation-parameters) for xtts models (temperature, top_p, etc.)
+* Idle unload timer (optional) - doesn't work perfectly yet
+* Improved error handling
+Version 0.13.0, 2024-06-25
+* Added [Custom fine-tuned XTTS model support](#custom-fine-tuned-model-support)
+* Initial prebuilt arm64 image support (Apple M-series, Raspberry Pi - MPS is not supported in XTTS/torch), thanks [@JakeStevenson](https://github.com/JakeStevenson), [@hchasens](https://github.com/hchasens)
+* Initial attempt at AMD GPU (ROCm 5.7) support
+* Parler-tts support removed
+* Move the *.default.yaml to the root folder
+* Run the docker as a service by default (`restart: unless-stopped`)
+* Added `audio_reader.py` for streaming text input and reading long texts
+Version 0.12.3, 2024-06-17
+* Additional logging details for BadRequests (400)
+Version 0.12.2, 2024-06-16
+* Fix :min image requirements (numpy<2?)
+Version 0.12.0, 2024-06-16
+* Improved error handling and logging
+* Restore the original alloy tts-1-hd voice by default, use alloy-alt for the old voice.
+Version 0.11.0, 2024-05-29
+* 🌐 [Multilingual](#multilingual) support (16 languages) with XTTS
+* Remove high Unicode filtering from the default `config/pre_process_map.yaml`
+* Update Docker build & app startup. thanks @justinh-rahb
+* Fix: "Plan failed with a cudnnException"
+* Remove piper cuda support
+Version: 0.10.1, 2024-05-05
+* Remove `runtime: nvidia` from docker-compose.yml, this assumes nvidia/cuda compatible runtime is available by default. thanks [@jmtatsch](https://github.com/jmtatsch)
+Version: 0.10.0, 2024-04-27
+* Pre-built & tested docker images, smaller docker images (8GB or 860MB)
+* Better upgrades: reorganize config files under `config/`, voice models under `voices/`
+* **Compatibility!** If you customized your `voice_to_speaker.yaml` or `pre_process_map.yaml` you need to move them to the `config/` folder.
+* default listen host to 0.0.0.0
+Version: 0.9.0, 2024-04-23
+* Fix bug with yaml and loading UTF-8
+* New sample text-to-speech application `say.py`
+* Smaller docker base image
+* Add beta [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) support (you can describe very basic features of the speaker voice), See: (https://www.text-description-to-speech.com/) for some examples of how to describe voices. Voices can be defined in the `voice_to_speaker.default.yaml`. Two example [parler-tts](https://huggingface.co/parler-tts/parler_tts_mini_v0.1) voices are included in the `voice_to_speaker.default.yaml` file. `parler-tts` is experimental software and is kind of slow. The exact voice will be slightly different each generation but should be similar to the basic description.
+...
+Version: 0.7.3, 2024-03-20
+* Allow different xtts versions per voice in `voice_to_speaker.yaml`, ex. xtts_v2.0.2
+* Quality: Fix xtts sample rate (24000 vs. 22050 for piper) and pops
+## Installation instructions
+### Create a `speech.env` environment file
+Copy the `sample.env` to `speech.env` (customize if needed)
+```bash
+cp sample.env speech.env
+```
+#### Defaults
+```bash
+TTS_HOME=voices
+HF_HOME=voices
+#PRELOAD_MODEL=xtts
+#PRELOAD_MODEL=xtts_v2.0.2
+#EXTRA_ARGS=--log-level DEBUG --unload-timer 300
+#USE_ROCM=1
+```
+### Option A: Manual installation
+```shell
+# install curl and ffmpeg
+sudo apt install curl ffmpeg
+# Create & activate a new virtual environment (optional but recommended)
+python -m venv .venv
+source .venv/bin/activate
+# Install the Python requirements
+# - use requirements-rocm.txt for AMD GPU (ROCm support)
+# - use requirements-min.txt for piper only (CPU only)
+pip install -U -r requirements.txt
+# run the server
+bash startup.sh
+```
+> On first run, the voice models will be downloaded automatically. This might take a while depending on your network connection.
+### Option B: Docker Image (*recommended*)
+#### Nvidia GPU (cuda)
+```shell
+docker compose up
+```
+#### AMD GPU (ROCm support)
+```shell
+docker compose -f docker-compose.rocm.yml up
+```
+#### ARM64 (Apple M-series, Raspberry Pi)
+> XTTS only has CPU support here and will be very slow, you can use the Nvidia image for XTTS with CPU (slow), or use the piper only image (recommended)
+#### CPU only, No GPU (piper only)
+> For a minimal docker image with only piper support (<1GB vs. 8GB).
+```shell
+docker compose -f docker-compose.min.yml up
+```
+## Server Options
+```shell
+usage: speech.py [-h] [--xtts_device XTTS_DEVICE] [--preload PRELOAD] [--unload-timer UNLOAD_TIMER] [--use-deepspeed] [--no-cache-speaker] [-P PORT] [-H HOST]
+                 [-L {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
+OpenedAI Speech API Server
+options:
+  -h, --help            show this help message and exit
+  --xtts_device XTTS_DEVICE
+                        Set the device for the xtts model. The special value of 'none' will use piper for all models. (default: cuda)
+  --preload PRELOAD     Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use. (default: None)
+  --unload-timer UNLOAD_TIMER
+                        Idle unload timer for the XTTS model in seconds, Ex. 900 for 15 minutes (default: None)
+  --use-deepspeed       Use deepspeed with xtts (this option is unsupported) (default: False)
+  --no-cache-speaker    Don't use the speaker wav embeddings cache (default: False)
+  -P PORT, --port PORT  Server tcp port (default: 8000)
+  -H HOST, --host HOST  Host to listen on, Ex. 0.0.0.0 (default: 0.0.0.0)
+  -L {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
+                        Set the log level (default: INFO)
+```
+## Sample Usage
+You can use it like this:
+```shell
+curl http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
+    "model": "tts-1",
+    "input": "The quick brown fox jumped over the lazy dog.",
+    "voice": "alloy",
+    "response_format": "mp3",
+    "speed": 1.0
+  }' > speech.mp3
+```
+Or just like this:
+```shell
+curl -s http://localhost:8000/v1/audio/speech -H "Content-Type: application/json" -d '{
+    "input": "The quick brown fox jumped over the lazy dog."}' > speech.mp3
+```
+Or like this example from the [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech):
+```python
+import openai
+client = openai.OpenAI(
+  # This part is not needed if you set these environment variables before import openai
+  # export OPENAI_API_KEY=sk-11111111111
+  # export OPENAI_BASE_URL=http://localhost:8000/v1
+  api_key = "sk-111111111",
+  base_url = "http://localhost:8000/v1",
+)
+with client.audio.speech.with_streaming_response.create(
+  model="tts-1",
+  voice="alloy",
+  input="Today is a wonderful day to build something people love!"
+) as response:
+  response.stream_to_file("speech.mp3")
+```
+Also see the `say.py` sample application for an example of how to use the openai-python API.
+```shell
+# play the audio, requires 'pip install playsound'
+python say.py -t "The quick brown fox jumped over the lazy dog." -p
+# save to a file in flac format
+python say.py -t "The quick brown fox jumped over the lazy dog." -m tts-1-hd -v onyx -f flac -o fox.flac
+```
+You can also try the included `audio_reader.py` for listening to longer text and streamed input.
+Example usage:
+```bash
+python audio_reader.py -s 2 < LICENSE # read the software license - fast
+```
+## OpenAI API Documentation and Guide
+* [OpenAI Text to speech guide](https://platform.openai.com/docs/guides/text-to-speech)
+* [OpenAI API Reference](https://platform.openai.com/docs/api-reference/audio/createSpeech)
+## Custom Voices Howto
+### Piper
+  1. Select the piper voice and model from the [piper samples](https://rhasspy.github.io/piper-samples/)
+  2. Update the `config/voice_to_speaker.yaml` with a new section for the voice, for example:
+```yaml
+...
+tts-1:
+  ryan:
+    model: voices/en_US-ryan-high.onnx
+    speaker: # default speaker
+```
+  3. New models will be downloaded as needed, or you can download them in advance with `download_voices_tts-1.sh`. For example:
+```shell
+bash download_voices_tts-1.sh en_US-ryan-high
+```
+### Coqui XTTS v2
+Coqui XTTS v2 voice cloning can work with as little as 6 seconds of clear audio. To create a custom voice clone, you must prepare a WAV file sample of the voice.
+#### Guidelines for preparing good sample files for Coqui XTTS v2
+* Mono (single channel) 22050 Hz WAV file
+* 6-30 seconds long - longer isn't always better (I've had some good results with as little as 4 seconds)
+* low noise (no hiss or hum)
+* No partial words, breathing, laughing, music or background sounds
+* An even speaking pace with a variety of words is best, like in interviews or audiobooks.
+You can use FFmpeg to prepare your audio files, here are some examples:
+```shell
+# convert a multi-channel audio file to mono, set sample rate to 22050 hz, trim to 6 seconds, and output as WAV file.
+ffmpeg -i input.mp3 -ac 1 -ar 22050 -t 6 -y me.wav
+# use a simple noise filter to clean up audio, and select a start time for sampling.
+ffmpeg -i input.wav -af "highpass=f=200, lowpass=f=3000" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
+# A more complex noise reduction setup, including volume adjustment
+ffmpeg -i input.mkv -af "highpass=f=200, lowpass=f=3000, volume=5, afftdn=nf=25" -ac 1 -ar 22050 -ss 00:13:26.2 -t 6 -y me.wav
+```
+Once your WAV file is prepared, save it in the `/voices/` directory and update the `config/voice_to_speaker.yaml` file with the new file name.
+For example:
+```yaml
+...
+tts-1-hd:
+  me:
+    model: xtts
+    speaker: voices/me.wav # this could be you
+```
+## Multilingual
+Multilingual cloning support was added in version 0.11.0 and is available only with the XTTS v2 model. To use multilingual voices with piper simply download a language specific voice.
+Coqui XTTSv2 has support for multiple languages: English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Turkish (`tr`), Russian (`ru`), Dutch (`nl`), Czech (`cs`), Arabic (`ar`), Chinese (`zh-cn`), Hungarian (`hu`), Korean (`ko`), Japanese (`ja`), and Hindi (`hi`). When not set, an attempt will be made to automatically detect the language, falling back to English (`en`).
+Unfortunately the OpenAI API does not support language, but you can create your own custom speaker voice and set the language for that.
+1) Create the WAV file for your speaker, as in [Custom Voices Howto](#custom-voices-howto)
+2) Add the voice to `config/voice_to_speaker.yaml` and include the correct Coqui `language` code for the speaker. For example:
+```yaml
+  xunjiang:
+    model: xtts
+    speaker: voices/xunjiang.wav
+    language: zh-cn
+```
+3) Don't remove high unicode characters in your `config/pre_process_map.yaml`! If you have these lines, you will need to remove them. For example:
+Remove:
+```yaml
+- - '[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+'
+  - ''
+```
+These lines were added to the `config/pre_process_map.yaml` config file by default before version 0.11.0.
+4) Your new multi-lingual speaker voice is ready to use!
+## Custom Fine-Tuned Model Support
+Adding a custom xtts model is simple. Here is an example of how to add a custom fine-tuned 'halo' XTTS model.
+1) Save the model folder under `voices/` (all 4 files are required, including the vocab.json from the model)
+```
+openedai-speech$ ls voices/halo/
+config.json  vocab.json  model.pth  sample.wav
+```
+2) Add the custom voice entry under the `tts-1-hd` section of `config/voice_to_speaker.yaml`:
+```yaml
+tts-1-hd:
+...
+  halo:
+    model: halo # This name is required to be unique
+    speaker: voices/halo/sample.wav # voice sample is required
+    model_path: voices/halo
+```
+3) The model will be loaded when you access the voice for the first time (`--preload` doesn't work with custom models yet)
+## Generation Parameters
+The generation of XTTSv2 voices can be fine tuned with the following options (defaults included below):
+```yaml
+tts-1-hd:
+  alloy:
+    model: xtts
+    speaker: voices/alloy.wav
+    enable_text_splitting: True
+    length_penalty: 1.0
+    repetition_penalty: 10
+    speed: 1.0
+    temperature: 0.75
+    top_k: 50
+    top_p: 0.85
+```

add_voice.py ADDED Viewed

	@@ -0,0 +1,63 @@

+#!/usr/bin/env python
+import argparse
+import os
+import shutil
+import yaml
+print("!! WARNING EXPERIMENTAL !! - THIS TOOL WILL ERASE ALL COMMENTS FROM THE CONFIG FILES .. OR WORSE!!")
+parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('sample', action='store', help="Set the wav sample file")
+parser.add_argument('-n', '--name', action='store', help="Set the name for the voice (by default will use the WAV file name)")
+parser.add_argument('-l', '--language', action='store', default="auto", help="Set the language for the voice",
+                    choices=['auto', 'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko', 'hi'])
+parser.add_argument('--openai-model', action='store', default="tts-1-hd", help="Set the openai model for the voice")
+parser.add_argument('--xtts-model', action='store', default="xtts", help="Set the xtts model for the voice (if using a custom model, also set model_path)")
+parser.add_argument('--model-path', action='store', default=None, help="Set the path for a custom xtts model")
+parser.add_argument('--config-path', action='store', default="config/voice_to_speaker.yaml", help="Set the config file path")
+parser.add_argument('--voice-path', action='store', default="voices", help="Set the default voices file path")
+parser.add_argument('--default-path', action='store', default="voice_to_speaker.default.yaml", help="Set the default config file path")
+args = parser.parse_args()
+basename = os.path.basename(args.sample)
+name_noext, ext = os.path.splitext(basename)
+if not args.name:
+    args.name = name_noext
+else:
+    basename = f"{args.name}.wav"
+dest_file = os.path.join(args.voice_path, basename)
+if args.sample != dest_file:
+    shutil.copy2(args.sample, dest_file)
+if not os.path.exists(args.config_path):
+    shutil.copy2(args.default_path, args.config_path)
+with open(args.config_path, 'r', encoding='utf8') as file:
+    voice_map = yaml.safe_load(file)
+model_conf = voice_map.get(args.openai_model, {})
+model_conf[args.name] = {
+    'model': args.xtts_model,
+    'speaker': os.path.join(args.voice_path, basename),
+    'language': args.language,
+}
+if args.model_path:
+    model_conf[args.name]['model_path'] = args.model_path
+voice_map[args.openai_model] = model_conf
+with open(args.config_path, 'w', encoding='utf8') as ofile:
+    yaml.safe_dump(voice_map, ofile, default_flow_style=False, allow_unicode=True)
+print(f"Updated: {args.config_path}")
+print(f"Added voice: {args.openai_model}/{args.name}")
+print(f"Added section:")
+print(f"{args.openai_model}:")
+print(f"  {args.name}:")
+print(f"    model: {model_conf[args.name]['model']}")
+print(f"    speaker: {model_conf[args.name]['speaker']}")
+print(f"    language: {model_conf[args.name]['language']}")

audio_reader.py ADDED Viewed

	@@ -0,0 +1,127 @@

+#!/usr/bin/env python3
+try:
+    import dotenv
+    dotenv.load_dotenv()
+except ImportError:
+    pass
+import argparse
+import os
+import pysbd
+import queue
+import sys
+import tempfile
+import threading
+import shutil
+import sys
+import tempfile
+import contextlib
+import openai
+try:
+    from playsound import playsound
+except ImportError:
+    print("Error: missing required package 'playsound'. !pip install playsound")
+    sys.exit(1)
+@contextlib.contextmanager
+def tempdir():
+    path = tempfile.mkdtemp()
+    try:
+        yield path
+    finally:
+        try:
+            shutil.rmtree(path)
+        except IOError:
+            sys.stderr.write('Failed to clean up temp dir {}'.format(path))
+class SimpleAudioPlayer:
+    """Plays queued audio files one after another on a background daemon thread.
+
+    Each file handed to put() is played with playsound() and deleted afterwards.
+    """
+    def __init__(self):
+        # Files waiting to be played, in FIFO order.
+        self._queue = queue.Queue()
+        # Cleared by stop(); the worker checks it only when the queue is empty.
+        self.running = True
+        self._thread = threading.Thread(target=self.__play_audio_loop, daemon=True)
+        self._thread.start()
+    def put(self, file):
+        """Queue a file path for playback."""
+        self._queue.put(file)
+    def stop(self):
+        """Ask the worker to exit, wait for it, then delete any files still queued."""
+        self.running = False
+        # The worker leaves its inner loop only on an empty queue, so join() waits for queued audio to finish.
+        self._thread.join()
+        try:
+            # Remove anything left in the queue; the loop ends when get_nowait() raises queue.Empty.
+            while True:
+                file = self._queue.get_nowait()
+                if os.path.exists(file):
+                    os.unlink(file)
+        except queue.Empty as e:
+            pass
+    def __play_audio_loop(self):
+        """Worker loop: play and delete queued files until stopped."""
+        while self.running:
+            try:
+                # Keep pulling files until the queue has been empty for 10 ms, then re-check self.running.
+                while True:
+                    file = self._queue.get(block=True, timeout=0.01)
+                    try:
+                        playsound(file)
+                    finally:
+                        # Delete the file whether or not playback succeeded.
+                        os.unlink(file)
+            except queue.Empty as e:
+                continue
+class OpenAI_tts:
+    def __init__(self, model, voice, speed, base_dir):
+        self.base_dir = base_dir
+        self.openai_client = openai.OpenAI(
+            # export OPENAI_API_KEY=sk-11111111111
+            # export OPENAI_BASE_URL=http://localhost:8000/v1
+            api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
+            base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
+        )
+        self.params = {
+            'model': model,
+            'voice': voice,
+            'speed': speed
+        }
+    def speech_to_file(self, text: str) -> None:
+        with self.openai_client.audio.speech.with_streaming_response.create(
+                input=text, response_format='opus', **self.params
+            ) as response:
+            tf, output_filename = tempfile.mkstemp(suffix='.wav', prefix="audio_reader_", dir=self.base_dir)
+            response.stream_to_file(output_filename)
+            return output_filename
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Text to speech player',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('-m', '--model', action='store', default="tts-1", help="The OpenAI model")
+    parser.add_argument('-v', '--voice', action='store', default="alloy", help="The voice to use")
+    parser.add_argument('-s', '--speed', action='store', default=1.0, help="How fast to read the audio")
+    args = parser.parse_args()
+    try:
+        with tempdir() as base_dir:
+            player = SimpleAudioPlayer()
+            reader = OpenAI_tts(voice=args.voice, model=args.model, speed=args.speed, base_dir=base_dir)
+            seg = pysbd.Segmenter(language='en', clean=True) # text is dirty, clean it up.
+            for raw_line in sys.stdin:
+                for line in seg.segment(raw_line):
+                    if not line:
+                        continue
+                    print(line)
+                    player.put(reader.speech_to_file(line))
+            player.stop()
+    except KeyboardInterrupt:
+        pass

config/config_files_will_go_here.txt ADDED Viewed

File without changes

docker-compose.min.yml ADDED Viewed

	@@ -0,0 +1,13 @@

+services:
+  server:
+    build:
+      dockerfile: Dockerfile.min # piper for all models, no gpu/nvidia required, ~1GB
+    image: ghcr.io/matatonic/openedai-speech-min
+    env_file: speech.env
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./voices:/app/voices
+      - ./config:/app/config
+    # To install as a service
+    restart: unless-stopped

docker-compose.rocm.yml ADDED Viewed

	@@ -0,0 +1,27 @@

+services:
+  server:
+    build:
+      dockerfile: Dockerfile
+      args:
+        - USE_ROCM=1
+    image: ghcr.io/matatonic/openedai-speech-rocm
+    env_file: speech.env
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./voices:/app/voices
+      - ./config:/app/config
+    # To install as a service
+    restart: unless-stopped
+    # For AMD GPU (ROCm) Support
+    cap_add:
+      - SYS_PTRACE
+    devices:
+      - /dev/kfd
+      - /dev/dri
+    security_opt:
+      - seccomp=unconfined
+    group_add:
+      - video
+      - audio
+    ipc: host

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,21 @@

+services:
+  server:
+    build:
+      dockerfile: Dockerfile
+    image: ghcr.io/matatonic/openedai-speech
+    env_file: speech.env
+    ports:
+      - "8000:8000"
+    volumes:
+      - ./voices:/app/voices
+      - ./config:/app/config
+    # To install as a service
+    restart: unless-stopped
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              #device_ids: ['0', '1'] # Select a gpu, or
+              count: all
+              capabilities: [gpu]

download_samples.bat ADDED Viewed

	@@ -0,0 +1,6 @@

+@echo off
+rem For each sample voice not already present under voices\, download it with curl
+rem and convert it with ffmpeg to a 22050 Hz, single-channel WAV file.
+for %%i in (alloy echo fable onyx nova shimmer) do (
+    if not exist "voices\%%i.wav" (
+        curl -s https://cdn.openai.com/API/docs/audio/%%i.wav | ffmpeg -loglevel error -i - -ar 22050 -ac 1 voices\%%i.wav
+    )
+)

download_samples.sh ADDED Viewed

	@@ -0,0 +1,4 @@

+#!/bin/sh
+# Download each sample voice that is missing from voices/ and convert it to a 22050 Hz mono WAV.
+for i in alloy echo fable onyx nova shimmer; do
+	# if/fi instead of `[ ... ] && cmd`: an already-present file must not leave a non-zero exit status.
+	if [ ! -e "voices/$i.wav" ]; then
+		curl -s "https://cdn.openai.com/API/docs/audio/$i.wav" | ffmpeg -loglevel error -i - -ar 22050 -ac 1 "voices/$i.wav"
+	fi
+done

download_voices_tts-1-hd.bat ADDED Viewed

	@@ -0,0 +1,8 @@

+@echo off
+rem Download every model named on the command line through the TTS ModelManager
+rem (with TTS_HOME set to voices), then fetch the sample voices.
+set COQUI_TOS_AGREED=1
+set TTS_HOME=voices
+for %%i in (%*) do (
+    python -c "from TTS.utils.manage import ModelManager; ModelManager().download_model('%%i')"
+)
+call download_samples.bat

download_voices_tts-1-hd.sh ADDED Viewed

	@@ -0,0 +1,8 @@

+#!/bin/sh
+# Download every model named on the command line through the TTS ModelManager, then fetch the sample voices.
+export COQUI_TOS_AGREED=1
+export TTS_HOME=voices
+# "$@" keeps each argument intact; the name is passed via argv rather than pasted into the Python source.
+for model in "$@"; do
+	python -c "import sys; from TTS.utils.manage import ModelManager; ModelManager().download_model(sys.argv[1])" "$model"
+done
+./download_samples.sh

download_voices_tts-1.bat ADDED Viewed

	@@ -0,0 +1,8 @@

+@echo off
+rem Download piper voice models into voices\. Models come from the command line;
+rem when none are given, two default English voices are used.
+set models=%*
+if "%models%" == "" set models=en_GB-northern_english_male-medium en_US-libritts_r-medium
+piper --update-voices --data-dir voices --download-dir voices --model x 2> nul
+for %%i in (%models%) do (
+    if not exist "voices\%%i.onnx" piper --data-dir voices --download-dir voices --model %%i > nul
+)

download_voices_tts-1.sh ADDED Viewed

	@@ -0,0 +1,6 @@

+#!/bin/sh
+# Download piper voice models into voices/. Models come from the command line, with two English defaults.
+models=${*:-"en_GB-northern_english_male-medium en_US-libritts_r-medium"} # en_US-ryan-high
+piper --update-voices --data-dir voices --download-dir voices --model x 2> /dev/null
+# $models is intentionally unquoted so it splits into one word per model name.
+for i in $models ; do
+    # if/fi instead of `[ ... ] && cmd`: an already-downloaded model must not leave a non-zero exit status.
+    if [ ! -e "voices/$i.onnx" ]; then
+        piper --data-dir voices --download-dir voices --model "$i" < /dev/null > /dev/null
+    fi
+done

openedai.py ADDED Viewed

	@@ -0,0 +1,181 @@

+from fastapi import FastAPI, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import PlainTextResponse, JSONResponse
+from loguru import logger
+class OpenAIError(Exception):
+    """Root of the exception hierarchy defined in this module."""
+    pass
+class APIError(OpenAIError):
+    message: str
+    code: str = None
+    param: str = None
+    type: str = None
+    def __init__(self, message: str, code: int = 500, param: str = None, internal_message: str = ''):
+        super().__init__(message)
+        self.message = message
+        self.code = code
+        self.param = param
+        self.type = self.__class__.__name__,
+        self.internal_message = internal_message
+    def __repr__(self):
+        return "%s(message=%r, code=%d, param=%s)" % (
+            self.__class__.__name__,
+            self.message,
+            self.code,
+            self.param,
+        )
+class InternalServerError(APIError):
+    """APIError subclass that keeps the inherited default status code of 500."""
+    pass
+class ServiceUnavailableError(APIError):
+    def __init__(self, message="Service unavailable, please try again later.", code=503, internal_message=''):
+        super().__init__(message, code, internal_message)
+class APIStatusError(APIError):
+    """APIError whose HTTP status comes from the class attribute `status_code`.
+
+    Subclasses only override `status_code`; the constructor forwards it as the error code.
+    """
+    status_code: int = 400
+    def __init__(self, message: str, param: str = None, internal_message: str = ''):
+        super().__init__(message, self.status_code, param, internal_message)
+# One subclass per HTTP status used for client-side errors.
+class BadRequestError(APIStatusError):
+    status_code: int = 400
+class AuthenticationError(APIStatusError):
+    status_code: int = 401
+class PermissionDeniedError(APIStatusError):
+    status_code: int = 403
+class NotFoundError(APIStatusError):
+    status_code: int = 404
+class ConflictError(APIStatusError):
+    status_code: int = 409
+class UnprocessableEntityError(APIStatusError):
+    status_code: int = 422
+class RateLimitError(APIStatusError):
+    status_code: int = 429
+class OpenAIStub(FastAPI):
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.models = {}
+        self.add_middleware(
+            CORSMiddleware,
+            allow_origins=["*"],
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"]
+        )
+        @self.exception_handler(Exception)
+        def openai_exception_handler(request: Request, exc: Exception) -> JSONResponse:
+            # Generic server errors
+            #logger.opt(exception=exc).error("Logging exception traceback")
+            return JSONResponse(status_code=500, content={
+                'message': 'InternalServerError',
+                'code': 500,
+            })
+        @self.exception_handler(APIError)
+        def openai_apierror_handler(request: Request, exc: APIError) -> JSONResponse:
+            # Server error
+            logger.opt(exception=exc).error("Logging exception traceback")
+            if exc.internal_message:
+                logger.info(exc.internal_message)
+            return JSONResponse(status_code = exc.code, content={
+                'message': exc.message,
+                'code': exc.code,
+                'type': exc.__class__.__name__,
+                'param': exc.param,
+            })
+        @self.exception_handler(APIStatusError)
+        def openai_statuserror_handler(request: Request, exc: APIStatusError) -> JSONResponse:
+            # Client side error
+            logger.info(repr(exc))
+            if exc.internal_message:
+                logger.info(exc.internal_message)
+            return JSONResponse(status_code = exc.code, content={
+                'message': exc.message,
+                'code': exc.code,
+                'type': exc.__class__.__name__,
+                'param': exc.param,
+            })
+        @self.middleware("http")
+        async def log_requests(request: Request, call_next):
+            logger.debug(f"Request path: {request.url.path}")
+            logger.debug(f"Request method: {request.method}")
+            logger.debug(f"Request headers: {request.headers}")
+            logger.debug(f"Request query params: {request.query_params}")
+            logger.debug(f"Request body: {await request.body()}")
+            response = await call_next(request)
+            logger.debug(f"Response status code: {response.status_code}")
+            logger.debug(f"Response headers: {response.headers}")
+            return response
+        @self.get('/v1/billing/usage')
+        @self.get('/v1/dashboard/billing/usage')
+        async def handle_billing_usage():
+            return { 'total_usage': 0 }
+        @self.get("/", response_class=PlainTextResponse)
+        @self.head("/", response_class=PlainTextResponse)
+        @self.options("/", response_class=PlainTextResponse)
+        async def root():
+            return PlainTextResponse(content="", status_code=200 if self.models else 503)
+        @self.get("/health")
+        async def health():
+            return {"status": "ok" if self.models else "unk" }
+        @self.get("/v1/models")
+        async def get_model_list():
+            return self.model_list()
+        @self.get("/v1/models/{model}")
+        async def get_model_info(model_id: str):
+            return self.model_info(model_id)
+    def register_model(self, name: str, model: str = None) -> None:
+        self.models[name] = model if model else name
+    def deregister_model(self, name: str) -> None:
+        if name in self.models:
+            del self.models[name]
+    def model_info(self, model: str) -> dict:
+        result = {
+            "id": model,
+            "object": "model",
+            "created": 0,
+            "owned_by": "user"
+        }
+        return result
+    def model_list(self) -> dict:
+        if not self.models:
+            return {}
+        result = {
+            "object": "list",
+            "data": [ self.model_info(model) for model in list(set(self.models.keys() | self.models.values())) if model ]
+        }
+        return result

pre_process_map.default.yaml ADDED Viewed

	@@ -0,0 +1,37 @@

+# regex pairs to clean the text before speaking
+- - ([^.])\.$
+  - \1
+- - '&amp;'
+  - '&'
+- - '&lt;'
+  - <
+- - '&gt;'
+  - '>'
+- - '&quot;'
+  - '"'
+- - '&#x27;'
+  - ''''
+- - '&copy;'
+  - '©'
+- - '&reg;'
+  - '®'
+- - '&nbsp;'
+  - ' '
+- - '"'
+  - ''
+- - ' biases '
+  - ' bias''s '
+- - ex\.
+  - for example
+- - e\.g\.
+  - for example
+- - ' ESG '
+  - ' E.S.G. '
+- - ' FY '
+  - ' F.Y. '
+- - ([0-9]+)-([0-9]+)
+  - \1 to \2
+- - '\*\*\*'
+  - '*'
+- - '\*\*'
+  - '*'

requirements-min.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+fastapi
+uvicorn
+loguru
+numpy<2
+piper-tts

requirements-rocm.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi
+uvicorn
+loguru
+piper-tts
+coqui-tts
+langdetect
+# Creating an environment where deepspeed works is complex, for now it will be disabled by default.
+#deepspeed
+torch; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"
+torchaudio; --index-url https://download.pytorch.org/whl/rocm5.7; sys_platform == "linux"

requirements.txt ADDED Viewed

	@@ -0,0 +1,16 @@

+fastapi
+uvicorn
+loguru
+piper-tts
+coqui-tts[languages]
+langdetect
+# Creating an environment where deepspeed works is complex, for now it will be disabled by default.
+#deepspeed
+torch; sys_platform != "darwin"
+torchaudio; sys_platform != "darwin"
+# for MPS accelerated torch on Mac - doesn't work yet, incomplete support in torch and torchaudio
+torch; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
+torchaudio; --index-url https://download.pytorch.org/whl/cpu; sys_platform == "darwin"
+# ROCM (Linux only) - use requirements-rocm.txt

sample.env ADDED Viewed

	@@ -0,0 +1,6 @@

+TTS_HOME=voices
+HF_HOME=voices
+#PRELOAD_MODEL=xtts
+#PRELOAD_MODEL=xtts_v2.0.2
+#EXTRA_ARGS=--log-level DEBUG --unload-timer 300
+#USE_ROCM=1

say.py ADDED Viewed

	@@ -0,0 +1,96 @@

+#!/usr/bin/env python
+import sys
+import os
+import atexit
+import tempfile
+import argparse
+try:
+    import dotenv
+    dotenv.load_dotenv(override=True)
+except ImportError:
+    pass
+try:
+    from playsound import playsound
+except ImportError:
+    playsound = None
+import openai
+def parse_args(argv):
+    parser = argparse.ArgumentParser(
+        description='Text to speech using the OpenAI API',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument("-m", "--model", type=str, default="tts-1", help="The model to use")#, choices=["tts-1", "tts-1-hd"])
+    parser.add_argument("-v", "--voice", type=str, default="alloy", help="The voice of the speaker")#, choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"])
+    parser.add_argument("-f", "--format", type=str, default="mp3", choices=["mp3", "aac", "opus", "flac"], help="The output audio format")
+    parser.add_argument("-s", "--speed", type=float, default=1.0, help="playback speed, 0.25-4.0")
+    parser.add_argument("-t", "--text", type=str, default=None, help="Provide text to read on the command line")
+    parser.add_argument("-i", "--input", type=str, default=None, help="Read text from a file (default is to read from stdin)")
+    if playsound is None:
+        parser.add_argument("-o", "--output", type=str, help="The filename to save the output to") # required
+        parser.add_argument("-p", "--playsound", type=None, default=None, help="python playsound not found. pip install playsound")
+    else:
+        parser.add_argument("-o", "--output", type=str, default=None, help="The filename to save the output to") # not required
+        parser.add_argument("-p", "--playsound", action="store_true", help="Play the audio")
+    args = parser.parse_args(argv)
+    return args
+if __name__ == "__main__":
+    args = parse_args(sys.argv[1:])
+    if args.playsound and playsound is None:
+        print("playsound module not found, audio will not be played, use -o <filename> to save output to a file. pip install playsound")
+        sys.exit(1)
+    if not args.playsound and not args.output:
+        print("Must select one of playsound (-p) or output file name (-o)")
+        sys.exit(1)
+    if args.input is None and args.text is None:
+        text = sys.stdin.read()
+    elif args.text:
+        text = args.text
+    elif args.input:
+        if os.path.exists(args.input):
+            with open(args.input, 'r') as f:
+                text = f.read()
+        else:
+            print(f"Warning! File not found: {args.input}\nFalling back to old behavior for -i")
+            text = args.input
+    client = openai.OpenAI(
+        # This part is not needed if you set these environment variables before import openai
+        # export OPENAI_API_KEY=sk-11111111111
+        # export OPENAI_BASE_URL=http://localhost:8000/v1
+        api_key = os.environ.get("OPENAI_API_KEY", "sk-ip"),
+        base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:8000/v1"),
+    )
+    if args.playsound and args.output is None:
+        _, args.output = tempfile.mkstemp(suffix='.wav')
+        def cleanup():
+            os.unlink(args.output)
+        atexit.register(cleanup)
+    with client.audio.speech.with_streaming_response.create(
+        model=args.model,
+        voice=args.voice,
+        speed=args.speed,
+        response_format=args.format,
+        input=text,
+    ) as response:
+        response.stream_to_file(args.output)
+        if args.playsound:
+            playsound(args.output)

speech.py ADDED Viewed

	@@ -0,0 +1,415 @@

+#!/usr/bin/env python3
+import argparse
+import contextlib
+import gc
+import os
+import queue
+import re
+import subprocess
+import sys
+import threading
+import time
+import yaml
+from fastapi.responses import StreamingResponse
+from loguru import logger
+from openedai import OpenAIStub, BadRequestError, ServiceUnavailableError
+from pydantic import BaseModel
+import uvicorn
+@contextlib.asynccontextmanager
+async def lifespan(app):
+    """Application lifespan hook: nothing on startup, memory cleanup after the yield."""
+    yield
+    # Everything below runs once the context manager resumes after the yield.
+    gc.collect()
+    try:
+        # torch is imported lazily; any failure in this block is deliberately ignored.
+        import torch
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.ipc_collect()
+    except:
+        pass
+app = OpenAIStub(lifespan=lifespan)
+xtts = None
+args = None
+def unload_model():
+    import torch, gc
+    global xtts
+    if xtts:
+        logger.info("Unloading model")
+        xtts.xtts.to('cpu') # this was required to free up GPU memory...
+        del xtts
+        xtts = None
+        gc.collect()
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+class xtts_wrapper():
+    check_interval: int = 1 # too aggressive?
+    def __init__(self, model_name, device, model_path=None, unload_timer=None):
+        self.model_name = model_name
+        self.unload_timer = unload_timer
+        self.last_used = time.time()
+        self.timer = None
+        self.lock = threading.Lock()
+        logger.info(f"Loading model {self.model_name} to {device}")
+        if model_path is None:
+            model_path = ModelManager().download_model(model_name)[0]
+        config_path = os.path.join(model_path, 'config.json')
+        config = XttsConfig()
+        config.load_json(config_path)
+        self.xtts = Xtts.init_from_config(config)
+        self.xtts.load_checkpoint(config, checkpoint_dir=model_path, use_deepspeed=args.use_deepspeed)  # XXX there are no prebuilt deepspeed wheels??
+        self.xtts = self.xtts.to(device=device)
+        self.xtts.eval()
+        if self.unload_timer:
+            logger.info(f"Setting unload timer to {self.unload_timer} seconds")
+            self.last_used = time.time()
+            self.check_idle()
+    def check_idle(self):
+        with self.lock:
+            if time.time() - self.last_used >= self.unload_timer:
+                print("Unloading TTS model due to inactivity")
+                unload_model()
+            else:
+                # Reschedule the check
+                self.timer = threading.Timer(self.check_interval, self.check_idle)
+                self.timer.daemon = True
+                self.timer.start()
+    def tts(self, text, language, speaker_wav, **hf_generate_kwargs):
+        with torch.no_grad():
+            self.last_used = time.time()
+            tokens = 0
+            try:
+                with self.lock:
+                    gpt_cond_latent, speaker_embedding = self.xtts.get_conditioning_latents(audio_path=[speaker_wav]) # not worth caching calls, it's < 0.001s after model is loaded
+                    pcm_stream = self.xtts.inference_stream(text, language, gpt_cond_latent, speaker_embedding, **hf_generate_kwargs)
+                    self.last_used = time.time()
+                while True:
+                    with self.lock:
+                        yield next(pcm_stream).cpu().numpy().tobytes()
+                        self.last_used = time.time()
+                    tokens += 1
+            except StopIteration:
+                pass
+            finally:
+                logger.debug(f"Generated {tokens} tokens in {time.time() - self.last_used:.2f}s @ {tokens / (time.time() - self.last_used):.2f} T/s")
+                self.last_used = time.time()
+def default_exists(filename: str):
+    if not os.path.exists(filename):
+        fpath, ext = os.path.splitext(filename)
+        basename = os.path.basename(fpath)
+        default = f"{basename}.default{ext}"
+        logger.info(f"{filename} does not exist, setting defaults from {default}")
+        with open(default, 'r', encoding='utf8') as from_file:
+            with open(filename, 'w', encoding='utf8') as to_file:
+                to_file.write(from_file.read())
+# Read pre process map on demand so it can be changed without restarting the server
+def preprocess(raw_input):
+    #logger.debug(f"preprocess: before: {[raw_input]}")
+    default_exists('config/pre_process_map.yaml')
+    with open('config/pre_process_map.yaml', 'r', encoding='utf8') as file:
+        pre_process_map = yaml.safe_load(file)
+        for a, b in pre_process_map:
+            raw_input = re.sub(a, b, raw_input)
+    raw_input = raw_input.strip()
+    #logger.debug(f"preprocess: after: {[raw_input]}")
+    return raw_input
+# Read voice map on demand so it can be changed without restarting the server
+def map_voice_to_speaker(voice: str, model: str):
+    default_exists('config/voice_to_speaker.yaml')
+    with open('config/voice_to_speaker.yaml', 'r', encoding='utf8') as file:
+        voice_map = yaml.safe_load(file)
+        try:
+            return voice_map[model][voice]
+        except KeyError as e:
+            raise BadRequestError(f"Error loading voice: {voice}, KeyError: {e}", param='voice')
+class GenerateSpeechRequest(BaseModel):
+    """Request body accepted by the /v1/audio/speech endpoint; only `input` has no default."""
+    model: str = "tts-1" # or "tts-1-hd"
+    input: str
+    voice: str = "alloy"  # alloy, echo, fable, onyx, nova, and shimmer
+    response_format: str = "mp3" # mp3, opus, aac, flac
+    speed: float = 1.0 # 0.25 - 4.0
+def build_ffmpeg_args(response_format, input_format, sample_rate):
+    # Convert the output to the desired format using ffmpeg
+    if input_format == 'WAV':
+        ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", "WAV", "-i", "-"]
+    else:
+        ffmpeg_args = ["ffmpeg", "-loglevel", "error", "-f", input_format, "-ar", sample_rate, "-ac", "1", "-i", "-"]
+    if response_format == "mp3":
+        ffmpeg_args.extend(["-f", "mp3", "-c:a", "libmp3lame", "-ab", "64k"])
+    elif response_format == "opus":
+        ffmpeg_args.extend(["-f", "ogg", "-c:a", "libopus"])
+    elif response_format == "aac":
+        ffmpeg_args.extend(["-f", "adts", "-c:a", "aac", "-ab", "64k"])
+    elif response_format == "flac":
+        ffmpeg_args.extend(["-f", "flac", "-c:a", "flac"])
+    elif response_format == "wav":
+        ffmpeg_args.extend(["-f", "wav", "-c:a", "pcm_s16le"])
+    elif response_format == "pcm": # even though pcm is technically 'raw', we still use ffmpeg to adjust the speed
+        ffmpeg_args.extend(["-f", "s16le", "-c:a", "pcm_s16le"])
+    return ffmpeg_args
+@app.post("/v1/audio/speech", response_class=StreamingResponse)
+async def generate_speech(request: GenerateSpeechRequest):
+    global xtts, args
+    if len(request.input) < 1:
+        raise BadRequestError("Empty Input", param='input')
+    input_text = preprocess(request.input)
+    if len(input_text) < 1:
+        raise BadRequestError("Input text empty after preprocess.", param='input')
+    model = request.model
+    voice = request.voice
+    response_format = request.response_format.lower()
+    speed = request.speed
+    # Set the Content-Type header based on the requested format
+    if response_format == "mp3":
+        media_type = "audio/mpeg"
+    elif response_format == "opus":
+        media_type = "audio/ogg;codec=opus" # codecs?
+    elif response_format == "aac":
+        media_type = "audio/aac"
+    elif response_format == "flac":
+        media_type = "audio/x-flac"
+    elif response_format == "wav":
+        media_type = "audio/wav"
+    elif response_format == "pcm":
+        if model == 'tts-1': # piper
+            media_type = "audio/pcm;rate=22050"
+        elif model == 'tts-1-hd': # xtts
+            media_type = "audio/pcm;rate=24000"
+    else:
+        raise BadRequestError(f"Invalid response_format: '{response_format}'", param='response_format')
+    ffmpeg_args = None
+    # Use piper for tts-1, and if xtts_device == none use for all models.
+    if model == 'tts-1' or args.xtts_device == 'none':
+        voice_map = map_voice_to_speaker(voice, 'tts-1')
+        try:
+            piper_model = voice_map['model']
+        except KeyError as e:
+            raise ServiceUnavailableError(f"Configuration error: tts-1 voice '{voice}' is missing 'model:' setting. KeyError: {e}")
+        speaker = voice_map.get('speaker', None)
+        tts_args = ["piper", "--model", str(piper_model), "--data-dir", "voices", "--download-dir", "voices", "--output-raw"]
+        if speaker:
+            tts_args.extend(["--speaker", str(speaker)])
+        if speed != 1.0:
+            tts_args.extend(["--length-scale", f"{1.0/speed}"])
+        tts_proc = subprocess.Popen(tts_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        tts_proc.stdin.write(bytearray(input_text.encode('utf-8')))
+        tts_proc.stdin.close()
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="s16le", sample_rate="22050")
+        # Pipe the output from piper/xtts to the input of ffmpeg
+        ffmpeg_args.extend(["-"])
+        ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=tts_proc.stdout, stdout=subprocess.PIPE)
+        return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type)
+    # Use xtts for tts-1-hd
+    elif model == 'tts-1-hd':
+        voice_map = map_voice_to_speaker(voice, 'tts-1-hd')
+        try:
+            tts_model = voice_map.pop('model')
+            speaker = voice_map.pop('speaker')
+        except KeyError as e:
+            raise ServiceUnavailableError(f"Configuration error: tts-1-hd voice '{voice}' is missing setting. KeyError: {e}")
+        if xtts and xtts.model_name != tts_model:
+            unload_model()
+        tts_model_path = voice_map.pop('model_path', None) # XXX changing this on the fly is ignored if you keep the same name
+        if xtts is None:
+            xtts = xtts_wrapper(tts_model, device=args.xtts_device, model_path=tts_model_path, unload_timer=args.unload_timer)
+        ffmpeg_args = build_ffmpeg_args(response_format, input_format="f32le", sample_rate="24000")
+        # tts speed doesn't seem to work well
+        speed = voice_map.pop('speed', speed)
+        if speed < 0.5:
+            speed = speed / 0.5
+            ffmpeg_args.extend(["-af", "atempo=0.5"])
+        if speed > 1.0:
+            ffmpeg_args.extend(["-af", f"atempo={speed}"])
+            speed = 1.0
+        # Pipe the output from piper/xtts to the input of ffmpeg
+        ffmpeg_args.extend(["-"])
+        language = voice_map.pop('language', 'auto')
+        if language == 'auto':
+            try:
+                language = detect(input_text)
+                if language not in [
+                    'en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr',
+                    'ru', 'nl', 'cs', 'ar', 'zh-cn', 'hu', 'ko', 'ja', 'hi'
+                ]:
+                    logger.debug(f"Detected language {language} not supported, defaulting to en")
+                    language = 'en'
+                else:
+                    logger.debug(f"Detected language: {language}")
+            except:
+                language = 'en'
+                logger.debug(f"Failed to detect language, defaulting to en")
+        comment = voice_map.pop('comment', None) # ignored.
+        hf_generate_kwargs = dict(
+            speed=speed,
+            **voice_map,
+        )
+        hf_generate_kwargs['enable_text_splitting'] = hf_generate_kwargs.get('enable_text_splitting', True) # change the default to true
+        if hf_generate_kwargs['enable_text_splitting']:
+            if language == 'zh-cn':
+                split_lang = 'zh'
+            else:
+                split_lang = language
+            all_text = split_sentence(input_text, split_lang, xtts.xtts.tokenizer.char_limits[split_lang])
+        else:
+            all_text = [input_text]
+        ffmpeg_proc = subprocess.Popen(ffmpeg_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        in_q = queue.Queue() # speech pcm
+        ex_q = queue.Queue() # exceptions
+        def exception_check(exq: queue.Queue):
+            try:
+                e = exq.get_nowait()
+            except queue.Empty:
+                return
+            raise e
+        def generator():
+            # text -> in_q
+            try:
+                for text in all_text:
+                    for chunk in xtts.tts(text=text, language=language, speaker_wav=speaker, **hf_generate_kwargs):
+                        exception_check(ex_q)
+                        in_q.put(chunk)
+            except BrokenPipeError as e: # client disconnect lands here
+                logger.info("Client disconnected - 'Broken pipe'")
+            except Exception as e:
+                logger.error(f"Exception: {repr(e)}")
+                raise e
+            finally:
+                in_q.put(None) # sentinel
+        def out_writer():
+            # in_q -> ffmpeg
+            try:
+                while True:
+                    chunk = in_q.get()
+                    if chunk is None: # sentinel
+                        break
+                    ffmpeg_proc.stdin.write(chunk) # BrokenPipeError from here on client disconnect
+            except Exception as e: # BrokenPipeError
+                ex_q.put(e)  # we need to get this exception into the generation loop
+                ffmpeg_proc.kill()
+                return
+            finally:
+                ffmpeg_proc.stdin.close()
+        generator_worker = threading.Thread(target=generator, daemon=True)
+        generator_worker.start()
+        out_writer_worker = threading.Thread(target=out_writer, daemon=True)
+        out_writer_worker.start()
+        def cleanup():
+            ffmpeg_proc.kill()
+            del generator_worker
+            del out_writer_worker
+        return StreamingResponse(content=ffmpeg_proc.stdout, media_type=media_type, background=cleanup)
+    else:
+        raise BadRequestError("No such model, must be tts-1 or tts-1-hd.", param='model')
+# We return 'mps' but currently XTTS will not work with mps devices as the cuda support is incomplete
+def auto_torch_device():
+    try:
+        import torch
+        return 'cuda' if torch.cuda.is_available() else 'mps' if ( torch.backends.mps.is_available() and torch.backends.mps.is_built() ) else 'cpu'
+    except:
+        return 'none'
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='OpenedAI Speech API Server',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--xtts_device', action='store', default=auto_torch_device(), help="Set the device for the xtts model. The special value of 'none' will use piper for all models.")
+    parser.add_argument('--preload', action='store', default=None, help="Preload a model (Ex. 'xtts' or 'xtts_v2.0.2'). By default it's loaded on first use.")
+    parser.add_argument('--unload-timer', action='store', default=None, type=int, help="Idle unload timer for the XTTS model in seconds, Ex. 900 for 15 minutes")
+    parser.add_argument('--use-deepspeed', action='store_true', default=False, help="Use deepspeed with xtts (this option is unsupported)")
+    parser.add_argument('--no-cache-speaker', action='store_true', default=False, help="Don't use the speaker wav embeddings cache")
+    parser.add_argument('-P', '--port', action='store', default=8000, type=int, help="Server tcp port")
+    parser.add_argument('-H', '--host', action='store', default='0.0.0.0', help="Host to listen on, Ex. 0.0.0.0")
+    parser.add_argument('-L', '--log-level', default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], help="Set the log level")
+    args = parser.parse_args()
+    default_exists('config/pre_process_map.yaml')
+    default_exists('config/voice_to_speaker.yaml')
+    logger.remove()
+    logger.add(sink=sys.stderr, level=args.log_level)
+    if args.xtts_device != "none":
+        import torch
+        from TTS.tts.configs.xtts_config import XttsConfig
+        from TTS.tts.models.xtts import Xtts
+        from TTS.utils.manage import ModelManager
+        from TTS.tts.layers.xtts.tokenizer import split_sentence
+        from langdetect import detect
+    if args.preload:
+        xtts = xtts_wrapper(args.preload, device=args.xtts_device, unload_timer=args.unload_timer)
+    app.register_model('tts-1')
+    app.register_model('tts-1-hd')
+    uvicorn.run(app, host=args.host, port=args.port)

startup.bat ADDED Viewed

	@@ -0,0 +1,8 @@

+@echo off
+rem Windows startup: load speech.env, download the voices, then run speech.py.
+rem Import settings from speech.env when it exists. Assumes plain KEY=VALUE
+rem lines; lines starting with # are skipped.
+if exist speech.env for /f "usebackq eol=# tokens=1,* delims==" %%A in ("speech.env") do set "%%A=%%B"
+call download_voices_tts-1.bat
+call download_voices_tts-1-hd.bat %PRELOAD_MODEL%
+rem Pass --preload only when PRELOAD_MODEL is set (batch has no ${VAR:+...}).
+set "PRELOAD_ARGS="
+if defined PRELOAD_MODEL set "PRELOAD_ARGS=--preload %PRELOAD_MODEL%"
+rem %* forwards any extra command-line arguments, like "$@" in startup.sh.
+python speech.py %PRELOAD_ARGS% %EXTRA_ARGS% %*

startup.min.sh ADDED Viewed

	@@ -0,0 +1,7 @@

+#!/bin/bash
+[ -f speech.env ] && . speech.env
+bash download_voices_tts-1.sh
+python speech.py --xtts_device none $EXTRA_ARGS $@

startup.sh ADDED Viewed

	@@ -0,0 +1,10 @@

+#!/bin/bash
+[ -f speech.env ] && . speech.env
+echo "First startup may download 2GB of speech models. Please wait."
+bash download_voices_tts-1.sh
+bash download_voices_tts-1-hd.sh $PRELOAD_MODEL
+python speech.py ${PRELOAD_MODEL:+--preload $PRELOAD_MODEL} $EXTRA_ARGS $@

test_voices.sh ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/bin/bash
+# Audition the configured voices: for each voice, play the OpenAI reference
+# sample, then the tts-1 and tts-1-hd renditions, and finish with the slowest
+# and fastest speed settings. Audio is played with mpv.
+# Usage: test_voices.sh [SPEECH_ENDPOINT_URL]
+URL=${1:-http://localhost:8000/v1/audio/speech}
+# speak MODEL VOICE SPEED TEXT
+# POST one speech request to $URL and pipe the returned audio into mpv.
+# TEXT is inserted into the JSON body as-is, so it must not contain double quotes.
+speak() {
+curl -s "$URL" -H "Content-Type: application/json" -d "{
+    \"model\": \"$1\",
+    \"input\": \"$4\",
+    \"voice\": \"$2\",
+    \"speed\": $3
+  }" | mpv --really-quiet -
+}
+speak tts-1 echo 1.0 "I'm going to play you the original voice, followed by the piper voice and finally the X T T S version 2 voice"
+for voice in alloy echo fable onyx nova shimmer ; do
+echo "$voice"
+# The "original" announcement always uses the echo voice.
+speak tts-1 echo 1.0 "original"
+curl -s "https://cdn.openai.com/API/docs/audio/$voice.wav" | mpv --really-quiet -
+speak tts-1 "$voice" 1.0 "The quick brown fox jumped over the lazy dog. This voice is called $voice, how do you like this voice?"
+speak tts-1-hd "$voice" 1.0 "The quick brown fox jumped over the lazy dog. This HD voice is called $voice, how do you like this voice?"
+done
+# Speed extremes used by this script: 0.25 (slowest) and 4.0 (fastest).
+speak tts-1 onyx 0.25 "the slowest voice"
+speak tts-1-hd onyx 0.25 "the slowest HD voice"
+speak tts-1 nova 4.0 "And this is how fast it can go, the fastest voice"
+speak tts-1-hd nova 4.0 "And this is how fast it can go, the fastest HD voice"

voice_to_speaker.default.yaml ADDED Viewed

	@@ -0,0 +1,59 @@

+# Maps API voice names to backend settings, grouped by API model name.
+#
+# tts-1 entries are used with piper: `model` is presumably the .onnx voice
+# file given to piper, and a non-empty `speaker` is passed as --speaker.
+# A null speaker means no --speaker option, i.e. the model's default speaker.
+tts-1:
+  some_other_voice_name_you_want:
+    model: voices/choose your own model.onnx
+    speaker: set your own speaker
+  alloy:
+    model: voices/en_US-libritts_r-medium.onnx
+    speaker: 79  # 64, 79, 80, 101, 130
+  echo:
+    model: voices/en_US-libritts_r-medium.onnx
+    speaker: 134  # 52, 102, 134
+  echo-alt:
+    model: voices/en_US-ryan-high.onnx
+    speaker: null  # default speaker
+  fable:
+    model: voices/en_GB-northern_english_male-medium.onnx
+    speaker: null  # default speaker
+  onyx:
+    model: voices/en_US-libritts_r-medium.onnx
+    speaker: 159  # 55, 90, 132, 136, 137, 159
+  nova:
+    model: voices/en_US-libritts_r-medium.onnx
+    speaker: 107  # 57, 61, 107, 150, 162
+  shimmer:
+    model: voices/en_US-libritts_r-medium.onnx
+    speaker: 163
+# tts-1-hd entries are used with XTTS. `model` and `speaker` are required;
+# `speaker` is the reference wav for the voice. Optional keys read by the
+# server: `model_path`, `speed`, `language` (default: auto, detected from the
+# input text) and `comment` (ignored). Any other keys are passed through to
+# the XTTS generation call; `enable_text_splitting` defaults to true.
+tts-1-hd:
+  alloy-alt:
+    model: xtts
+    speaker: voices/alloy-alt.wav
+  alloy:
+    model: xtts
+    speaker: voices/alloy.wav
+  echo:
+    model: xtts
+    speaker: voices/echo.wav
+  fable:
+    model: xtts
+    speaker: voices/fable.wav
+  onyx:
+    model: xtts
+    speaker: voices/onyx.wav
+  nova:
+    model: xtts
+    speaker: voices/nova.wav
+  shimmer:
+    model: xtts
+    speaker: voices/shimmer.wav
+  me:
+    model: xtts_v2.0.2  # you can specify an older xtts version
+    speaker: voices/me.wav  # this could be you
+    language: auto
+    enable_text_splitting: true
+    length_penalty: 1.0
+    repetition_penalty: 10
+    speed: 1.0
+    temperature: 0.75
+    top_k: 50
+    top_p: 0.85
+    comment: You can add a comment here also, which will be persistent and otherwise ignored.