joytou committed on
Commit
882ea5e
1 Parent(s): 5c84f37

init project

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .dockerignore +7 -0
  2. .github/ISSUE_TEMPLATE/bug_report.yml +50 -0
  3. .github/ISSUE_TEMPLATE/config.yml +5 -0
  4. .github/ISSUE_TEMPLATE/feature_request.yml +40 -0
  5. .github/pull_request_template.md +7 -0
  6. .github/workflows/build-docker-image.yml +70 -0
  7. .github/workflows/docs.yml +33 -0
  8. .github/workflows/stale.yml +25 -0
  9. .gitignore +31 -0
  10. .pre-commit-config.yaml +25 -0
  11. .project-root +0 -0
  12. .readthedocs.yaml +19 -0
  13. API_FLAGS.txt +6 -0
  14. Dockerfile +44 -0
  15. LICENSE +437 -0
  16. docker-compose.dev.yml +16 -0
  17. dockerfile.dev +33 -0
  18. docs/CNAME +1 -0
  19. docs/assets/figs/VS_1.jpg +0 -0
  20. docs/assets/figs/VS_1_pt-BR.png +0 -0
  21. docs/assets/figs/diagram.png +0 -0
  22. docs/assets/figs/diagrama.png +0 -0
  23. docs/en/finetune.md +125 -0
  24. docs/en/index.md +133 -0
  25. docs/en/inference.md +124 -0
  26. docs/en/samples.md +223 -0
  27. docs/ja/finetune.md +125 -0
  28. docs/ja/index.md +128 -0
  29. docs/ja/inference.md +157 -0
  30. docs/ja/samples.md +223 -0
  31. docs/pt/finetune.md +125 -0
  32. docs/pt/index.md +131 -0
  33. docs/pt/inference.md +153 -0
  34. docs/pt/samples.md +223 -0
  35. docs/requirements.txt +3 -0
  36. docs/stylesheets/extra.css +3 -0
  37. docs/zh/finetune.md +136 -0
  38. docs/zh/index.md +191 -0
  39. docs/zh/inference.md +134 -0
  40. docs/zh/samples.md +223 -0
  41. entrypoint.sh +10 -0
  42. fish_speech/callbacks/__init__.py +3 -0
  43. fish_speech/callbacks/grad_norm.py +113 -0
  44. fish_speech/configs/base.yaml +87 -0
  45. fish_speech/configs/firefly_gan_vq.yaml +33 -0
  46. fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  47. fish_speech/configs/text2semantic_finetune.yaml +83 -0
  48. fish_speech/conversation.py +2 -0
  49. fish_speech/datasets/concat_repeat.py +53 -0
  50. fish_speech/datasets/protos/text-data.proto +24 -0
.dockerignore ADDED
@@ -0,0 +1,7 @@
+ .git
+ .github
+ results
+ data
+ *.filelist
+ /data_server/target
+ checkpoints
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,50 @@
+ name: "🕷️ Bug report"
+ description: Report errors or unexpected behavior
+ labels:
+ - bug
+ body:
+ - type: checkboxes
+ attributes:
+ label: Self Checks
+ description: "To make sure we get to you in time, please check the following :)"
+ options:
+ - label: This is only for bug reports; if you would like to ask a question, please head to [Discussions](https://github.com/fishaudio/fish-speech/discussions).
+ required: true
+ - label: I have [searched for existing issues](https://github.com/fishaudio/fish-speech/issues), including closed ones.
+ required: true
+ - label: I confirm that I am using English to submit this report (我已阅读并同意 [Language Policy](https://github.com/fishaudio/fish-speech/issues/515)).
+ required: true
+ - label: "[FOR CHINESE USERS] 请务必使用英文提交 Issue,否则会被关闭。谢谢!:)"
+ required: true
+ - label: "Please do not modify this template :) and fill in all the required fields."
+ required: true
+ - type: dropdown
+ attributes:
+ label: Cloud or Self Hosted
+ multiple: true
+ options:
+ - Cloud
+ - Self Hosted (Docker)
+ - Self Hosted (Source)
+ validations:
+ required: true
+ - type: textarea
+ attributes:
+ label: Steps to reproduce
+ description: We highly suggest including screenshots and a bug report log. Please use the correct Markdown syntax for code blocks.
+ placeholder: Having detailed steps helps us reproduce the bug.
+ validations:
+ required: true
+ - type: textarea
+ attributes:
+ label: ✔️ Expected Behavior
+ placeholder: What were you expecting?
+ validations:
+ required: false
+
+ - type: textarea
+ attributes:
+ label: ❌ Actual Behavior
+ placeholder: What happened instead?
+ validations:
+ required: false
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,5 @@
+ blank_issues_enabled: false
+ contact_links:
+ - name: "\U0001F4E7 Discussions"
+ url: https://github.com/fishaudio/fish-speech/discussions
+ about: General discussions and requests for help from the community
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,40 @@
+ name: "⭐ Feature or enhancement request"
+ description: Propose something new.
+ labels:
+ - enhancement
+ body:
+ - type: checkboxes
+ attributes:
+ label: Self Checks
+ description: "To make sure we get to you in time, please check the following :)"
+ options:
+ - label: I have [searched for existing issues](https://github.com/fishaudio/fish-speech/issues), including closed ones.
+ required: true
+ - label: I confirm that I am using English to submit this report (我已阅读并同意 [Language Policy](https://github.com/fishaudio/fish-speech/issues/515)).
+ required: true
+ - label: "[FOR CHINESE USERS] 请务必使用英文提交 Issue,否则会被关闭。谢谢!:)"
+ required: true
+ - label: "Please do not modify this template :) and fill in all the required fields."
+ required: true
+ - type: textarea
+ attributes:
+ label: 1. Is this request related to a challenge you're experiencing? Tell us about your story.
+ placeholder: Please describe the specific scenario or problem you're facing as clearly as possible. For instance "I was trying to use [feature] for [specific task], and [what happened]... It was frustrating because..."
+ validations:
+ required: true
+ - type: textarea
+ attributes:
+ label: 2. Additional context or comments
+ placeholder: (Any other information, comments, documentation, links, or screenshots that would provide more clarity. This is the place to add anything else not covered above.)
+ validations:
+ required: false
+ - type: checkboxes
+ attributes:
+ label: 3. Can you help us with this feature?
+ description: Let us know! This is not a commitment, but a starting point for collaboration.
+ options:
+ - label: I am interested in contributing to this feature.
+ required: false
+ - type: markdown
+ attributes:
+ value: Please limit each issue to one request.
.github/pull_request_template.md ADDED
@@ -0,0 +1,7 @@
+ **Does this PR add a new feature or fix a bug?**
+
+ Add feature / Fix bug.
+
+ **Is this pull request related to any issue? If yes, please link the issue.**
+
+ #xxx
.github/workflows/build-docker-image.yml ADDED
@@ -0,0 +1,70 @@
1
+ name: Build Image
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ tags:
8
+ - 'v*'
9
+
10
+ jobs:
11
+ build:
12
+ runs-on: ubuntu-latest-16c64g
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - name: Set up Docker Buildx
16
+ uses: docker/setup-buildx-action@v3
17
+ - name: Get Version
18
+ run: |
19
+ if [[ $GITHUB_REF == refs/tags/v* ]]; then
20
+ version=$(basename ${GITHUB_REF})
21
+ else
22
+ version=nightly
23
+ fi
24
+
25
+ echo "version=${version}" >> $GITHUB_ENV
26
+ echo "Current version: ${version}"
27
+
28
+ - name: Login to Docker Hub
29
+ uses: docker/login-action@v3
30
+ with:
31
+ username: ${{ secrets.DOCKER_USER }}
32
+ password: ${{ secrets.DOCKER_PAT }}
33
+
34
+ - name: Build and Push Image
35
+ uses: docker/build-push-action@v6
36
+ with:
37
+ context: .
38
+ file: dockerfile
39
+ platforms: linux/amd64
40
+ push: true
41
+ tags: |
42
+ fishaudio/fish-speech:${{ env.version }}
43
+ fishaudio/fish-speech:latest
44
+ outputs: type=image,oci-mediatypes=true,compression=zstd,compression-level=3,force-compression=true
45
+ cache-from: type=registry,ref=fishaudio/fish-speech:latest
46
+ cache-to: type=inline
47
+
48
+ - name: Build and Push Dev Image
49
+ uses: docker/build-push-action@v6
50
+ with:
51
+ context: .
52
+ file: dockerfile.dev
53
+ platforms: linux/amd64
54
+ push: true
55
+ build-args: |
56
+ VERSION=${{ env.version }}
57
+ BASE_IMAGE=fishaudio/fish-speech:${{ env.version }}
58
+ tags: |
59
+ fishaudio/fish-speech:${{ env.version }}-dev
60
+ fishaudio/fish-speech:latest-dev
61
+ outputs: type=image,oci-mediatypes=true,compression=zstd,compression-level=3,force-compression=true
62
+ cache-from: type=registry,ref=fishaudio/fish-speech:latest-dev
63
+ cache-to: type=inline
64
+
65
+ - name: Push README to Dockerhub
66
+ uses: peter-evans/dockerhub-description@v4
67
+ with:
68
+ username: ${{ secrets.DOCKER_USER }}
69
+ password: ${{ secrets.DOCKER_PAT }}
70
+ repository: fishaudio/fish-speech
.github/workflows/docs.yml ADDED
@@ -0,0 +1,33 @@
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ paths:
7
+ - 'docs/**'
8
+ - 'mkdocs.yml'
9
+
10
+ permissions:
11
+ contents: write
12
+
13
+ jobs:
14
+ deploy:
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+ - name: Configure Git Credentials
19
+ run: |
20
+ git config user.name github-actions[bot]
21
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
22
+ - uses: actions/setup-python@v5
23
+ with:
24
+ python-version: 3.x
25
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
26
+ - uses: actions/cache@v4
27
+ with:
28
+ key: mkdocs-material-${{ env.cache_id }}
29
+ path: .cache
30
+ restore-keys: |
31
+ mkdocs-material-
32
+ - run: pip install -r docs/requirements.txt
33
+ - run: mkdocs gh-deploy --force
.github/workflows/stale.yml ADDED
@@ -0,0 +1,25 @@
1
+ name: Close inactive issues
2
+ on:
3
+ schedule:
4
+ - cron: "0 0 * * *"
5
+
6
+ jobs:
7
+ close-issues:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ issues: write
11
+ pull-requests: write
12
+ steps:
13
+ - uses: actions/stale@v9
14
+ with:
15
+ days-before-issue-stale: 30
16
+ days-before-issue-close: 14
17
+ stale-issue-label: "stale"
18
+ stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
19
+ close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
20
+ days-before-pr-stale: 30
21
+ days-before-pr-close: 30
22
+ stale-pr-label: "stale"
23
+ stale-pr-message: "This PR is stale because it has been open for 30 days with no activity."
24
+ close-pr-message: "This PR was closed because it has been inactive for 30 days since being marked as stale."
25
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,31 @@
1
+ .DS_Store
2
+ .pgx.*
3
+ .pdm-python
4
+ /fish_speech.egg-info
5
+ __pycache__
6
+ /results
7
+ /data
8
+ /*.test.sh
9
+ *.filelist
10
+ filelists
11
+ /fish_speech/text/cmudict_cache.pickle
12
+ /checkpoints
13
+ /.vscode
14
+ /data_server/target
15
+ /*.npy
16
+ /*.wav
17
+ /*.mp3
18
+ /*.lab
19
+ /results
20
+ /data
21
+ /.idea
22
+ ffmpeg.exe
23
+ ffprobe.exe
24
+ asr-label*
25
+ /.cache
26
+ /fishenv
27
+ /.locale
28
+ /demo-audios
29
+ /references
30
+ /example
31
+ /faster_whisper
.pre-commit-config.yaml ADDED
@@ -0,0 +1,25 @@
1
+ ci:
2
+ autoupdate_schedule: monthly
3
+
4
+ repos:
5
+ - repo: https://github.com/pycqa/isort
6
+ rev: 5.13.2
7
+ hooks:
8
+ - id: isort
9
+ args: [--profile=black]
10
+
11
+ - repo: https://github.com/psf/black
12
+ rev: 24.8.0
13
+ hooks:
14
+ - id: black
15
+
16
+ - repo: https://github.com/pre-commit/pre-commit-hooks
17
+ rev: v4.6.0
18
+ hooks:
19
+ - id: end-of-file-fixer
20
+ - id: check-yaml
21
+ - id: check-json
22
+ - id: mixed-line-ending
23
+ args: ['--fix=lf']
24
+ - id: check-added-large-files
25
+ args: ['--maxkb=5000']
.project-root ADDED
File without changes
.readthedocs.yaml ADDED
@@ -0,0 +1,19 @@
1
+ # Read the Docs configuration file for MkDocs projects
2
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
+
4
+ # Required
5
+ version: 2
6
+
7
+ # Set the version of Python and other tools you might need
8
+ build:
9
+ os: ubuntu-22.04
10
+ tools:
11
+ python: "3.12"
12
+
13
+ mkdocs:
14
+ configuration: mkdocs.yml
15
+
16
+ # Optionally declare the Python requirements required to build your docs
17
+ python:
18
+ install:
19
+ - requirements: docs/requirements.txt
API_FLAGS.txt ADDED
@@ -0,0 +1,6 @@
+ # --infer
+ # --api
+ --listen 0.0.0.0:8080 \
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
+ --decoder-config-name firefly_gan_vq
Dockerfile ADDED
@@ -0,0 +1,44 @@
1
+ FROM python:3.12-slim-bookworm AS stage-1
2
+ ARG TARGETARCH
3
+
4
+ ARG HUGGINGFACE_MODEL=fish-speech-1.4
5
+ ARG HF_ENDPOINT=https://huggingface.co
6
+
7
+ WORKDIR /opt/fish-speech
8
+
9
+ RUN set -ex \
10
+ && pip install huggingface_hub \
11
+ && HF_ENDPOINT=${HF_ENDPOINT} huggingface-cli download --resume-download fishaudio/${HUGGINGFACE_MODEL} --local-dir checkpoints/${HUGGINGFACE_MODEL}
12
+
13
+ FROM python:3.12-slim-bookworm
14
+ ARG TARGETARCH
15
+
16
+ ARG DEPENDENCIES=" \
17
+ ca-certificates \
18
+ libsox-dev \
19
+ ffmpeg"
20
+
21
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
22
+ --mount=type=cache,target=/var/lib/apt,sharing=locked \
23
+ set -ex \
24
+ && rm -f /etc/apt/apt.conf.d/docker-clean \
25
+ && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' >/etc/apt/apt.conf.d/keep-cache \
26
+ && apt-get update \
27
+ && apt-get -y install --no-install-recommends ${DEPENDENCIES} \
28
+ && echo "no" | dpkg-reconfigure dash
29
+
30
+ WORKDIR /opt/fish-speech
31
+
32
+ COPY . .
33
+
34
+ RUN --mount=type=cache,target=/root/.cache,sharing=locked \
35
+ set -ex \
36
+ && pip install -e .[stable]
37
+
38
+ COPY --from=stage-1 /opt/fish-speech/checkpoints /opt/fish-speech/checkpoints
39
+
40
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
41
+
42
+ EXPOSE 7860
43
+
44
+ CMD ["./entrypoint.sh"]
LICENSE ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
docker-compose.dev.yml ADDED
@@ -0,0 +1,16 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ fish-speech:
5
+ build: .
6
+ container_name: fish-speech
7
+ volumes:
8
+ - ./:/exp
9
+ deploy:
10
+ resources:
11
+ reservations:
12
+ devices:
13
+ - driver: nvidia
14
+ count: all
15
+ capabilities: [gpu]
16
+ command: tail -f /dev/null
dockerfile.dev ADDED
@@ -0,0 +1,33 @@
1
+ ARG VERSION=dev
2
+ ARG BASE_IMAGE=ghcr.io/fishaudio/fish-speech:${VERSION}
3
+
4
+ FROM ${BASE_IMAGE}
5
+
6
+ ARG TOOLS=" \
7
+ git \
8
+ curl \
9
+ build-essential \
10
+ ffmpeg \
11
+ libsm6 \
12
+ libxext6 \
13
+ libjpeg-dev \
14
+ zlib1g-dev \
15
+ aria2 \
16
+ zsh \
17
+ openssh-server \
18
+ sudo \
19
+ protobuf-compiler \
20
+ cmake"
21
+
22
+ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
23
+ --mount=type=cache,target=/var/lib/apt,sharing=locked \
24
+ set -ex \
25
+ && apt-get update \
26
+ && apt-get -y install --no-install-recommends ${TOOLS}
27
+
28
+ # Install oh-my-zsh so your terminal looks nice
29
+ RUN sh -c "$(curl https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)" "" --unattended
30
+
31
+ # Set zsh as default shell
32
+ RUN chsh -s /usr/bin/zsh
33
+ ENV SHELL=/usr/bin/zsh
docs/CNAME ADDED
@@ -0,0 +1 @@
+ speech.fish.audio
docs/assets/figs/VS_1.jpg ADDED
docs/assets/figs/VS_1_pt-BR.png ADDED
docs/assets/figs/diagram.png ADDED
docs/assets/figs/diagrama.png ADDED
docs/en/finetune.md ADDED
@@ -0,0 +1,125 @@
+ # Fine-tuning
+
+ If you are reading this page, you are probably not satisfied with the performance of the few-shot pre-trained model and want to fine-tune it to improve its performance on your own dataset.
+
+ In the current version, you only need to fine-tune the 'LLAMA' part.
+
+ ## Fine-tuning LLAMA
+ ### 1. Prepare the dataset
+
+ ```
+ .
+ ├── SPK1
+ │ ├── 21.15-26.44.lab
+ │ ├── 21.15-26.44.mp3
+ │ ├── 27.51-29.98.lab
+ │ ├── 27.51-29.98.mp3
+ │ ├── 30.1-32.71.lab
+ │ └── 30.1-32.71.mp3
+ └── SPK2
+ ├── 38.79-40.85.lab
+ └── 38.79-40.85.mp3
+ ```
+
+ You need to convert your dataset into the above format and place it under `data`. Audio files can have the extension `.mp3`, `.wav`, or `.flac`, and each annotation file should have the extension `.lab`.
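+
+ As a quick sanity check before extraction, here is a minimal sketch (not part of the original tooling; it assumes only the layout above and the Python standard library) that verifies every audio clip has a matching `.lab` transcript:
+
+ ```python
+ # check_pairs.py -- hypothetical helper, not shipped with fish-speech
+ from pathlib import Path
+
+ AUDIO_EXTS = {".mp3", ".wav", ".flac"}
+ data_root = Path("data")
+
+ missing = []
+ for audio in data_root.rglob("*"):
+     # Each clip must have a transcript with the same stem and a .lab extension.
+     if audio.suffix.lower() in AUDIO_EXTS and not audio.with_suffix(".lab").exists():
+         missing.append(audio)
+
+ print(f"{len(missing)} clip(s) missing a .lab transcript")
+ for clip in missing:
+     print(" -", clip)
+ ```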
+
+ !!! warning
+ It's recommended to apply loudness normalization to the dataset. You can use [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) to do this.
+
+ ```bash
+ fap loudness-norm data-raw data --clean
+ ```
+
+
+ ### 2. Batch extraction of semantic tokens
+
+ Make sure you have downloaded the VQGAN weights. If not, run the following command:
+
+ ```bash
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
+ ```
+
+ You can then run the following command to extract semantic tokens:
+
+ ```bash
+ python tools/vqgan/extract_vq.py data \
+ --num-workers 1 --batch-size 16 \
+ --config-name "firefly_gan_vq" \
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
+ ```
+
+ !!! note
+ You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.
+ For the VITS format, you can specify a file list using `--filelist xxx.list`.
+
+ This command will create `.npy` files in the `data` directory, as shown below:
+
+ ```
+ .
+ ├── SPK1
+ │ ├── 21.15-26.44.lab
+ │ ├── 21.15-26.44.mp3
+ │ ├── 21.15-26.44.npy
+ │ ├── 27.51-29.98.lab
+ │ ├── 27.51-29.98.mp3
+ │ ├── 27.51-29.98.npy
+ │ ├── 30.1-32.71.lab
+ │ ├── 30.1-32.71.mp3
+ │ └── 30.1-32.71.npy
+ └── SPK2
+ ├── 38.79-40.85.lab
+ ├── 38.79-40.85.mp3
+ └── 38.79-40.85.npy
+ ```
+
+ ### 3. Pack the dataset into protobuf
+
+ ```bash
+ python tools/llama/build_dataset.py \
+ --input "data" \
+ --output "data/protos" \
+ --text-extension .lab \
+ --num-workers 16
+ ```
+
+ After the command finishes executing, you should see the `quantized-dataset-ft.protos` file in the `data` directory.
+
+ ### 4. Finally, fine-tuning with LoRA
+
+ Similarly, make sure you have downloaded the `LLAMA` weights. If not, run the following command:
+
+ ```bash
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
+ ```
+
+ Finally, you can start the fine-tuning by running the following command:
+
+ ```bash
+ python fish_speech/train.py --config-name text2semantic_finetune \
+ project=$project \
+ +lora@model.model.lora_config=r_8_alpha_16
+ ```
+
+ !!! note
+ You can modify training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by editing `fish_speech/configs/text2semantic_finetune.yaml`.
+
+ !!! note
+ For Windows users, you can use `trainer.strategy.process_group_backend=gloo` to avoid `nccl` issues.
+
+ After training is complete, you can refer to the [inference](inference.md) section and use `--speaker SPK1` to generate speech.
+
+ !!! info
+ By default, the model will only learn the speaker's speech patterns and not the timbre. You still need to use prompts to ensure timbre stability.
+ If you want to learn the timbre, you can increase the number of training steps, but this may lead to overfitting.
+
+ After training, you need to convert the LoRA weights to regular weights before performing inference.
+
+ ```bash
+ python tools/llama/merge_lora.py \
+ --lora-config r_8_alpha_16 \
+ --base-weight checkpoints/fish-speech-1.4 \
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
+ --output checkpoints/fish-speech-1.4-yth-lora/
+ ```
+ !!! note
+ You may also try other checkpoints. We suggest using the earliest checkpoint that meets your requirements, as they often perform better on out-of-distribution (OOD) data.
docs/en/index.md ADDED
@@ -0,0 +1,133 @@
+ # Introduction
+
+ <div>
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
+ </a>
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
+ </a>
+ <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
+ </a>
+ </div>
+
+ !!! warning
+ We assume no responsibility for any illegal use of the codebase. Please refer to the local laws regarding the DMCA (Digital Millennium Copyright Act) and other relevant laws in your area. <br/>
+ This codebase and all models are released under the CC-BY-NC-SA-4.0 license.
+
+ <p align="center">
+ <img src="../assets/figs/diagram.png" width="75%">
+ </p>
+
+ ## Requirements
+
+ - GPU Memory: 4GB (for inference), 8GB (for fine-tuning)
+ - System: Linux, Windows
+
+ ## Windows Setup
+
+ Professional Windows users may consider using WSL2 or Docker to run the codebase.
+
+ ```bash
+ # Create a python 3.10 virtual environment, you can also use virtualenv
+ conda create -n fish-speech python=3.10
+ conda activate fish-speech
+
+ # Install pytorch
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+
+ # Install fish-speech
+ pip3 install -e .
+
+ # (Enable acceleration) Install triton-windows
+ pip install https://github.com/AnyaCoder/fish-speech/releases/download/v0.1.0/triton_windows-0.1.0-py3-none-any.whl
+ ```
+
+ Non-professional Windows users can use the following basic method to run the project without a Linux environment (including model compilation support, i.e. `torch.compile`):
+
+ 1. Extract the project package.
+ 2. Click `install_env.bat` to install the environment.
+ 3. If you want to enable compilation acceleration, follow these steps:
+ 1. Download the LLVM compiler from the following links:
+ - [LLVM-17.0.6 (Official Site Download)](https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
+ - [LLVM-17.0.6 (Mirror Site Download)](https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
+ - After downloading `LLVM-17.0.6-win64.exe`, double-click to install, select an appropriate installation location, and most importantly, check the `Add Path to Current User` option to add the environment variable.
+ - Confirm that the installation is complete.
+ 2. Download and install the Microsoft Visual C++ Redistributable to solve potential .dll missing issues:
+ - [MSVC++ 14.40.33810.0 Download](https://aka.ms/vs/17/release/vc_redist.x64.exe)
+ 3. Download and install Visual Studio Community Edition to get MSVC++ build tools and resolve LLVM's header file dependencies:
+ - [Visual Studio Download](https://visualstudio.microsoft.com/zh-hans/downloads/)
+ - After installing Visual Studio Installer, download Visual Studio Community 2022.
+ - As shown below, click the `Modify` button and find the `Desktop development with C++` option to select and download.
+ 4. Download and install [CUDA Toolkit 12.x](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64)
+ 4. Double-click `start.bat` to open the training/inference WebUI management interface. If needed, you can modify `API_FLAGS` as prompted below.
+
+ !!! info "Optional"
+
+ Want to start the inference WebUI?
+
+ Edit the `API_FLAGS.txt` file in the project root directory and modify the first three lines as follows:
+ ```
+ --infer
+ # --api
+ # --listen ...
+ ...
+ ```
+
+ !!! info "Optional"
+
+ Want to start the API server?
+
+ Edit the `API_FLAGS.txt` file in the project root directory and modify the first three lines as follows:
+
+ ```
+ # --infer
+ --api
+ --listen ...
+ ...
+ ```
+
+ !!! info "Optional"
+
+ Double-click `run_cmd.bat` to enter the conda/python command line environment of this project.
+
+ ## Linux Setup
+
+ ```bash
+ # Create a python 3.10 virtual environment, you can also use virtualenv
+ conda create -n fish-speech python=3.10
+ conda activate fish-speech
+
+ # Install pytorch
+ pip3 install torch torchvision torchaudio
+
+ # Install fish-speech
+ pip3 install -e .[stable]
+
+ # (Ubuntu / Debian User) Install sox
+ apt install libsox-dev
+ ```
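+
+ As a quick sanity check after either setup (a minimal sketch, not part of the original docs; it assumes PyTorch was installed as shown above), you can confirm that PyTorch sees your GPU:
+
+ ```python
+ # Print the installed torch version and whether a CUDA device is visible.
+ import torch
+
+ print("torch", torch.__version__)
+ print("CUDA available:", torch.cuda.is_available())
+ if torch.cuda.is_available():
+     print("device:", torch.cuda.get_device_name(0))
+ ```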
+
+ ## Changelog
+
+ - 2024/09/10: Updated Fish-Speech to version 1.4, with an increase in dataset size and a change of the quantizer's n_groups from 4 to 8.
+ - 2024/07/02: Updated Fish-Speech to version 1.2, removed the VITS decoder, and greatly enhanced zero-shot ability.
+ - 2024/05/10: Updated Fish-Speech to version 1.1, implemented a VITS decoder to reduce WER and improve timbre similarity.
+ - 2024/04/22: Finished Fish-Speech version 1.0, significantly modified the VQGAN and LLAMA models.
+ - 2023/12/28: Added `lora` fine-tuning support.
+ - 2023/12/27: Added `gradient checkpointing`, `causal sampling`, and `flash-attn` support.
+ - 2023/12/19: Updated webui and HTTP API.
+ - 2023/12/18: Updated fine-tuning documentation and related examples.
+ - 2023/12/17: Updated `text2semantic` model, supporting phoneme-free mode.
+ - 2023/12/13: Beta version released, including the VQGAN model and a language model based on LLAMA (phoneme support only).
+
+ ## Acknowledgements
+
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
+ - [MQTTS](https://github.com/b04901014/MQTTS)
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
+ - [Transformers](https://github.com/huggingface/transformers)
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/en/inference.md ADDED
@@ -0,0 +1,124 @@
+ # Inference
+
+ Inference is supported via the command line, HTTP API, and web UI.
+
+ !!! note
+ Overall, inference consists of several steps:
+
+ 1. Encode a given ~10 seconds of voice using VQGAN.
+ 2. Input the encoded semantic tokens and the corresponding text into the language model as an example.
+ 3. Given a new piece of text, let the model generate the corresponding semantic tokens.
+ 4. Input the generated semantic tokens into VITS / VQGAN to decode and generate the corresponding voice.
+
+ ## Command Line Inference
+
+ Download the required `vqgan` and `llama` models from our Hugging Face repository.
+
+ ```bash
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
+ ```
+
+ ### 1. Generate prompt from voice:
+
+ !!! note
+ If you plan to let the model randomly choose a voice timbre, you can skip this step.
+
+ ```bash
+ python tools/vqgan/inference.py \
+ -i "paimon.wav" \
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
+ ```
+
+ You should get a `fake.npy` file.
+
+ ### 2. Generate semantic tokens from text:
+
+ ```bash
+ python tools/llama/generate.py \
+ --text "The text you want to convert" \
+ --prompt-text "Your reference text" \
+ --prompt-tokens "fake.npy" \
+ --checkpoint-path "checkpoints/fish-speech-1.4" \
+ --num-samples 2 \
+ --compile
+ ```
+
+ This command will create a `codes_N` file in the working directory, where N is an integer starting from 0.
+
+ !!! note
+ You may want to use `--compile` to fuse CUDA kernels for faster inference (~30 tokens/second -> ~500 tokens/second).
+ Correspondingly, if you do not plan to use acceleration, you can omit the `--compile` parameter.
+
+ !!! info
+ For GPUs that do not support bf16, you may need to use the `--half` parameter.
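+
+ If you want to sanity-check the generated tokens before decoding, here is a minimal sketch (not part of the original docs; it assumes only that `numpy` is installed) that loads one of the `codes_N` files:
+
+ ```python
+ # Inspect the semantic tokens produced by tools/llama/generate.py.
+ import numpy as np
+
+ codes = np.load("codes_0.npy")
+ print("shape:", codes.shape, "dtype:", codes.dtype)
+ print("first tokens:", codes.reshape(-1)[:16])
+ ```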
+
+ ### 3. Generate speech from semantic tokens:
+
+ #### VQGAN Decoder
+
+ ```bash
+ python tools/vqgan/inference.py \
+ -i "codes_0.npy" \
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
+ ```
+
+ ## HTTP API Inference
+
+ We provide an HTTP API for inference. You can use the following command to start the server:
+
+ ```bash
+ python -m tools.api \
+ --listen 0.0.0.0:8080 \
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
+ --decoder-config-name firefly_gan_vq
+ ```
+
+ If you want to speed up inference, you can add the `--compile` parameter.
+
+ After that, you can view and test the API at http://127.0.0.1:8080/.
+
+ Below is an example of sending a request using `tools/post_api.py`.
+
+ ```bash
+ python -m tools.post_api \
+ --text "Text to be input" \
+ --reference_audio "Path to reference audio" \
+ --reference_text "Text content of the reference audio" \
+ --streaming True
+ ```
+
+ The above command synthesizes the desired audio from the reference audio information and returns it in a streaming manner.
+
+ The following example demonstrates that you can use **multiple** reference audio paths and reference audio texts at once. Separate them with spaces in the command.
+
+ ```bash
+ python -m tools.post_api \
+ --text "Text to input" \
+ --reference_audio "reference audio path1" "reference audio path2" \
+ --reference_text "reference audio text1" "reference audio text2" \
+ --streaming False \
+ --output "generated" \
+ --format "mp3"
+ ```
+
+ The above command synthesizes the desired `MP3` format audio based on the information from multiple reference audios and saves it as `generated.mp3` in the current directory.
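+
+ If you prefer to drive the client from Python (for example in a batch script), here is a minimal sketch (not part of the original docs; it simply wraps the documented `tools.post_api` invocation with the same flags as above and assumes it is run from the project root):
+
+ ```python
+ # Invoke the documented post_api client as a subprocess, reusing the flags shown above.
+ import subprocess
+
+ cmd = [
+     "python", "-m", "tools.post_api",
+     "--text", "Text to input",
+     "--reference_audio", "reference audio path1",
+     "--reference_text", "reference audio text1",
+     "--streaming", "False",
+     "--output", "generated",
+     "--format", "mp3",
+ ]
+ subprocess.run(cmd, check=True)
+ ```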
+
+ ## GUI Inference
+ [Download client](https://github.com/AnyaCoder/fish-speech-gui/releases/tag/v0.1.0)
+
+ ## WebUI Inference
+
+ You can start the WebUI using the following command:
+
+ ```bash
+ python -m tools.webui \
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
+ --decoder-config-name firefly_gan_vq
+ ```
+
+ !!! note
+ You can use Gradio environment variables, such as `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, and `GRADIO_SERVER_NAME`, to configure the WebUI.
+
+ Enjoy!
docs/en/samples.md ADDED
@@ -0,0 +1,223 @@
1
+ # Samples
2
+
3
+ v1.2 samples are available on [Bilibili](https://www.bilibili.com/video/BV1wz421B71D/).
4
+
5
+ The following samples are from the v1.1 model.
6
+
7
+ ## Chinese Sentence 1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>Speaker</th>
17
+ <th>Input Audio</th>
18
+ <th>Synthesized Audio</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>Nahida (Genshin Impact)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>Zhongli (Genshin Impact)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>Furina (Genshin Impact)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>Random Speaker 1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>Random Speaker 2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## Chinese Sentence 2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>Speaker</th>
64
+ <th>Input Audio</th>
65
+ <th>Synthesized Audio</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>Nahida (Genshin Impact)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>Random Speaker</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## Chinese Sentence 3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>Speaker</th>
95
+ <th>Input Audio</th>
96
+ <th>Synthesized Audio</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>Random Speaker</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## English Sentence 1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>Speaker</th>
121
+ <th>Input Audio</th>
122
+ <th>Synthesized Audio</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>Random Speaker 1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>Random Speaker 2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## English Sentence 2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>Speaker</th>
154
+ <th>Input Audio</th>
155
+ <th>Synthesized Audio</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>Random Speaker</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## Japanese Sentence 1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>Speaker</th>
180
+ <th>Input Audio</th>
181
+ <th>Synthesized Audio</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>Random Speaker 1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>Random Speaker 2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## Japanese Sentence 2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>Speaker</th>
212
+ <th>Input Audio</th>
213
+ <th>Synthesized Audio</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>Random Speaker</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
docs/ja/finetune.md ADDED
@@ -0,0 +1,125 @@
1
+ # 微調整
2
+
3
+ 明らかに、このページを開いたとき、few-shot 事前トレーニングモデルのパフォーマンスに満足していなかったことでしょう。データセット上でのパフォーマンスを向上させるためにモデルを微調整したいと考えています。
4
+
5
+ 現在のバージョンでは、「LLAMA」部分のみを微調整する必要があります。
6
+
7
+ ## LLAMAの微調整
8
+ ### 1. データセットの準備
9
+
10
+ ```
11
+ .
12
+ ├── SPK1
13
+ │ ├── 21.15-26.44.lab
14
+ │ ├── 21.15-26.44.mp3
15
+ │ ├── 27.51-29.98.lab
16
+ │ ├── 27.51-29.98.mp3
17
+ │ ├── 30.1-32.71.lab
18
+ │ └── 30.1-32.71.mp3
19
+ └── SPK2
20
+ ├── 38.79-40.85.lab
21
+ └── 38.79-40.85.mp3
22
+ ```
23
+
24
+ データセットを上記の形式に変換し、「data」ディレクトリに配置する必要があります。音声ファイルの拡張子は「.mp3」、「.wav」、または「.flac」にすることができ、注釈ファイルの拡張子は「.lab」にする必要があります。
25
+
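+ 参考までに、`.lab` ファイルは対応する音声の書き起こしテキストをそのまま記述したプレーンテキストファイルです。以下は内容を確認する仮の例です(ファイル名とテキストはあくまで例です):
+ 
+ ```bash
+ # 仮の例: .lab ファイルには対応する音声の書き起こしテキストを記述します
+ cat data/SPK1/21.15-26.44.lab
+ # => 今日はとても良い天気ですね。
+ ```
+ 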
26
+ !!! warning
27
+ データセットにラウドネス正規化を適用することをお勧めします。これを行うには、[fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) を使用できます。
28
+
29
+ ```bash
30
+ fap loudness-norm data-raw data --clean
31
+ ```
32
+
33
+
34
+ ### 2. セマンティックトークンのバッチ抽出
35
+
36
+ VQGANの重みをダウンロードしたことを確認してください。まだダウンロードしていない場合は、次のコマンドを実行してください。
37
+
38
+ ```bash
39
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
40
+ ```
41
+
42
+ 次に、次のコマンドを実行してセマンティックトークンを抽出できます。
43
+
44
+ ```bash
45
+ python tools/vqgan/extract_vq.py data \
46
+ --num-workers 1 --batch-size 16 \
47
+ --config-name "firefly_gan_vq" \
48
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
49
+ ```
50
+
51
+ !!! note
52
+ `--num-workers` と `--batch-size` を調整して抽出速度を上げることができますが、GPUメモリの制限を超えないようにしてください。
53
+ VITS形式の場合、`--filelist xxx.list` を使用してファイルリストを指定できます。
54
+
55
+ このコマンドは、`data`ディレクトリに`.npy`ファイルを作成します。以下のように表示されます。
56
+
57
+ ```
58
+ .
59
+ ├── SPK1
60
+ │ ├── 21.15-26.44.lab
61
+ │ ├── 21.15-26.44.mp3
62
+ │ ├── 21.15-26.44.npy
63
+ │ ├── 27.51-29.98.lab
64
+ │ ├── 27.51-29.98.mp3
65
+ │ ├── 27.51-29.98.npy
66
+ │ ├── 30.1-32.71.lab
67
+ │ ├── 30.1-32.71.mp3
68
+ │ └── 30.1-32.71.npy
69
+ └── SPK2
70
+ ├── 38.79-40.85.lab
71
+ ├── 38.79-40.85.mp3
72
+ └── 38.79-40.85.npy
73
+ ```
74
+
75
+ ### 3. データセットをprotobufにパックする
76
+
77
+ ```bash
78
+ python tools/llama/build_dataset.py \
79
+ --input "data" \
80
+ --output "data/protos" \
81
+ --text-extension .lab \
82
+ --num-workers 16
83
+ ```
84
+
85
+ コマンドの実行が完了すると、`data`ディレクトリに`quantized-dataset-ft.protos`ファイルが表示されます。
86
+
87
+ ### 4. 最後に、LoRAを使用して微調整する
88
+
89
+ 同様に、`LLAMA`の重みをダウンロードしたことを確認してください。まだダウンロードしていない場合は、次のコマンドを実行してください。
90
+
91
+ ```bash
92
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
93
+ ```
94
+
95
+ 最後に、次のコマンドを実行して微調整を開始できます。
96
+
97
+ ```bash
98
+ python fish_speech/train.py --config-name text2semantic_finetune \
99
+ project=$project \
100
+ +lora@model.model.lora_config=r_8_alpha_16
101
+ ```
102
+
103
+ !!! note
104
+ `fish_speech/configs/text2semantic_finetune.yaml` を変更して、`batch_size`、`gradient_accumulation_steps` などのトレーニングパラメータを変更し、GPUメモリに適合させることができます。
105
+
106
+ !!! note
107
+ Windowsユーザーの場合、`trainer.strategy.process_group_backend=gloo` を使用して `nccl` の問題を回避できます。
108
+
109
+ トレーニングが完了したら、[推論](inference.md)セクションを参照し、`--speaker SPK1` を使用して音声を生成します。
110
+
111
+ !!! info
112
+ デフォルトでは、モデルは話者の発話パターンのみを学習し、音色は学習しません。音色の安定性を確保するためにプロンプトを使用する必要があります。
113
+ 音色を学習したい場合は、トレーニングステップ数を増やすことができますが、これにより過学習が発生する可能性があります。
114
+
115
+ トレーニングが完了したら、推論を行う前にLoRAの重みを通常の重みに変換する必要があります。
116
+
117
+ ```bash
118
+ python tools/llama/merge_lora.py \
119
+ --lora-config r_8_alpha_16 \
120
+ --base-weight checkpoints/fish-speech-1.4 \
121
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
122
+ --output checkpoints/fish-speech-1.4-yth-lora/
123
+ ```
124
+ !!! note
125
+ 他のチェックポイントを試すこともできます。要件を満たす最も早いチェックポイントを使用することをお勧めします。これらは通常、分布外(OOD)データでより良いパフォーマンスを発揮します。
docs/ja/index.md ADDED
@@ -0,0 +1,128 @@
1
+ # Fish Speech の紹介
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ </div>
14
+
15
+ !!! warning
16
+ 私たちは、コードベースの違法な使用について一切の責任を負いません。お住まいの地域の DMCA(デジタルミレニアム著作権法)およびその他の関連法を参照してください。 <br/>
17
+ このコードベースとモデルは、CC-BY-NC-SA-4.0 ライセンス下でリリースされています。
18
+
19
+ <p align="center">
20
+ <img src="../assets/figs/diagram.png" width="75%">
21
+ </p>
22
+
23
+ ## 要件
24
+
25
+ - GPU メモリ: 4GB(推論用)、8GB(ファインチューニング用)
26
+ - システム: Linux、Windows
27
+
28
+ ## Windows セットアップ
29
+
30
+ Windows にて開発を行っている方へ: 本コードベースを実行するのに WSL2 または Docker を利用することができます。
31
+
32
+ あまり詳しくない方は、Linux 環境なしでコードベースを実行するために以下の手順に従ってください(モデルコンパイル機能 `torch.compile` も利用できます):
33
+
34
+ <ol>
35
+ <li>プロジェクトの圧縮ファイルをダウンロードし、展開</li>
36
+ <li><code>install_env.bat</code>を開いて実行に必要な環境を整えます。
37
+ <ul>
38
+ <li>ミラーサイトを使用するかどうかは、<code>install_env.bat</code>の<code>USE_MIRROR</code>項目を編集して設定してください。</li>
39
+ <li><code>USE_MIRROR=false</code>は、最新の安定版の<code>torch</code>をオリジナルサイトからダウンロードします。<code>USE_MIRROR=true</code>は、最新の<code>torch</code>をミラーサイトからダウンロードします。デフォルトは<code>true</code>です。</li>
40
+ <li><code>install_env.bat</code>の<code>INSTALL_TYPE</code>を編集して、コンパイル環境をダウンロードするかを設定できます。</li>
41
+ <li><code>INSTALL_TYPE=preview</code>は、コンパイル環境付きのプレビュー版をダウンロードします。<code>INSTALL_TYPE=stable</code>は、コンパイル環境なしの安定版をダウンロードします。</li>
42
+ </ul>
43
+ </li>
44
+ <li>(オプション)ステップ2で<code>INSTALL_TYPE=preview</code>を選択した場合、モデルコンパイル環境を有効にするために以下のステップを実行してください:
45
+ <ol>
46
+ <li>以下のリンクからLLVMコンパイラをダウンロードします:
47
+ <ul>
48
+ <li><a href="https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6(オリジナルサイト)</a></li>
49
+ <li><a href="https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6(ミラーサイト)</a></li>
50
+ <li><code>LLVM-17.0.6-win64.exe</code>をダウンロードした後、ダブルクリックして適切な場所にインストールしてください。その際、必ず<code>Add Path to Current User</code>にチェックを入れて環境変数に追加してください。</li>
51
+ <li>インストールが完了したことを確認してください。</li>
52
+ </ul>
53
+ </li>
54
+ <li>Microsoft Visual C++ 再頒布可能パッケージをダウンロードしてインストールし、dllの欠落問題を解決します。
55
+ <ul>
56
+ <li><a href="https://aka.ms/vs/17/release/vc_redist.x64.exe">MSVC++ 14.40.33810.0 ダウンロード</a></li>
57
+ </ul>
58
+ </li>
59
+ <li>Visual Studio Community Editionをダウンロードしてインストールし、MSVC++ビルドツールを取得し、LLVMのヘッダーファイル依存関係を解決します。
60
+ <ul>
61
+ <li><a href="https://visualstudio.microsoft.com/zh-hans/downloads/">Visual Studio ダウンロード</a></li>
62
+ <li>Visual Studio Installerをインストールした後、Visual Studio Community 2022をダウンロードします。</li>
63
+ <li>以下のスクリーンショットのように<code>Modify</code>ボタンをクリックし、<code>Desktop development with C++</code>オプションにチェックをつけてダウンロードします。</li>
64
+ <p align="center">
65
+ <img src="../assets/figs/VS_1.jpg" width="75%">
66
+ </p>
67
+ </ul>
68
+ </li>
69
+ <li>インストール <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
70
+ </ol>
71
+ </li>
72
+ <li><code>start.bat</code>を実行し、Fish-Speechのトレーニング/推論設定WebUIを開いてください。
73
+ <ul>
74
+ <li>(オプション)直接推論ページに行きたい場合は、プロジェクトルートディレクトリの<code>API_FLAGS.txt</code>の最初の3行を次のように変更してください:
75
+ <pre><code>--infer
76
+ # --api
77
+ # --listen ...
78
+ ...</code></pre>
79
+ </li>
80
+ <li>(オプション)APIサーバーを起動したい場合は、プロジェクトルートディレクトリの<code>API_FLAGS.txt</code>の最初の3行を次のように変更してください:
81
+ <pre><code># --infer
82
+ --api
83
+ --listen ...
84
+ ...</code></pre>
85
+ </li>
86
+ </ul>
87
+ </li>
88
+ <li>(オプション)<code>run_cmd.bat</code>をダブルクリックして、このプロジェクトの仮想環境を有効化できます。</li>
89
+ </ol>
90
+
91
+ ## Linux セットアップ
92
+
93
+ ```bash
94
+ # python 3.10の仮想環境を作成します。virtualenvも使用できます。
95
+ conda create -n fish-speech python=3.10
96
+ conda activate fish-speech
97
+
98
+ # pytorchをインストールします。
99
+ pip3 install torch torchvision torchaudio
100
+
101
+ # fish-speechをインストールします。
102
+ pip3 install -e .[stable]
103
+
104
+ # (Ubuntu / Debianユーザー) soxをインストールします。
105
+ apt install libsox-dev
106
+ ```
107
+
108
+ ## 変更履歴
109
+
110
+ - 2024/07/02: Fish-Speech を Ver.1.2 に更新し、VITS デコーダーを削除し、ゼロショット能力を大幅に強化しました。
111
+ - 2024/05/10: Fish-Speech を Ver.1.1 に更新し、VITS デコーダーを実装して WER を減少させ、音色の類似性を向上させました。
112
+ - 2024/04/22: Fish-Speech Ver.1.0 を完成させ、VQGAN および LLAMA モデルを大幅に修正しました。
113
+ - 2023/12/28: `lora`微調整サポートを追加しました。
114
+ - 2023/12/27: `gradient checkpointing`、`causual sampling`、および`flash-attn`サポートを追加しました。
115
+ - 2023/12/19: webui および HTTP API を更新しました。
116
+ - 2023/12/18: 微調整ドキュメントおよび関連例を更新しました。
117
+ - 2023/12/17: `text2semantic`モデルを更新し、自由音素モードをサポートしました。
118
+ - 2023/12/13: ベータ版をリリースし、VQGAN モデルおよび LLAMA に基づく言語モデル(音素のみサポート)を含みます。
119
+
120
+ ## 謝辞
121
+
122
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
123
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
124
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
125
+ - [MQTTS](https://github.com/b04901014/MQTTS)
126
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
127
+ - [Transformers](https://github.com/huggingface/transformers)
128
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/ja/inference.md ADDED
@@ -0,0 +1,157 @@
1
+ # 推論
2
+
3
+ 推論は、コマンドライン、HTTP API、および Web UI をサポートしています。
4
+
5
+ !!! note
6
+ 全体として、推論は次のいくつかの部分で構成されています:
7
+
8
+ 1. VQGANを使用して、与えられた約10秒の音声をエンコードします。
9
+ 2. エンコードされたセマンティックトークンと対応するテキストを例として言語モデルに入力します。
10
+ 3. 新しいテキストが与えられた場合、モデルに対応するセマンティックトークンを生成させます。
11
+ 4. 生成されたセマンティックトークンをVITS / VQGANに入力してデコードし、対応する音声を生成します。
12
+
13
+ ## コマンドライン推論
14
+
15
+ 必要な`vqgan`および`llama`モデルを Hugging Face リポジトリからダウンロードします。
16
+
17
+ ```bash
18
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
19
+ ```
20
+
21
+ ### 1. 音声からプロンプトを生成する:
22
+
23
+ !!! note
24
+ モデルにランダムに音声の音色を選ばせる場合、このステップをスキップできます。
25
+
26
+ ```bash
27
+ python tools/vqgan/inference.py \
28
+ -i "paimon.wav" \
29
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
30
+ ```
31
+
32
+ `fake.npy`ファイルが生成されるはずです。
33
+
34
+ ### 2. テキストからセマンティックトークンを生成する:
35
+
36
+ ```bash
37
+ python tools/llama/generate.py \
38
+ --text "変換したいテキスト" \
39
+ --prompt-text "参照テキスト" \
40
+ --prompt-tokens "fake.npy" \
41
+ --checkpoint-path "checkpoints/fish-speech-1.4" \
42
+ --num-samples 2 \
43
+ --compile
44
+ ```
45
+
46
+ このコマンドは、作業ディレクトリに`codes_N`ファイルを作成します。ここで、N は 0 から始まる整数です。
47
+
48
+ !!! note
49
+ `--compile`を使用して CUDA カーネルを融合し、より高速な推論を実現することができます(約 30 トークン/秒 -> 約 500 トークン/秒)。
50
+ それに対応して、加速を使用しない場合は、`--compile`パラメータをコメントアウトできます。
51
+
52
+ !!! info
53
+ bf16 をサポートしていない GPU の場合、`--half`パラメータを使用する必要があるかもしれません。
54
+
55
+ ### 3. セマンティックトークンから音声を生成する:
56
+
57
+ #### VQGAN デコーダー
58
+
59
+ ```bash
60
+ python tools/vqgan/inference.py \
61
+ -i "codes_0.npy" \
62
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
63
+ ```
64
+
65
+ ## HTTP API 推論
66
+
67
+ 推論のための HTTP API を提供しています。次のコマンドを使用してサーバーを起動できます:
68
+
69
+ ```bash
70
+ python -m tools.api \
71
+ --listen 0.0.0.0:8080 \
72
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
73
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
74
+ --decoder-config-name firefly_gan_vq
75
+ ```
76
+
77
+ 推論を高速化したい場合は、--compile パラメータを追加できます。
78
+
79
+ その後、`http://127.0.0.1:8080/`で API を表示およびテストできます。
80
+
81
+ 以下は、`tools/post_api.py` を使用してリクエストを送信する例です。
82
+
83
+ ```bash
84
+ python -m tools.post_api \
85
+ --text "入力するテキスト" \
86
+ --reference_audio "参照音声へのパス" \
87
+ --reference_text "参照音声テキスト" \
88
+ --streaming True
89
+ ```
90
+
91
+ 上記のコマンドは、参照音声の情報に基づいて必要な音声を合成し、ストリーミング方式で返すことを示しています。
92
+
93
+ `{SPEAKER}`と`{EMOTION}`に基づいて参照音声をランダムに選択する必要がある場合は、以下の手順に従って設定します:
94
+
95
+ ### 1. プロジェクトのルートディレクトリに`ref_data`フォルダを作成します。
96
+
97
+ ### 2. `ref_data`フォルダ内に次のような構造のディレクトリを作成します。
98
+
99
+ ```
100
+ .
101
+ ├── SPEAKER1
102
+ │ ├──EMOTION1
103
+ │ │ ├── 21.15-26.44.lab
104
+ │ │ ├── 21.15-26.44.wav
105
+ │ │ ├── 27.51-29.98.lab
106
+ │ │ ├── 27.51-29.98.wav
107
+ │ │ ├── 30.1-32.71.lab
108
+ │ │ └── 30.1-32.71.flac
109
+ │ └──EMOTION2
110
+ │ ├── 30.1-32.71.lab
111
+ │ └── 30.1-32.71.mp3
112
+ └── SPEAKER2
113
+ └─── EMOTION3
114
+ ├── 30.1-32.71.lab
115
+ └── 30.1-32.71.mp3
116
+
117
+ ```
118
+
119
+ つまり、まず`ref_data`に`{SPEAKER}`フォルダを配置し、各スピーカーの下に`{EMOTION}`フォルダを配置し、各感情フォルダの下に任意の数の音声-テキストペアを配置します。
120
+
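+ 参考までに、この構造を作成する仮のシェル例を示します(話者名・感情名・ファイル名はすべて例であり、実際のデータに置き換えてください):
+ 
+ ```bash
+ # 仮の例: 話者 "alice" の感情 "happy" 用に音声とテキストのペアを配置します
+ mkdir -p ref_data/alice/happy
+ cp /path/to/sample.wav ref_data/alice/happy/sample.wav
+ echo "参照音声の書き起こしテキスト" > ref_data/alice/happy/sample.lab
+ ```
+ 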
121
+ ### 3. 仮想環境で以下のコマンドを入力します.
122
+
123
+ ```bash
124
+ python tools/gen_ref.py
125
+
126
+ ```
127
+
128
+ 参照ディレクトリを生成します。
129
+
130
+ ### 4. API を呼び出します。
131
+
132
+ ```bash
133
+ python -m tools.post_api \
134
+ --text "入力��るテキスト" \
135
+ --speaker "${SPEAKER1}" \
136
+ --emotion "${EMOTION1}" \
137
+ --streaming True
138
+
139
+ ```
140
+
141
+ 上記の例はテスト目的のみです。
142
+
143
+ ## WebUI 推論
144
+
145
+ 次のコマンドを使用して WebUI を起動できます:
146
+
147
+ ```bash
148
+ python -m tools.webui \
149
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
150
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
151
+ --decoder-config-name firefly_gan_vq
152
+ ```
153
+
154
+ !!! note
155
+ Gradio 環境変数(`GRADIO_SHARE`、`GRADIO_SERVER_PORT`、`GRADIO_SERVER_NAME`など)を使用して WebUI を構成できます。
156
+
157
+ お楽しみください!
docs/ja/samples.md ADDED
@@ -0,0 +1,223 @@
1
+ # サンプル
2
+
3
+ v1.2のサンプルは[Bilibili](https://www.bilibili.com/video/BV1wz421B71D/)で利用可能です。
4
+
5
+ 以下のサンプルはv1.1モデルからのものです。
6
+
7
+ ## 中国語の文1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>話者</th>
17
+ <th>入力音声</th>
18
+ <th>合成音声</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>ナヒーダ (原神)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>鍾離 (原神)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>フリナ (原神)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>ランダム話者1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>ランダム話者2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## 中国語の文2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>話者</th>
64
+ <th>入力音声</th>
65
+ <th>合成音声</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>ナヒーダ (原神)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>ランダム話者</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## 中国語の文3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>話者</th>
95
+ <th>入力音声</th>
96
+ <th>合成音声</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>ランダム話者</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## 英語の文1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>話者</th>
121
+ <th>入力音声</th>
122
+ <th>合成音声</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>ランダム話者1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>ランダム話者2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## 英語の文2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>話者</th>
154
+ <th>入力音声</th>
155
+ <th>合成音声</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>ランダム話者</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## 日本語の文1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>話者</th>
180
+ <th>入力音声</th>
181
+ <th>合成音声</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>ランダム話者1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>ランダム話者2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## 日本語の文2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>話者</th>
212
+ <th>入力音声</th>
213
+ <th>合成音声</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>ランダム話者</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
docs/pt/finetune.md ADDED
@@ -0,0 +1,125 @@
1
+ # Ajuste Fino
2
+
3
+ É óbvio que ao abrir esta página, você não deve estar muito satisfeito com o desempenho do modelo pré-treinado com poucos exemplos. Você pode querer ajustar o modelo para melhorar seu desempenho em seu conjunto de dados.
4
+
5
+ Na atual versão, a única coisa que você precisa ajustar é a parte do 'LLAMA'.
6
+
7
+ ## Ajuste Fino do LLAMA
8
+ ### 1. Preparando o conjunto de dados
9
+
10
+ ```
11
+ .
12
+ ├── SPK1
13
+ │ ├── 21.15-26.44.lab
14
+ │ ├── 21.15-26.44.mp3
15
+ │ ├── 27.51-29.98.lab
16
+ │ ├── 27.51-29.98.mp3
17
+ │ ├── 30.1-32.71.lab
18
+ │ └── 30.1-32.71.mp3
19
+ └── SPK2
20
+ ├── 38.79-40.85.lab
21
+ └── 38.79-40.85.mp3
22
+ ```
23
+
24
+ Você precisa converter seu conjunto de dados para o formato acima e colocá-lo em `data`. O arquivo de áudio pode ter as extensões `.mp3`, `.wav` ou `.flac`, e o arquivo de anotação deve ter a extensão `.lab`.
25
+
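+ A título de ilustração, o arquivo `.lab` é um arquivo de texto simples contendo a transcrição do áudio correspondente (o nome do arquivo e o texto abaixo são apenas um exemplo hipotético):
+ 
+ ```bash
+ # Exemplo hipotético: o arquivo .lab contém a transcrição do áudio correspondente
+ cat data/SPK1/21.15-26.44.lab
+ # => Hoje o tempo está muito bom.
+ ```
+ 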
26
+ !!! warning
27
+ É recomendado aplicar normalização de volume ao conjunto de dados. Você pode usar o [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) para fazer isso.
28
+
29
+ ```bash
30
+ fap loudness-norm data-raw data --clean
31
+ ```
32
+
33
+
34
+ ### 2. Extração em lote de tokens semânticos
35
+
36
+ Certifique-se de ter baixado os pesos do VQGAN. Se não, execute o seguinte comando:
37
+
38
+ ```bash
39
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
40
+ ```
41
+
42
+ Em seguida, você pode executar o seguinte comando para extrair os tokens semânticos:
43
+
44
+ ```bash
45
+ python tools/vqgan/extract_vq.py data \
46
+ --num-workers 1 --batch-size 16 \
47
+ --config-name "firefly_gan_vq" \
48
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
49
+ ```
50
+
51
+ !!! note
52
+ Você pode ajustar `--num-workers` e `--batch-size` para aumentar a velocidade de extração, mas certifique-se de não exceder o limite de memória da sua GPU.  
53
+ Para o formato VITS, você pode especificar uma lista de arquivos usando `--filelist xxx.list`.
54
+
55
+ Este comando criará arquivos `.npy` no diretório `data`, como mostrado abaixo:
56
+
57
+ ```
58
+ .
59
+ ├── SPK1
60
+ │ ├── 21.15-26.44.lab
61
+ │ ├── 21.15-26.44.mp3
62
+ │ ├── 21.15-26.44.npy
63
+ │ ├── 27.51-29.98.lab
64
+ │ ├── 27.51-29.98.mp3
65
+ │ ├── 27.51-29.98.npy
66
+ │ ├── 30.1-32.71.lab
67
+ │ ├── 30.1-32.71.mp3
68
+ │ └── 30.1-32.71.npy
69
+ └── SPK2
70
+ ├── 38.79-40.85.lab
71
+ ├── 38.79-40.85.mp3
72
+ └── 38.79-40.85.npy
73
+ ```
74
+
75
+ ### 3. Empacotar o conjunto de dados em protobuf
76
+
77
+ ```bash
78
+ python tools/llama/build_dataset.py \
79
+ --input "data" \
80
+ --output "data/protos" \
81
+ --text-extension .lab \
82
+ --num-workers 16
83
+ ```
84
+
85
+ Após executar o comando, você deverá ver o arquivo `quantized-dataset-ft.protos` no diretório `data`.
86
+
87
+ ### 4. E finalmente, chegamos ao ajuste fino com LoRA
88
+
89
+ Da mesma forma, certifique-se de ter baixado os pesos do `LLAMA`. Se não, execute o seguinte comando:
90
+
91
+ ```bash
92
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
93
+ ```
94
+
95
+ E então, execute o seguinte comando para iniciar o ajuste fino:
96
+
97
+ ```bash
98
+ python fish_speech/train.py --config-name text2semantic_finetune \
99
+ project=$project \
100
+ +lora@model.model.lora_config=r_8_alpha_16
101
+ ```
102
+
103
+ !!! note
104
+ Se quiser, você pode modificar os parâmetros de treinamento, como `batch_size`, `gradient_accumulation_steps`, etc., para se ajustar à memória da sua GPU, modificando `fish_speech/configs/text2semantic_finetune.yaml`.
105
+
106
+ !!! note
107
+ Para usuários do Windows, é recomendado usar `trainer.strategy.process_group_backend=gloo` para evitar problemas com `nccl`.
108
+
109
+ Após concluir o treinamento, consulte a seção [inferência](inference.md), e use `--speaker SPK1` para gerar fala.
110
+
111
+ !!! info
112
+ Por padrão, o modelo aprenderá apenas os padrões de fala do orador e não o timbre. Ainda pode ser preciso usar prompts para garantir a estabilidade do timbre.
113
+ Se quiser que ele aprenda o timbre, aumente o número de etapas de treinamento, mas isso pode levar ao overfitting (sobreajuste).
114
+
115
+ Após o treinamento, é preciso converter os pesos do LoRA em pesos regulares antes de realizar a inferência.
116
+
117
+ ```bash
118
+ python tools/llama/merge_lora.py \
119
+ --lora-config r_8_alpha_16 \
120
+ --base-weight checkpoints/fish-speech-1.4 \
121
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
122
+ --output checkpoints/fish-speech-1.4-yth-lora/
123
+ ```
124
+ !!! note
125
+ É possível também tentar outros checkpoints. Sugerimos usar o checkpoint que melhor atenda aos seus requisitos, pois eles geralmente têm um desempenho melhor em dados fora da distribuição (OOD).
docs/pt/index.md ADDED
@@ -0,0 +1,131 @@
1
+ # Introdução
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ </div>
14
+
15
+ !!! warning
16
+ Não nos responsabilizamos por qualquer uso ilegal do código-fonte. Consulte as leis locais sobre DMCA (Digital Millennium Copyright Act) e outras leis relevantes em sua região. <br/>
17
+ Este repositório de código e os modelos são distribuídos sob a licença CC-BY-NC-SA-4.0.
18
+
19
+ <p align="center">
20
+ <img src="../assets/figs/diagrama.png" width="75%">
21
+ </p>
22
+
23
+ ## Requisitos
24
+
25
+ - Memória da GPU: 4GB (para inferência), 8GB (para ajuste fino)
26
+ - Sistema: Linux, Windows
27
+
28
+ ## Configuração para Windows
29
+
30
+ No Windows, usuários avançados podem considerar usar o WSL2 ou Docker para executar o código.
31
+
32
+ Para Usuários comuns (não-avançados), siga os métodos abaixo para executar o código sem um ambiente Linux (incluindo suporte para `torch.compile`):
33
+
34
+ <ol>
35
+ <li>Extraia o arquivo compactado do projeto.</li>
36
+ <li>Prepare o ambiente conda:
37
+ <ul>
38
+ <li>Abra o <code>install_env.bat</code> para baixar e iniciar a instalação do miniconda.</li>
39
+ <li>Personalize o download (opcional):
40
+ <ul>
41
+ <li><b>Site espelho:</b> Para usar um site espelho para downloads mais rápidos, defina <code>USE_MIRROR=true</code> no <code>install_env.bat</code> (padrão). Caso contrário, use <code>USE_MIRROR=false</code>.</li>
42
+ <li><b>Ambiente compilado:</b> Para baixar a versão de prévia com o ambiente compilado, defina <code>INSTALL_TYPE=preview</code>. Para a versão estável sem ambiente compilado, use <code>INSTALL_TYPE=stable</code>.</li>
43
+ </ul>
44
+ </li>
45
+ </ul>
46
+ </li>
47
+ <li>Se você escolheu a versão de prévia com ambiente compilado (<code>INSTALL_TYPE=preview</code>), siga para a próxima etapa (opcional):
48
+ <ol>
49
+ <li>Baixe o compilador LLVM usando os seguintes links:
50
+ <ul>
51
+ <li><a href="https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6 (download do site original)</a></li>
52
+ <li><a href="https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6 (download do site espelho)</a></li>
53
+ <li>Após baixar o <code>LLVM-17.0.6-win64.exe</code>, clique duas vezes para instalá-lo, escolha um local de instalação apropriado. E durante a instalação, marque a opção <code>Add Path to Current User</code> para adicionar às variáveis de ambiente.</li>
54
+ <li>Confirme se a instalação foi concluída.</li>
55
+ </ul>
56
+ </li>
57
+ <li>Baixe e instale o pacote Microsoft Visual C++ Redistributable para resolver possíveis problemas de .dll ausentes.
58
+ <ul>
59
+ <li><a href="https://aka.ms/vs/17/release/vc_redist.x64.exe">Download do MSVC++ 14.40.33810.0</a></li>
60
+ </ul>
61
+ </li>
62
+ <li>Baixe e instale o Visual Studio Community Edition para obter as ferramentas de compilação MSVC++, resolvendo as dependências do arquivo de cabeçalho LLVM.
63
+ <ul>
64
+ <li><a href="https://visualstudio.microsoft.com/pt-br/downloads/">Download do Visual Studio</a></li>
65
+ <li>Após instalar o Visual Studio Installer, baixe o Visual Studio Community 2022.</li>
66
+ <li>Clique no botão <code>Modificar</code>, conforme mostrado abaixo, encontre a opção <code>Desenvolvimento para desktop com C++</code> e marque-a para download.</li>
67
+ <p align="center">
68
+ <img src="../assets/figs/VS_1_pt-BR.png" width="75%">
69
+ </p>
70
+ </ul>
71
+ </li>
72
+ <li>Instale o <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
73
+ </ol>
74
+ </li>
75
+ <li>Clique duas vezes em <code>start.bat</code> para entrar na página da WebUI de configuração de inferência de treinamento do Fish-Speech.
76
+ <ul>
77
+ <li>(Opcional) Se desejar ir direto para a página de inferência, edite o arquivo <code>API_FLAGS.txt</code> no diretório raiz do projeto e modifique as três primeiras linhas da seguinte forma:
78
+ <pre><code>--infer
79
+ # --api
80
+ # --listen ...
81
+ ...</code></pre>
82
+ </li>
83
+ <li>(Opcional) Se preferir iniciar o servidor da API, edite o arquivo <code>API_FLAGS.txt</code> no diretório raiz do projeto e modifique as três primeiras linhas da seguinte forma:
84
+ <pre><code># --infer
85
+ --api
86
+ --listen ...
87
+ ...</code></pre>
88
+ </li>
89
+ </ul>
90
+ </li>
91
+ <li>(Opcional) Clique duas vezes em <code>run_cmd.bat</code> para entrar na CLI do conda/python deste projeto.</li>
92
+ </ol>
93
+
94
+ ## Configuração para Linux
95
+
96
+ ```bash
97
+ # Crie um ambiente virtual python 3.10, você também pode usar virtualenv
98
+ conda create -n fish-speech python=3.10
99
+ conda activate fish-speech
100
+
101
+ # Instale o pytorch
102
+ pip3 install torch torchvision torchaudio
103
+
104
+ # Instale o fish-speech
105
+ pip3 install -e .[stable]
106
+
107
+ # Para os usuários do Ubuntu / Debian: Instale o sox
108
+ apt install libsox-dev
109
+ ```
110
+
111
+ ## Histórico de Alterações
112
+
113
+ - 02/07/2024: Fish-Speech atualizado para a versão 1.2, removido o Decodificador VITS e aprimorado consideravelmente a capacidade de zero-shot.
114
+ - 10/05/2024: Fish-Speech atualizado para a versão 1.1, implementado o decodificador VITS para reduzir a WER e melhorar a similaridade de timbre.
115
+ - 22/04/2024: Finalizada a versão 1.0 do Fish-Speech, modificados significativamente os modelos VQGAN e LLAMA.
116
+ - 28/12/2023: Adicionado suporte para ajuste fino `lora`.
117
+ - 27/12/2023: Adicionado suporte para `gradient checkpointing`, `causual sampling` e `flash-attn`.
118
+ - 19/12/2023: Atualizada a interface web e a API HTTP.
119
+ - 18/12/2023: Atualizada a documentação de ajuste fino e exemplos relacionados.
120
+ - 17/12/2023: Atualizado o modelo `text2semantic`, suportando o modo sem fonemas.
121
+ - 13/12/2023: Versão beta lançada, incluindo o modelo VQGAN e um modelo de linguagem baseado em LLAMA (suporte apenas a fonemas).
122
+
123
+ ## Agradecimentos
124
+
125
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
126
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
127
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
128
+ - [MQTTS](https://github.com/b04901014/MQTTS)
129
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
130
+ - [Transformers](https://github.com/huggingface/transformers)
131
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/pt/inference.md ADDED
@@ -0,0 +1,153 @@
1
+ # Inferência
2
+
3
+ Suporte para inferência por linha de comando, API HTTP e interface web (WebUI).
4
+
5
+ !!! note
6
+ O processo de inferência, em geral, consiste em várias partes:
7
+
8
+ 1. Codificar cerca de 10 segundos de voz usando VQGAN.
9
+ 2. Inserir os tokens semânticos codificados e o texto correspondente no modelo de linguagem como um exemplo.
10
+ 3. Dado um novo trecho de texto, fazer com que o modelo gere os tokens semânticos correspondentes.
11
+ 4. Inserir os tokens semânticos gerados no VITS / VQGAN para decodificar e gerar a voz correspondente.
12
+
13
+ ## Inferência por Linha de Comando
14
+
15
+ Baixe os modelos `vqgan` e `llama` necessários do nosso repositório Hugging Face.
16
+
17
+ ```bash
18
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
19
+ ```
20
+
21
+ ### 1. Gerar prompt a partir da voz:
22
+
23
+ !!! note
24
+ Se quiser permitir que o modelo escolha aleatoriamente um timbre de voz, pule esta etapa.
25
+
26
+ ```bash
27
+ python tools/vqgan/inference.py \
28
+ -i "paimon.wav" \
29
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
30
+ ```
31
+
32
+ Você deverá obter um arquivo `fake.npy`.
33
+
34
+ ### 2. Gerar tokens semânticos a partir do texto:
35
+
36
+ ```bash
37
+ python tools/llama/generate.py \
38
+ --text "O texto que você deseja converter" \
39
+ --prompt-text "Seu texto de referência" \
40
+ --prompt-tokens "fake.npy" \
41
+ --checkpoint-path "checkpoints/fish-speech-1.4" \
42
+ --num-samples 2 \
43
+ --compile
44
+ ```
45
+
46
+ Este comando criará um arquivo `codes_N` no diretório de trabalho, onde N é um número inteiro começando de 0.
47
+
48
+ !!! note
49
+ Use `--compile` para fundir kernels CUDA para ter uma inferência mais rápida (~30 tokens/segundo -> ~500 tokens/segundo).
50
+ Mas, se não planeja usar a aceleração CUDA, comente o parâmetro `--compile`.
51
+
52
+ !!! info
53
+ Para GPUs que não suportam bf16, pode ser necessário usar o parâmetro `--half`.
54
+
55
+ ### 3. Gerar vocais a partir de tokens semânticos:
56
+
57
+ #### Decodificador VQGAN
58
+
59
+ ```bash
60
+ python tools/vqgan/inference.py \
61
+ -i "codes_0.npy" \
62
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
63
+ ```
64
+
65
+ ## Inferência por API HTTP
66
+
67
+ Fornecemos uma API HTTP para inferência. O seguinte comando pode ser usado para iniciar o servidor:
68
+
69
+ ```bash
70
+ python -m tools.api \
71
+ --listen 0.0.0.0:8080 \
72
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
73
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
74
+ --decoder-config-name firefly_gan_vq
75
+ ```
76
+
77
+ Para acelerar a inferência, adicione o parâmetro `--compile`.
78
+
79
+ Depois disso, é possível visualizar e testar a API em http://127.0.0.1:8080/.
80
+
81
+ Abaixo está um exemplo de envio de uma solicitação usando `tools/post_api.py`.
82
+
83
+ ```bash
84
+ python -m tools.post_api \
85
+ --text "Texto a ser inserido" \
86
+ --reference_audio "Caminho para o áudio de referência" \
87
+ --reference_text "Conteúdo de texto do áudio de referência" \
88
+ --streaming True
89
+ ```
90
+
91
+ O comando acima sintetiza o áudio desejado de acordo com as informações do áudio de referência e o retorna em modo de streaming.
92
+
93
+ Caso precise selecionar aleatoriamente o áudio de referência com base em `{SPEAKER}` e `{EMOTION}`, configure-o de acordo com as seguintes etapas:
94
+
95
+ ### 1. Crie uma pasta `ref_data` no diretório raiz do projeto.
96
+
97
+ ### 2. Crie uma estrutura de diretórios semelhante à seguinte dentro da pasta `ref_data`.
98
+
99
+ ```
100
+ .
101
+ ├── SPEAKER1
102
+ │ ├──EMOTION1
103
+ │ │ ├── 21.15-26.44.lab
104
+ │ │ ├── 21.15-26.44.wav
105
+ │ │ ├── 27.51-29.98.lab
106
+ │ │ ├── 27.51-29.98.wav
107
+ │ │ ├── 30.1-32.71.lab
108
+ │ │ └── 30.1-32.71.flac
109
+ │ └──EMOTION2
110
+ │ ├── 30.1-32.71.lab
111
+ │ └── 30.1-32.71.mp3
112
+ └── SPEAKER2
113
+ └─── EMOTION3
114
+ ├── 30.1-32.71.lab
115
+ └── 30.1-32.71.mp3
116
+ ```
117
+
118
+ Ou seja, primeiro coloque as pastas `{SPEAKER}` em `ref_data`, depois coloque as pastas `{EMOTION}` em cada pasta de orador (speaker) e coloque qualquer número de `pares áudio-texto` em cada pasta de emoção.
119
+
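+ A título de ilustração, um esboço hipotético de como criar essa estrutura (os nomes de orador, emoção e arquivos são apenas ilustrativos):
+ 
+ ```bash
+ # Exemplo hipotético: par áudio-texto para o orador "alice" com a emoção "happy"
+ mkdir -p ref_data/alice/happy
+ cp /caminho/para/amostra.wav ref_data/alice/happy/amostra.wav
+ echo "Transcrição do áudio de referência" > ref_data/alice/happy/amostra.lab
+ ```
+ 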
120
+ ### 3. Digite o seguinte comando no ambiente virtual
121
+
122
+ ```bash
123
+ python tools/gen_ref.py
124
+
125
+ ```
126
+
127
+ ### 4. Chame a API.
128
+
129
+ ```bash
130
+ python -m tools.post_api \
131
+ --text "Texto a ser inserido" \
132
+ --speaker "${SPEAKER1}" \
133
+ --emotion "${EMOTION1}" \
134
+ --streaming True
135
+ ```
136
+
137
+ O exemplo acima é apenas para fins de teste.
138
+
139
+ ## Inferência por WebUI
140
+
141
+ Para iniciar a WebUI de Inferência execute o seguinte comando:
142
+
143
+ ```bash
144
+ python -m tools.webui \
145
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
146
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
147
+ --decoder-config-name firefly_gan_vq
148
+ ```
149
+
150
+ !!! note
151
+ É possível usar variáveis de ambiente do Gradio, como `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME`, para configurar a WebUI.
152
+
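+ Por exemplo, um esboço de como iniciar a WebUI em um endereço e porta específicos usando essas variáveis (os valores abaixo são apenas ilustrativos):
+ 
+ ```bash
+ # Exemplo hipotético: expor a WebUI em todas as interfaces, na porta 7860
+ GRADIO_SERVER_NAME="0.0.0.0" GRADIO_SERVER_PORT=7860 \
+ python -m tools.webui \
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
+ --decoder-config-name firefly_gan_vq
+ ```
+ 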
153
+ Divirta-se!
docs/pt/samples.md ADDED
@@ -0,0 +1,223 @@
1
+ # Amostras
2
+
3
+ As amostras da v1.2 estão disponíveis em [Bilibili](https://www.bilibili.com/video/BV1wz421B71D/).
4
+
5
+ As seguintes amostras são do modelo v1.1.
6
+
7
+ ## Frase em Chinês 1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>Orador</th>
17
+ <th>Áudio de Entrada</th>
18
+ <th>Áudio Sintetizado</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>Nahida (Genshin Impact)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>Zhongli (Genshin Impact)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>Furina (Genshin Impact)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>Orador Aleatório 1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>Orador Aleatório 2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## Frase em Chinês 2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>Orador</th>
64
+ <th>Áudio de Entrada</th>
65
+ <th>Áudio Sintetizado</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>Nahida (Genshin Impact)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>Orador Aleatório</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## Frase em Chinês 3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>Orador</th>
95
+ <th>Áudio de Entrada</th>
96
+ <th>Áudio Sintetizado</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>Orador Aleatório</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## Frase em Inglês 1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>Orador</th>
121
+ <th>Áudio de Entrada</th>
122
+ <th>Áudio Sintetizado</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>Orador Aleatório 1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>Orador Aleatório 2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## Frase em Inglês 2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>Orador</th>
154
+ <th>Áudio de Entrada</th>
155
+ <th>Áudio Sintetizado</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>Orador Aleatório</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## Frase em Japonês 1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>Orador</th>
180
+ <th>Áudio de Entrada</th>
181
+ <th>Áudio Sintetizado</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>Orador Aleatório 1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>Orador Aleatório 2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## Frase em Japonês 2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>Orador</th>
212
+ <th>Áudio de Entrada</th>
213
+ <th>Áudio Sintetizado</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>Orador Aleatório</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
docs/requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ mkdocs-material
2
+ mkdocs-static-i18n[material]
3
+ mkdocs[i18n]
docs/stylesheets/extra.css ADDED
@@ -0,0 +1,3 @@
1
+ .md-grid {
2
+ max-width: 1440px;
3
+ }
docs/zh/finetune.md ADDED
@@ -0,0 +1,136 @@
1
+ # 微调
2
+
3
+ 显然, 当你打开这个页面的时候, 你已经对预训练模型 zero-shot 的效果不算满意. 你想要微调一个模型, 使得它在你的数据集上表现更好.
4
+
5
+ 在目前版本,你只需要微调'LLAMA'部分即可.
6
+
7
+ ## LLAMA 微调
8
+ ### 1. 准备数据集
9
+
10
+ ```
11
+ .
12
+ ├── SPK1
13
+ │ ├── 21.15-26.44.lab
14
+ │ ├── 21.15-26.44.mp3
15
+ │ ├── 27.51-29.98.lab
16
+ │ ├── 27.51-29.98.mp3
17
+ │ ├── 30.1-32.71.lab
18
+ │ └── 30.1-32.71.mp3
19
+ └── SPK2
20
+ ├── 38.79-40.85.lab
21
+ └── 38.79-40.85.mp3
22
+ ```
23
+
24
+ 你需要将数据集转为以上格式, 并放到 `data` 下, 音频后缀可以为 `.mp3`, `.wav` 或 `.flac`, 标注文件后缀建议为 `.lab`.
25
+
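+ 作为参考, `.lab` 标注文件就是一个纯文本文件, 内容为对应音频的转写文本 (以下文件名与文本仅为假设的示例):
+ 
+ ```bash
+ # 假设示例: .lab 文件中保存对应音频的转写文本
+ cat data/SPK1/21.15-26.44.lab
+ # => 今天的天气真不错。
+ ```
+ 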
26
+ !!! warning
27
+ 建议先对数据集进行响度匹配, 你可以使用 [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) 来完成这一步骤.
28
+ ```bash
29
+ fap loudness-norm data-raw data --clean
30
+ ```
31
+
32
+ ### 2. 批量提取语义 token
33
+
34
+ 确保你已经下载了 vqgan 权重, 如果没有, 请运行以下命令:
35
+
36
+ ```bash
37
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
38
+ ```
39
+
40
+ 对于中国大陆用户, 可使用 mirror 下载.
41
+
42
+ ```bash
43
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
44
+ ```
45
+
46
+ 随后可运行以下命令来提取语义 token:
47
+
48
+ ```bash
49
+ python tools/vqgan/extract_vq.py data \
50
+ --num-workers 1 --batch-size 16 \
51
+ --config-name "firefly_gan_vq" \
52
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
53
+ ```
54
+
55
+ !!! note
56
+ 你可以调整 `--num-workers` 和 `--batch-size` 来提高提取速度, 但是请注意不要超过你的显存限制.
57
+
58
+ 该命令会在 `data` 目录下创建 `.npy` 文件, 如下所示:
59
+
60
+ ```
61
+ .
62
+ ├── SPK1
63
+ │ ├── 21.15-26.44.lab
64
+ │ ├── 21.15-26.44.mp3
65
+ │ ├── 21.15-26.44.npy
66
+ │ ├── 27.51-29.98.lab
67
+ │ ├── 27.51-29.98.mp3
68
+ │ ├── 27.51-29.98.npy
69
+ │ ├── 30.1-32.71.lab
70
+ │ ├── 30.1-32.71.mp3
71
+ │ └── 30.1-32.71.npy
72
+ └── SPK2
73
+ ├── 38.79-40.85.lab
74
+ ├── 38.79-40.85.mp3
75
+ └── 38.79-40.85.npy
76
+ ```
77
+
78
+ ### 3. 打包数据集为 protobuf
79
+
80
+ ```bash
81
+ python tools/llama/build_dataset.py \
82
+ --input "data" \
83
+ --output "data/protos" \
84
+ --text-extension .lab \
85
+ --num-workers 16
86
+ ```
87
+
88
+ 命令执行完毕后, 你应该能在 `data` 目录下看到 `protos` 文件.
89
+
90
+
91
+ ### 4. 最后, 使用 LoRA 进行微调
92
+
93
+ 同样的, 请确保你已经下载了 `LLAMA` 权重, 如果没有, 请运行以下命令:
94
+
95
+ ```bash
96
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
97
+ ```
98
+
99
+ 对于中国大陆用户, 可使用 mirror 下载.
100
+
101
+ ```bash
102
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
103
+ ```
104
+
105
+ 最后, 你可以运行以下命令来启动微调:
106
+
107
+ ```bash
108
+ python fish_speech/train.py --config-name text2semantic_finetune \
109
+ project=$project \
110
+ +lora@model.model.lora_config=r_8_alpha_16
111
+ ```
112
+
113
+ !!! note
114
+ 你可以通过修改 `fish_speech/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存.
115
+
116
+ !!! note
117
+ 对于 Windows 用户, 你可以使用 `trainer.strategy.process_group_backend=gloo` 来避免 `nccl` 的问题.
118
+
119
+ 训练结束后, 你可以参考 [推理](inference.md) 部分, 并携带 `--speaker SPK1` 参数来测试你的模型.
120
+
121
+ !!! info
122
+ 默认配置下, 基本只会学到说话人的发音方式, 而不包含音色, 你依然需要使用 prompt 来保证音色的稳定性.
123
+ 如果你想要学到音色, 请将训练步数调大, 但这有可能会导致过拟合.
124
+
125
+ 训练完成后, 你需要先将 loRA 的权重转为普通权重, 然后再进行推理.
126
+
127
+ ```bash
128
+ python tools/llama/merge_lora.py \
129
+ --lora-config r_8_alpha_16 \
130
+ --base-weight checkpoints/fish-speech-1.4 \
131
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
132
+ --output checkpoints/fish-speech-1.4-yth-lora/
133
+ ```
134
+
135
+ !!! note
136
+ 你也可以尝试其他的 checkpoint, 我们建议你使用最早的满足你要求的 checkpoint, 他们通常在 OOD 上表现更好.
docs/zh/index.md ADDED
@@ -0,0 +1,191 @@
1
+ # 介绍
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/fishaudio/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/fishaudio/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ </div>
14
+
15
+ !!! warning
16
+ 我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规. <br/>
17
+ 此代码库与所有模型根据 CC-BY-NC-SA-4.0 许可证发布.
18
+
19
+ <p align="center">
20
+ <img src="../assets/figs/diagram.png" width="75%">
21
+ </p>
22
+
23
+ ## 要求
24
+
25
+ - GPU 内存: 4GB (用于推理), 8GB (用于微调)
26
+ - 系统: Linux, Windows
27
+
28
+ ## Windows 配置
29
+
30
+ Windows 专业用户可以考虑 WSL2 或 docker 来运行代码库。
31
+
32
+ ```bash
33
+ # 创建一个 python 3.10 虚拟环境, 你也可以用 virtualenv
34
+ conda create -n fish-speech python=3.10
35
+ conda activate fish-speech
36
+
37
+ # 安装 pytorch
38
+ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
39
+
40
+ # 安装 fish-speech
41
+ pip3 install -e .
42
+
43
+ # (开启编译加速) 安装 triton-windows
44
+ pip install https://github.com/AnyaCoder/fish-speech/releases/download/v0.1.0/triton_windows-0.1.0-py3-none-any.whl
45
+ ```
46
+
47
+ Windows 非专业用户可考虑以下无需 Linux 环境的基础运行方法(附带模型编译功能,即 `torch.compile`):
48
+
49
+ 1. 解压项目压缩包。
50
+ 2. 点击 `install_env.bat` 安装环境。
51
+ 3. 若需要开启编译加速则执行这一步:
52
+ 1. 使用如下链接下载 LLVM 编译器。
53
+ - [LLVM-17.0.6(原站站点下载)](https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
54
+ - [LLVM-17.0.6(镜像站点下载)](https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
55
+ - 下载完 `LLVM-17.0.6-win64.exe` 后,双击进行安装,选择合适的安装位置,最重要的是勾选 `Add Path to Current User` 添加环境变量。
56
+ - 确认安装完成。
57
+ 2. 下载安装 Microsoft Visual C++ 可再发行程序包,解决潜在 .dll 丢失问题。
58
+ - [MSVC++ 14.40.33810.0 下载](https://aka.ms/vs/17/release/vc_redist.x64.exe)
59
+ 3. 下载安装 Visual Studio 社区版以获取 MSVC++ 编译工具, 解决 LLVM 的头文件依赖问题。
60
+ - [Visual Studio 下载](https://visualstudio.microsoft.com/zh-hans/downloads/)
61
+ - 安装好 Visual Studio Installer 之后,下载 Visual Studio Community 2022
62
+ - 如下图点击`修改`按钮,找到`使用C++的桌面开发`项,勾选下载
63
+ 4. 下载安装 [CUDA Toolkit 12.x](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64)
64
+ 4. 双击 `start.bat` 打开训练推理 WebUI 管理界面. 如有需要,可按照下列提示修改 `API_FLAGS`.
65
+
66
+ !!! info "可选"
67
+
68
+ 想启动 推理 WebUI 界面?编辑项目根目录下的 `API_FLAGS.txt`, 前三行修改成如下格式:
69
+ ```
70
+ --infer
71
+ # --api
72
+ # --listen ...
73
+ ...
74
+ ```
75
+
76
+ !!! info "可选"
77
+
78
+ 想启动 API 服务器?编辑项目根目录下的 `API_FLAGS.txt`, 前三行修改成如下格式:
79
+ ```
80
+ # --infer
81
+ --api
82
+ --listen ...
83
+ ...
84
+ ```
85
+
86
+ !!! info "可选"
87
+
88
+ 双击 `run_cmd.bat` 进入本项目的 conda/python 命令行环境
89
+
90
+ ## Linux 配置
91
+
92
+ ```bash
93
+ # 创建一个 python 3.10 虚拟环境, 你也可以用 virtualenv
94
+ conda create -n fish-speech python=3.10
95
+ conda activate fish-speech
96
+
97
+ # 安装 pytorch
98
+ pip3 install torch torchvision torchaudio
99
+
100
+ # 安装 fish-speech
101
+ pip3 install -e .[stable]
102
+
103
+ # (Ubuntu / Debian 用户) 安装 sox
104
+ apt install libsox-dev
105
+ ```
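+
+ 安装完成后, 可以用下面的一行命令粗略检查 PyTorch 是否安装成功以及 GPU 是否可用 (仅作示意):
+
+ ```bash
+ python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+ ```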
106
+
107
+ ## Docker 配置
108
+
109
+ 1. 安装 NVIDIA Container Toolkit:
110
+
111
+ 如果需要在 Docker 中使用 GPU 进行模型训练和推理,需要安装 NVIDIA Container Toolkit:
112
+
113
+ 对于 Ubuntu 用户:
114
+
115
+ ```bash
116
+ # 添加远程仓库
117
+ curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
118
+ && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
119
+ sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
120
+ sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
121
+ # 安装 nvidia-container-toolkit
122
+ sudo apt-get update
123
+ sudo apt-get install -y nvidia-container-toolkit
124
+ # 重启 Docker 服务
125
+ sudo systemctl restart docker
126
+ ```
127
+
128
+ 对于使用其他 Linux 发行版的用户,安装指南请参考:[NVIDIA Container Toolkit Install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)。
129
+
130
+ 注:对于中国大陆的用户,您可能需要使用代理来完成相关工具的安装。
131
+
132
+ 2. 拉取并运行 fish-speech 镜像
133
+
134
+ ```shell
135
+ # 拉取镜像
136
+ docker pull fishaudio/fish-speech
137
+ # 运行镜像
138
+ docker run -it \
139
+ --name fish-speech \
140
+ --gpus all \
141
+ -p 7860:7860 \
142
+ fishaudio/fish-speech \
143
+ zsh
144
+ # 如果需要使用其他端口,请修改 -p 参数为 YourPort:7860
145
+ ```
146
+
147
+ 3. 下载模型依赖
148
+
149
+ 确保您处于 docker 容器内的终端,然后从我们的 huggingface 仓库下载所需的 `vqgan` 和 `llama` 模型。
150
+
151
+ ```bash
152
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
153
+ ```
154
+
155
+ 对于中国大陆用户,可以通过镜像站下载。
156
+
157
+ ```bash
158
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
159
+ ```
160
+
161
+ 4. 配置环境变量,访问 WebUI
162
+
163
+ 在 docker 容器内的终端,输入 `export GRADIO_SERVER_NAME="0.0.0.0"`,从而让外部可以访问 docker 内的 gradio 服务。
164
+ 接着在 docker 容器内的终端,输入 `python tools/webui.py` 即可开启 WebUI 服务(这两条命令的完整写法见本节末尾的示例)。
165
+
166
+ 如果是 WSL 或者是 macOS,访问 [http://localhost:7860](http://localhost:7860) 即可打开 WebUI 界面。
167
+
168
+ 如果是部署在服务器上,更换 localhost 为您的服务器 ip 即可。
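+
+ 也就是说, 第 4 步在容器内需要执行的两条命令合起来如下 (端口与上文 `-p 7860:7860` 的映射保持一致, 仅作示意):
+
+ ```bash
+ export GRADIO_SERVER_NAME="0.0.0.0"
+ python tools/webui.py
+ ```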
169
+
170
+ ## 更新日志
171
+
172
+ - 2024/09/10: 更新了 Fish-Speech 到 1.4, 增加了数据集大小, quantizer n_groups 4 -> 8.
173
+ - 2024/07/02: 更新了 Fish-Speech 到 1.2 版本,移除 VITS Decoder,同时极大幅度提升 zero-shot 能力.
174
+ - 2024/05/10: 更新了 Fish-Speech 到 1.1 版本,引入了 VITS Decoder 来降低口胡和提高音色相似度.
175
+ - 2024/04/22: 完成了 Fish-Speech 1.0 版本, 大幅修改了 VQGAN 和 LLAMA 模型.
176
+ - 2023/12/28: 添加了 `lora` 微调支持.
177
+ - 2023/12/27: 添加了 `gradient checkpointing`, `causal sampling` 和 `flash-attn` 支持.
178
+ - 2023/12/19: 更新了 Webui 和 HTTP API.
179
+ - 2023/12/18: 更新了微调文档和相关例子.
180
+ - 2023/12/17: 更新了 `text2semantic` 模型, 支持无音素模式.
181
+ - 2023/12/13: 测试版发布, 包含 VQGAN 模型和一个基于 LLAMA 的语言模型 (只支持音素).
182
+
183
+ ## 致谢
184
+
185
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
186
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
187
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
188
+ - [MQTTS](https://github.com/b04901014/MQTTS)
189
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
190
+ - [Transformers](https://github.com/huggingface/transformers)
191
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/zh/inference.md ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 推理
2
+
3
+ 推理支持命令行, HTTP API 以及 WebUI 三种方式.
4
+
5
+ !!! note
6
+ 总的来说, 推理分为几个部分:
7
+
8
+ 1. 给定一段 ~10 秒的语音, 将它用 VQGAN 编码.
9
+ 2. 将编码后的语义 token 和对应文本输入语言模型作为例子.
10
+ 3. 给定一段新文本, 让模型生成对应的语义 token.
11
+ 4. 将生成的语义 token 输入 VQGAN 解码, 生成对应的语音.
12
+
13
+ ## 命令行推理
14
+
15
+ 从我们的 huggingface 仓库下载所需的 `vqgan` 和 `llama` 模型。
16
+
17
+ ```bash
18
+ huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
19
+ ```
20
+
21
+ 对于中国大陆用户,可使用 mirror 下载。
22
+
23
+ ```bash
24
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.4 --local-dir checkpoints/fish-speech-1.4
25
+ ```
26
+
27
+ ### 1. 从语音生成 prompt:
28
+
29
+ !!! note
30
+ 如果你打算让模型随机选择音色, 你可以跳过这一步.
31
+
32
+ ```bash
33
+ python tools/vqgan/inference.py \
34
+ -i "paimon.wav" \
35
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
36
+ ```
37
+
38
+ 你应该能得到一个 `fake.npy` 文件.
39
+
40
+ ### 2. 从文本生成语义 token:
41
+
42
+ ```bash
43
+ python tools/llama/generate.py \
44
+ --text "要转换的文本" \
45
+ --prompt-text "你的参考文本" \
46
+ --prompt-tokens "fake.npy" \
47
+ --checkpoint-path "checkpoints/fish-speech-1.4" \
48
+ --num-samples 2 \
49
+ --compile
50
+ ```
51
+
52
+ 该命令会在工作目录下创建 `codes_N` 文件, 其中 N 是从 0 开始的整数.
53
+
54
+ !!! note
55
+ 您可能希望使用 `--compile` 来融合 cuda 内核以实现更快的推理 (~30 个 token/秒 -> ~500 个 token/秒).
56
+ 对应的, 如果你不打算使用加速, 你可以注释掉 `--compile` 参数.
57
+
58
+ !!! info
59
+ 对于不支持 bf16 的 GPU, 你可能需要使用 `--half` 参数.
60
+
61
+ ### 3. 从语义 token 生成人声:
62
+
63
+ #### VQGAN 解码
64
+
65
+ ```bash
66
+ python tools/vqgan/inference.py \
67
+ -i "codes_0.npy" \
68
+ --checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
69
+ ```
70
+
71
+ ## HTTP API 推理
72
+
73
+ 运行以下命令来启动 HTTP 服务:
74
+
75
+ ```bash
76
+ python -m tools.api \
77
+ --listen 0.0.0.0:8080 \
78
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
79
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
80
+ --decoder-config-name firefly_gan_vq
81
+ ```
82
+ 如果你想要加速推理,可以加上`--compile`参数。
83
+
84
+ 推荐中国大陆用户运行以下命令来启动 HTTP 服务:
85
+ ```bash
86
+ HF_ENDPOINT=https://hf-mirror.com python -m ...(同上)
87
+ ```
88
+
89
+ 随后, 你可以在 `http://127.0.0.1:8080/` 中查看并测试 API.
90
+
91
+ 下面是使用`tools/post_api.py`发送请求的示例。
92
+
93
+ ```bash
94
+ python -m tools.post_api \
95
+ --text "要输入的文本" \
96
+ --reference_audio "参考音频路径" \
97
+ --reference_text "参考音频的文本内容" \
98
+ --streaming True
99
+ ```
100
+
101
+ 上面的命令表示按照参考音频的信息,合成所需的音频并流式返回.
102
+
103
+ 下面的示例展示了如何一次使用**多个** `参考音频路径` 和 `参考音频的文本内容`,在命令里用空格隔开即可。
104
+
105
+ ```bash
106
+ python -m tools.post_api \
107
+ --text "要输入的文本" \
108
+ --reference_audio "参考音频路径1" "参考音频路径2" \
109
+ --reference_text "参考音频的文本内容1" "参考音频的文本内容2"\
110
+ --streaming False \
111
+ --output "generated" \
112
+ --format "mp3"
113
+ ```
114
+
115
+ 上面的命令表示按照多个参考音频的信息,合成所需的`MP3`格式音频,并保存为当前目录的`generated.mp3`文件。
116
+
117
+ ## GUI 推理
118
+ [下载客户端](https://github.com/AnyaCoder/fish-speech-gui/releases/tag/v0.1.0)
119
+
120
+ ## WebUI 推理
121
+
122
+ 你可以使用以下命令来启动 WebUI:
123
+
124
+ ```bash
125
+ python -m tools.webui \
126
+ --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
127
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
128
+ --decoder-config-name firefly_gan_vq
129
+ ```
130
+
131
+ !!! note
132
+ 你可以使用 Gradio 环境变量, 如 `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` 来配置 WebUI.
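+
+ 例如, 下面的写法 (仅作示意) 通过环境变量指定监听地址与端口:
+
+ ```bash
+ GRADIO_SERVER_NAME="0.0.0.0" GRADIO_SERVER_PORT=7860 python -m tools.webui \
+     --llama-checkpoint-path "checkpoints/fish-speech-1.4" \
+     --decoder-checkpoint-path "checkpoints/fish-speech-1.4/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
+     --decoder-config-name firefly_gan_vq
+ ```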
133
+
134
+ 祝大家玩得开心!
docs/zh/samples.md ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 例子
2
+
3
+ v1.2 的样本可以在 [Bilibili](https://www.bilibili.com/video/BV1wz421B71D/) 观看。
4
+
5
+ 以下样本来自 v1.1 版本的模型。
6
+
7
+ ## 中文句子 1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>说话人</th>
17
+ <th>输入音频</th>
18
+ <th>合成音频</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>纳西妲 (原神)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>钟离 (原神)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>芙宁娜 (原神)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>随机说话人 1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>随机说话人 2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## 中文句子 2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>说话人</th>
64
+ <th>输入音频</th>
65
+ <th>合成音频</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>纳西妲 (原神)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>随机说话人</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## 中文句子 3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>说话人</th>
95
+ <th>输入音频</th>
96
+ <th>合成音频</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>随机说话人</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## 英文句子 1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>说话人</th>
121
+ <th>输入音频</th>
122
+ <th>合成音频</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>随机说话人 1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>随机说话人 2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## 英文句子 2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>说话人</th>
154
+ <th>输入音频</th>
155
+ <th>合成音频</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>随机说话人</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## 日文句子 1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>说话人</th>
180
+ <th>输入音频</th>
181
+ <th>合成音频</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>随机说话人 1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>随机说话人 2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## 日文句子 2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>说话人</th>
212
+ <th>输入音频</th>
213
+ <th>合成音频</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>随机说话人</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
entrypoint.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ CUDA_ENABLED=${CUDA_ENABLED:-true}
4
+ DEVICE=""
5
+
6
+ if [ "${CUDA_ENABLED}" != "true" ]; then
7
+ DEVICE="--device cpu"
8
+ fi
9
+
10
+ exec python tools/webui.py ${DEVICE}
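+
+ # CUDA_ENABLED controls whether the WebUI starts in CPU mode;
+ # for example, on a machine without a GPU: CUDA_ENABLED=false ./entrypoint.sh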
fish_speech/callbacks/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .grad_norm import GradNormMonitor
2
+
3
+ __all__ = ["GradNormMonitor"]
fish_speech/callbacks/grad_norm.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import lightning.pytorch as pl
4
+ import torch
5
+ from lightning import LightningModule, Trainer
6
+ from lightning.pytorch.callbacks import Callback
7
+ from torch import Tensor, nn
8
+ from torch.utils._foreach_utils import (
9
+ _group_tensors_by_device_and_dtype,
10
+ _has_foreach_support,
11
+ )
12
+
13
+
14
+ @torch.no_grad()
15
+ def grad_norm(
16
+ parameters: Union[Tensor, list[Tensor]],
17
+ norm_type: float = 2.0,
18
+ ) -> float:
19
+ """
20
+ Returns the norm of the gradients of the given parameters.
21
+
22
+ Args:
23
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
24
+ single Tensor that will have gradients normalized
25
+ norm_type (float): type of the used p-norm.
26
+
27
+ Returns:
28
+ Total norm of the parameter gradients (viewed as a single vector).
29
+ """ # noqa: E501
30
+
31
+ if isinstance(parameters, Tensor):
32
+ parameters = [parameters]
33
+
34
+ grads = [p.grad for p in parameters if p.grad is not None]
35
+ if len(grads) == 0:
36
+ return None
37
+
38
+ first_device = grads[0].device
39
+ grouped_grads: dict[
40
+ tuple[torch.device, torch.dtype], list[list[Tensor]]
41
+ ] = _group_tensors_by_device_and_dtype(
42
+ [[g.detach() for g in grads]]
43
+ ) # type: ignore[assignment]
44
+
45
+ norms = []
46
+ for (device, _), ([grads], _) in grouped_grads.items():
47
+ if _has_foreach_support(grads, device=device):
48
+ norms.extend(torch._foreach_norm(grads, norm_type))
49
+ else:
50
+ norms.extend([torch.norm(g, norm_type) for g in grads])
51
+
52
+ return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
53
+
54
+
55
+ class GradNormMonitor(Callback):
56
+ """
57
+ Callback that computes the gradient norm of the model parameters.
58
+ """
59
+
60
+ def __init__(
61
+ self,
62
+ norm_type: float = 2.0,
63
+ logging_interval: str = "step",
64
+ sub_module: Optional[Union[str, list[str]]] = None,
65
+ ) -> None:
66
+ """
67
+ Args:
68
+ norm_type (float): type of the used p-norm.
69
+ logging_interval (str): "step" or "epoch".
70
+ """
71
+ super().__init__()
72
+
73
+ self.norm_type = norm_type
74
+ self.logging_interval = logging_interval
75
+ self.sub_module = sub_module
76
+
77
+ def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
78
+ """
79
+ Computes the gradient norm of the model parameters and logs it to the logger.
80
+
81
+ Args:
82
+ trainer (Trainer): The trainer object
83
+ model (LightningModule): The current lightningModule
84
+ """
85
+
86
+ lightning_model = model
87
+
88
+ if self.sub_module is None:
89
+ return self.log_sub_module_grad_norm(lightning_model, model, "")
90
+
91
+ sub_modules = self.sub_module
92
+ if isinstance(sub_modules, str):
93
+ sub_modules = [sub_modules]
94
+
95
+ for sub_module in sub_modules:
96
+ self.log_sub_module_grad_norm(
97
+ lightning_model, getattr(model, sub_module), f"/{sub_module}"
98
+ )
99
+
100
+ def log_sub_module_grad_norm(
101
+ self, lightning_model: LightningModule, model: nn.Module, path: str
102
+ ) -> None:
103
+ grad_norm_val = grad_norm(model.parameters(), self.norm_type)
104
+ if grad_norm_val is None:
105
+ return
106
+
107
+ on_step = self.logging_interval == "step"
108
+ lightning_model.log(
109
+ f"train{path}/grad_norm",
110
+ grad_norm_val,
111
+ on_step=on_step,
112
+ on_epoch=not on_step,
113
+ )
fish_speech/configs/base.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Base configuration for training a model
2
+ paths:
3
+ run_dir: results/${project}
4
+ ckpt_dir: ${paths.run_dir}/checkpoints
5
+
6
+ hydra:
7
+ run:
8
+ dir: ${paths.run_dir}
9
+
10
+ # Lightning Trainer
11
+ trainer:
12
+ _target_: lightning.pytorch.trainer.Trainer
13
+
14
+ default_root_dir: ${paths.run_dir}
15
+ accelerator: gpu
16
+ num_nodes: 1
17
+ devices: auto
18
+ strategy:
19
+ _target_: lightning.pytorch.strategies.DDPStrategy
20
+ process_group_backend: nccl # This should be overridden when training on Windows
21
+
22
+ precision: bf16-mixed
23
+
24
+ # disable validation by epoch end
25
+ check_val_every_n_epoch: null
26
+ val_check_interval: 5000
27
+ max_steps: 100_000
28
+
29
+ # Use torch.backends.cudnn.benchmark to speed up training
30
+ benchmark: true
31
+
32
+ # Callbacks
33
+ callbacks:
34
+ model_checkpoint:
35
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
36
+ dirpath: ${paths.ckpt_dir}
37
+ filename: "step_{step:09d}"
38
+ save_last: false # set to true to additionally save an exact copy of the last checkpoint to last.ckpt
39
+ save_top_k: 5 # save 5 latest checkpoints
40
+ monitor: step # use step to monitor checkpoints
41
+ mode: max # save the latest checkpoint with the highest global_step
42
+ every_n_epochs: null # don't save checkpoints by epoch end
43
+ every_n_train_steps: 5000 # save checkpoints every 5000 steps
44
+ auto_insert_metric_name: false
45
+
46
+ model_summary:
47
+ _target_: lightning.pytorch.callbacks.ModelSummary
48
+ max_depth: 2 # the maximum depth of layer nesting that the summary will include
49
+
50
+ learning_rate_monitor:
51
+ _target_: lightning.pytorch.callbacks.LearningRateMonitor
52
+ logging_interval: step
53
+ log_momentum: false
54
+
55
+ grad_norm_monitor:
56
+ _target_: fish_speech.callbacks.GradNormMonitor
57
+ norm_type: 2
58
+ logging_interval: step
59
+
60
+ # Logger
61
+ logger:
62
+ tensorboard:
63
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
64
+ save_dir: "${paths.run_dir}/tensorboard/"
65
+ name: null
66
+ log_graph: false
67
+ default_hp_metric: true
68
+ prefix: ""
69
+
70
+ # wandb:
71
+ # _target_: lightning.pytorch.loggers.wandb.WandbLogger
72
+ # # name: "" # name of the run (normally generated by wandb)
73
+ # save_dir: "${paths.run_dir}"
74
+ # offline: False
75
+ # id: null # pass correct id to resume experiment!
76
+ # anonymous: null # enable anonymous logging
77
+ # project: "fish-speech"
78
+ # log_model: False # upload lightning ckpts
79
+ # prefix: "" # a string to put at the beginning of metric keys
80
+ # # entity: "" # set to name of your wandb team
81
+ # group: ""
82
+ # tags: ["vq", "hq", "finetune"]
83
+ # job_type: ""
84
+
85
+ # Loop
86
+ train: true
87
+ test: false
fish_speech/configs/firefly_gan_vq.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
2
+ spec_transform:
3
+ _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
4
+ sample_rate: 44100
5
+ n_mels: 160
6
+ n_fft: 2048
7
+ hop_length: 512
8
+ win_length: 2048
9
+ backbone:
10
+ _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
11
+ input_channels: 160
12
+ depths: [3, 3, 9, 3]
13
+ dims: [128, 256, 384, 512]
14
+ drop_path_rate: 0.2
15
+ kernel_size: 7
16
+ head:
17
+ _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
18
+ hop_length: 512
19
+ upsample_rates: [8, 8, 2, 2, 2] # aka. strides
20
+ upsample_kernel_sizes: [16, 16, 4, 4, 4]
21
+ resblock_kernel_sizes: [3, 7, 11]
22
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
23
+ num_mels: 512
24
+ upsample_initial_channel: 512
25
+ pre_conv_kernel_size: 13
26
+ post_conv_kernel_size: 13
27
+ quantizer:
28
+ _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
29
+ input_dim: 512
30
+ n_groups: 8
31
+ n_codebooks: 1
32
+ levels: [8, 5, 5, 5]
33
+ downsample_factor: [2, 2]
fish_speech/configs/lora/r_8_alpha_16.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ _target_: fish_speech.models.text2semantic.lora.LoraConfig
2
+ r: 8
3
+ lora_alpha: 16
4
+ lora_dropout: 0.01
fish_speech/configs/text2semantic_finetune.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - base
3
+ - _self_
4
+
5
+ project: text2semantic_finetune_dual_ar
6
+ max_length: 4096
7
+ pretrained_ckpt_path: checkpoints/fish-speech-1.4
8
+
9
+ # Lightning Trainer
10
+ trainer:
11
+ accumulate_grad_batches: 1
12
+ gradient_clip_val: 1.0
13
+ gradient_clip_algorithm: "norm"
14
+ max_steps: 1000
15
+ precision: bf16-true
16
+ limit_val_batches: 10
17
+ val_check_interval: 100
18
+
19
+ # Dataset Configuration
20
+ tokenizer:
21
+ _target_: transformers.AutoTokenizer.from_pretrained
22
+ pretrained_model_name_or_path: ${pretrained_ckpt_path}
23
+
24
+ # Dataset Configuration
25
+ train_dataset:
26
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
27
+ proto_files:
28
+ - data/protos
29
+ tokenizer: ${tokenizer}
30
+ causal: true
31
+ max_length: ${max_length}
32
+ use_speaker: false
33
+ interactive_prob: 0.7
34
+
35
+ val_dataset:
36
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
37
+ proto_files:
38
+ - data/protos
39
+ tokenizer: ${tokenizer}
40
+ causal: true
41
+ max_length: ${max_length}
42
+ use_speaker: false
43
+ interactive_prob: 0.7
44
+
45
+ data:
46
+ _target_: fish_speech.datasets.semantic.SemanticDataModule
47
+ train_dataset: ${train_dataset}
48
+ val_dataset: ${val_dataset}
49
+ num_workers: 4
50
+ batch_size: 8
51
+ tokenizer: ${tokenizer}
52
+ max_length: ${max_length}
53
+
54
+ # Model Configuration
55
+ model:
56
+ _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
57
+ model:
58
+ _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
59
+ path: ${pretrained_ckpt_path}
60
+ load_weights: true
61
+ max_length: ${max_length}
62
+ lora_config: null
63
+
64
+ optimizer:
65
+ _target_: torch.optim.AdamW
66
+ _partial_: true
67
+ lr: 1e-4
68
+ weight_decay: 0
69
+ betas: [0.9, 0.95]
70
+ eps: 1e-5
71
+
72
+ lr_scheduler:
73
+ _target_: torch.optim.lr_scheduler.LambdaLR
74
+ _partial_: true
75
+ lr_lambda:
76
+ _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
77
+ _partial_: true
78
+ num_warmup_steps: 10
79
+
80
+ # Callbacks
81
+ callbacks:
82
+ model_checkpoint:
83
+ every_n_train_steps: ${trainer.val_check_interval}
fish_speech/conversation.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ SEMANTIC_TOKEN = "<|semantic|>"
2
+ CODEBOOK_PAD_TOKEN_ID = 0
fish_speech/datasets/concat_repeat.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bisect
2
+ import random
3
+ from typing import Iterable
4
+
5
+ from torch.utils.data import Dataset, IterableDataset
6
+
7
+
8
+ class ConcatRepeatDataset(Dataset):
9
+ datasets: list[Dataset]
10
+ cumulative_sizes: list[int]
11
+ repeats: list[int]
12
+
13
+ @staticmethod
14
+ def cumsum(sequence, repeats):
15
+ r, s = [], 0
16
+ for dataset, repeat in zip(sequence, repeats):
17
+ l = len(dataset) * repeat
18
+ r.append(l + s)
19
+ s += l
20
+ return r
21
+
22
+ def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
23
+ super().__init__()
24
+
25
+ self.datasets = list(datasets)
26
+ self.repeats = repeats
27
+
28
+ assert len(self.datasets) > 0, "datasets should not be an empty iterable"
29
+ assert len(self.datasets) == len(
30
+ repeats
31
+ ), "datasets and repeats should have the same length"
32
+
33
+ for d in self.datasets:
34
+ assert not isinstance(
35
+ d, IterableDataset
36
+ ), "ConcatRepeatDataset does not support IterableDataset"
37
+
38
+ self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
39
+
40
+ def __len__(self):
41
+ return self.cumulative_sizes[-1]
42
+
43
+ def __getitem__(self, idx):
44
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
45
+
46
+ if dataset_idx == 0:
47
+ sample_idx = idx
48
+ else:
49
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
50
+
51
+ dataset = self.datasets[dataset_idx]
52
+
53
+ return dataset[sample_idx % len(dataset)]
fish_speech/datasets/protos/text-data.proto ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ syntax = "proto3";
2
+
3
+ package text_data;
4
+
5
+ message Semantics {
6
+ repeated uint32 values = 1;
7
+ }
8
+
9
+ message Sentence {
10
+ repeated string texts = 1;
11
+ repeated Semantics semantics = 3;
12
+ }
13
+
14
+ message TextData {
15
+ string source = 1;
16
+ string name = 2;
17
+ repeated Sentence sentences = 4;
18
+ }
19
+
20
+ message SampledData {
21
+ string source = 1;
22
+ string name = 2;
23
+ repeated Sentence samples = 3;
24
+ }