Upload folder using huggingface_hub
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- .gitattributes +10 -0
- .github/ISSUE_TEMPLATE/bug_report.yml +50 -0
- .github/ISSUE_TEMPLATE/config.yml +1 -0
- .github/ISSUE_TEMPLATE/feature_request.yml +62 -0
- .github/ISSUE_TEMPLATE/help_wanted.yml +54 -0
- .github/ISSUE_TEMPLATE/question.yml +26 -0
- .github/workflows/pre-commit.yaml +14 -0
- .github/workflows/publish-docker-image.yaml +60 -0
- .github/workflows/publish-pypi.yaml +66 -0
- .gitignore +171 -0
- .gitmodules +3 -0
- .pre-commit-config.yaml +17 -0
- Dockerfile +30 -0
- F5-TTS/.github/ISSUE_TEMPLATE/bug_report.yml +50 -0
- F5-TTS/.github/ISSUE_TEMPLATE/config.yml +1 -0
- F5-TTS/.github/ISSUE_TEMPLATE/feature_request.yml +62 -0
- F5-TTS/.github/ISSUE_TEMPLATE/help_wanted.yml +54 -0
- F5-TTS/.github/ISSUE_TEMPLATE/question.yml +26 -0
- F5-TTS/.github/workflows/pre-commit.yaml +14 -0
- F5-TTS/.github/workflows/publish-docker-image.yaml +60 -0
- F5-TTS/.github/workflows/publish-pypi.yaml +66 -0
- F5-TTS/.gitignore +171 -0
- F5-TTS/.gitmodules +3 -0
- F5-TTS/.pre-commit-config.yaml +17 -0
- F5-TTS/Dockerfile +30 -0
- F5-TTS/LICENSE +21 -0
- F5-TTS/README.md +262 -0
- F5-TTS/pyproject.toml +64 -0
- F5-TTS/ruff.toml +10 -0
- F5-TTS/src/f5_tts/api.py +164 -0
- F5-TTS/src/f5_tts/configs/E2TTS_Base.yaml +49 -0
- F5-TTS/src/f5_tts/configs/E2TTS_Small.yaml +49 -0
- F5-TTS/src/f5_tts/configs/F5TTS_Base.yaml +54 -0
- F5-TTS/src/f5_tts/configs/F5TTS_Small.yaml +54 -0
- F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml +55 -0
- F5-TTS/src/f5_tts/eval/README.md +52 -0
- F5-TTS/src/f5_tts/eval/ecapa_tdnn.py +331 -0
- F5-TTS/src/f5_tts/eval/eval_infer_batch.py +210 -0
- F5-TTS/src/f5_tts/eval/eval_infer_batch.sh +18 -0
- F5-TTS/src/f5_tts/eval/eval_librispeech_test_clean.py +89 -0
- F5-TTS/src/f5_tts/eval/eval_seedtts_testset.py +88 -0
- F5-TTS/src/f5_tts/eval/eval_utmos.py +42 -0
- F5-TTS/src/f5_tts/eval/utils_eval.py +419 -0
- F5-TTS/src/f5_tts/infer/README.md +177 -0
- F5-TTS/src/f5_tts/infer/SHARED.md +193 -0
- F5-TTS/src/f5_tts/infer/examples/basic/basic.toml +11 -0
- F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav +3 -0
- F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav +3 -0
- F5-TTS/src/f5_tts/infer/examples/multi/country.flac +3 -0
- F5-TTS/src/f5_tts/infer/examples/multi/main.flac +3 -0
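Per the commit title, this folder was pushed with the `huggingface_hub` client. A minimal sketch of such an upload, assuming a hypothetical target repo id and local path (neither is shown in this view):

```python
from huggingface_hub import HfApi

api = HfApi()  # picks up the token from `huggingface-cli login`

# Hypothetical repo id and local folder; substitute your own values.
api.upload_folder(
    folder_path=".",
    repo_id="your-username/F5-TTS",
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```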
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
+F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
+F5-TTS/src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
+F5-TTS/src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
+F5-TTS/src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
+src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
+src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
+src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
+src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
+src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
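The added rules keep the example audio under the Git LFS filter, so clones receive pointer files that resolve to the real audio. A sketch of fetching one of these LFS-tracked files straight from the Hub, assuming a hypothetical repo id (not shown in this view):

```python
from huggingface_hub import hf_hub_download

# Hypothetical repo id; the filename is one of the LFS-tracked paths above.
wav_path = hf_hub_download(
    repo_id="your-username/F5-TTS",
    filename="src/f5_tts/infer/examples/basic/basic_ref_en.wav",
)
print(wav_path)  # local cache path with the resolved audio bytes
```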
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,50 @@
+name: "Bug Report"
+description: |
+  Please provide as much detail as possible to help address the issue efficiently, including input, output, logs and screenshots.
+labels:
+  - bug
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To ensure timely help, please confirm the following:"
+      options:
+        - label: This template is only for bug reports; usage problems go with 'Help Wanted'.
+          required: true
+        - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
+          required: true
+        - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: Environment Details
+      description: "Provide details including OS, GPU info, Python version, any relevant software or dependencies, and trainer settings."
+      placeholder: e.g., CentOS Linux 7, 4 * RTX 3090, Python 3.10, torch==2.3.0+cu118, cuda 11.8, config yaml is ...
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Steps to Reproduce
+      description: |
+        Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
+      placeholder: |
+        1. Create a new conda environment.
+        2. Clone the repository, install it as a local editable package, and set it up properly.
+        3. Run the command: `accelerate launch src/f5_tts/train/train.py`.
+        4. Got the following error message... (attach logs).
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: ✔️ Expected Behavior
+      placeholder: Describe in detail what you expected to happen.
+    validations:
+      required: false
+  - type: textarea
+    attributes:
+      label: ❌ Actual Behavior
+      placeholder: Describe in detail what actually happened.
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
+blank_issues_enabled: false
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,62 @@
+name: "Feature Request"
+description: |
+  Constructive suggestions and new ideas regarding the current repo.
+labels:
+  - enhancement
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To help us grasp quickly, please confirm the following:"
+      options:
+        - label: This template is only for feature requests.
+          required: true
+        - label: I have thoroughly reviewed the project documentation but couldn't find any relevant information that meets my needs.
+          required: true
+        - label: I have searched for existing issues, including closed ones, and found no discussion yet.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: 1. Is this request related to a challenge you're experiencing? Tell us your story.
+      description: |
+        Describe the specific problem or scenario you're facing in detail. For example:
+        *"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because...."*
+      placeholder: Please describe the situation in as much detail as possible.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: 2. What is your suggested solution?
+      description: |
+        Provide a clear description of the feature or enhancement you'd like to propose.
+        How would this feature solve your issue or improve the project?
+      placeholder: Describe your idea or proposed solution here.
+    validations:
+      required: true
+
+  - type: textarea
+    attributes:
+      label: 3. Additional context or comments
+      description: |
+        Any other relevant information, links, documents, or screenshots that provide clarity.
+        Use this section for anything not covered above.
+      placeholder: Add any extra details here.
+    validations:
+      required: false
+
+  - type: checkboxes
+    attributes:
+      label: 4. Can you help us with this feature?
+      description: |
+        Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.
+      options:
+        - label: I am interested in contributing to this feature.
+          required: false
+
+  - type: markdown
+    attributes:
+      value: |
+        **Note:** Please submit only one request per issue to keep discussions focused and manageable.
.github/ISSUE_TEMPLATE/help_wanted.yml ADDED
@@ -0,0 +1,54 @@
+name: "Help Wanted"
+description: |
+  Please provide as much detail as possible to help address the issue efficiently, including input, output, logs and screenshots.
+labels:
+  - help wanted
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To ensure timely help, please confirm the following:"
+      options:
+        - label: This template is only for usage issues encountered.
+          required: true
+        - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
+          required: true
+        - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: Environment Details
+      description: "Provide details such as OS, Python version, and any relevant software or dependencies."
+      placeholder: |
+        e.g., macOS 13.5, Python 3.10, torch==2.3.0, Gradio 4.44.1
+        If training or finetuning related, provide the detailed configuration, including GPU info and training setup.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: Steps to Reproduce
+      description: |
+        Include detailed steps, screenshots, and logs. Provide the prompt wav and text used. Use the correct markdown syntax for code blocks.
+      placeholder: |
+        1. Create a new conda environment.
+        2. Clone the repository and install it as a pip package.
+        3. Run the command: `f5-tts_infer-gradio` with no ref_text provided.
+        4. Stuck there with the following message... (attach logs, and also the error msg, e.g. after ctrl-c).
+        5. Prompt & generated wavs are [change suffix to .mp4 to enable direct upload, or pack all into .zip].
+        6. Reference audio's transcription or provided ref_text is `xxx`, and the text to generate is `xxx`.
+    validations:
+      required: true
+  - type: textarea
+    attributes:
+      label: ✔️ Expected Behavior
+      placeholder: Describe what you expected to happen in detail, e.g. output a generated audio.
+    validations:
+      required: false
+  - type: textarea
+    attributes:
+      label: ❌ Actual Behavior
+      placeholder: Describe what actually happened in detail, failure messages, etc.
+    validations:
+      required: false
.github/ISSUE_TEMPLATE/question.yml ADDED
@@ -0,0 +1,26 @@
+name: "Question"
+description: |
+  Research questions or pure inquiries about the project; usage issues go with "Help Wanted".
+labels:
+  - question
+body:
+  - type: checkboxes
+    attributes:
+      label: Checks
+      description: "To help us grasp quickly, please confirm the following:"
+      options:
+        - label: This template is only for research questions, not usage problems, feature requests or bug reports.
+          required: true
+        - label: I have thoroughly reviewed the project documentation and read the related paper(s).
+          required: true
+        - label: I have searched for existing issues, including closed ones, and found no similar questions.
+          required: true
+        - label: I am using English to submit this issue to facilitate community communication.
+          required: true
+  - type: textarea
+    attributes:
+      label: Question details
+      description: |
+        Question details, clearly stated using proper markdown syntax.
+    validations:
+      required: true
.github/workflows/pre-commit.yaml ADDED
@@ -0,0 +1,14 @@
+name: pre-commit
+
+on:
+  pull_request:
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+      - uses: pre-commit/action@v3.0.1
.github/workflows/publish-docker-image.yaml ADDED
@@ -0,0 +1,60 @@
+name: Create and publish a Docker image
+
+# Configures this workflow to run every time a change is pushed to the branch called `main`.
+on:
+  push:
+    branches: ['main']
+
+# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
+jobs:
+  build-and-push-image:
+    runs-on: ubuntu-latest
+    # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
+    permissions:
+      contents: read
+      packages: write
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # This might remove tools that are actually needed if set to "true", but frees about 6 GB
+          tool-cache: false
+
+          # All of these default to true, but feel free to set to "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: false
+          swap-storage: false
+          docker-images: false
+      # Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
+      - name: Log in to the Container registry
+        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+      # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
+      # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
+      # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
+      - name: Build and push Docker image
+        uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
+        with:
+          context: .
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
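With `IMAGE_NAME` set to `${{ github.repository }}` and `metadata-action` left at its defaults, a push to `main` should publish a branch-tagged image on GHCR. A sketch of pulling it with the Docker SDK for Python; the exact tag is an assumption, since the workflow does not pin one:

```python
import docker  # pip install docker

client = docker.from_env()
# Assumed tag: metadata-action's default branch tag for pushes to main.
image = client.images.pull("ghcr.io/swivid/f5-tts:main")
print(image.tags)
```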
.github/workflows/publish-pypi.yaml ADDED
@@ -0,0 +1,66 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  release-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      - name: Build release distributions
+        run: |
+          # NOTE: put your own distribution build steps here.
+          python -m pip install build
+          python -m build
+
+      - name: Upload distributions
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+  pypi-publish:
+    runs-on: ubuntu-latest
+
+    needs:
+      - release-build
+
+    permissions:
+      # IMPORTANT: this permission is mandatory for trusted publishing
+      id-token: write
+
+    # Dedicated environments with protections for publishing are strongly recommended.
+    environment:
+      name: pypi
+      # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
+      # url: https://pypi.org/p/YOURPROJECT
+
+    steps:
+      - name: Retrieve release distributions
+        uses: actions/download-artifact@v4
+        with:
+          name: release-dists
+          path: dist/
+
+      - name: Publish release distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
.gitignore ADDED
@@ -0,0 +1,171 @@
+# Custom
+.vscode/
+tests/
+runs/
+data/
+ckpts/
+wandb/
+results/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file. For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
.gitmodules ADDED
@@ -0,0 +1,3 @@
+[submodule "src/third_party/BigVGAN"]
+	path = src/third_party/BigVGAN
+	url = https://github.com/NVIDIA/BigVGAN.git
.pre-commit-config.yaml ADDED
@@ -0,0 +1,17 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    # Ruff version.
+    rev: v0.11.2
+    hooks:
+      - id: ruff
+        name: ruff linter
+        args: [--fix]
+      - id: ruff-format
+        name: ruff formatter
+      - id: ruff
+        name: ruff sorter
+        args: [--select, I, --fix]
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-yaml
Dockerfile ADDED
@@ -0,0 +1,30 @@
+FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-devel
+
+USER root
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+LABEL github_repo="https://github.com/SWivid/F5-TTS"
+
+RUN set -x \
+    && apt-get update \
+    && apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim \
+    && apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
+    && apt-get install -y librdmacm1 libibumad3 librdmacm-dev libibverbs1 libibverbs-dev ibverbs-utils ibverbs-providers \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+WORKDIR /workspace
+
+RUN git clone https://github.com/SWivid/F5-TTS.git \
+    && cd F5-TTS \
+    && git submodule update --init --recursive \
+    && pip install -e . --no-cache-dir
+
+ENV SHELL=/bin/bash
+
+VOLUME /root/.cache/huggingface/hub/
+
+EXPOSE 7860
+
+WORKDIR /workspace/F5-TTS
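The image clones the repo, initializes the BigVGAN submodule, and installs the package as editable; the exposed port 7860 suggests the Gradio app is the intended entry point. A hypothetical usage sketch inside the container, assuming an `F5TTS` class and `infer` signature in `src/f5_tts/api.py`, whose body is not shown in this view:

```python
# Hypothetical API; class and argument names are assumptions based on
# src/f5_tts/api.py being listed in this commit, not on its contents.
from f5_tts.api import F5TTS

tts = F5TTS()  # assumed to fetch the default checkpoint on first use
wav, sr, _ = tts.infer(
    ref_file="src/f5_tts/infer/examples/basic/basic_ref_en.wav",
    ref_text="",  # per the Help Wanted template, ref_text may be left empty
    gen_text="Hello from inside the container.",
    file_wave="out.wav",
)
```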
F5-TTS/.github/ISSUE_TEMPLATE/bug_report.yml ADDED (50 lines, identical to .github/ISSUE_TEMPLATE/bug_report.yml above)
F5-TTS/.github/ISSUE_TEMPLATE/config.yml ADDED (1 line, identical to .github/ISSUE_TEMPLATE/config.yml above)
F5-TTS/.github/ISSUE_TEMPLATE/feature_request.yml ADDED (62 lines, identical to .github/ISSUE_TEMPLATE/feature_request.yml above)
F5-TTS/.github/ISSUE_TEMPLATE/help_wanted.yml ADDED (54 lines, identical to .github/ISSUE_TEMPLATE/help_wanted.yml above)
F5-TTS/.github/ISSUE_TEMPLATE/question.yml ADDED (26 lines, identical to .github/ISSUE_TEMPLATE/question.yml above)
F5-TTS/.github/workflows/pre-commit.yaml ADDED (14 lines, identical to .github/workflows/pre-commit.yaml above)
F5-TTS/.github/workflows/publish-docker-image.yaml ADDED (60 lines, identical to .github/workflows/publish-docker-image.yaml above)
F5-TTS/.github/workflows/publish-pypi.yaml ADDED (66 lines, identical to .github/workflows/publish-pypi.yaml above)
F5-TTS/.gitignore ADDED (171 lines, identical to .gitignore above)
F5-TTS/.gitmodules ADDED (3 lines, identical to .gitmodules above)
F5-TTS/.pre-commit-config.yaml ADDED (17 lines, identical to .pre-commit-config.yaml above)
F5-TTS/Dockerfile ADDED (30 lines, identical to Dockerfile above)
F5-TTS/LICENSE ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Yushen CHEN
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
F5-TTS/README.md
ADDED
@@ -0,0 +1,262 @@
+# F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
+
+[](https://github.com/SWivid/F5-TTS)
+[](https://arxiv.org/abs/2410.06885)
+[](https://swivid.github.io/F5-TTS/)
+[](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
+[](https://modelscope.cn/studios/AI-ModelScope/E2-F5-TTS)
+[](https://x-lance.sjtu.edu.cn/)
+[](https://www.sii.edu.cn/)
+[](https://www.pcl.ac.cn)
+<!-- <img src="https://github.com/user-attachments/assets/12d7749c-071a-427c-81bf-b87b91def670" alt="Watermark" style="width: 40px; height: auto"> -->
+
+**F5-TTS**: Diffusion Transformer with ConvNeXt V2; faster training and inference.
+
+**E2 TTS**: Flat-UNet Transformer, the closest reproduction of the [paper](https://arxiv.org/abs/2406.18009).
+
+**Sway Sampling**: Inference-time flow-step sampling strategy that greatly improves performance.
+
+### Thanks to all the contributors!
+
+## News
+- **2025/03/12**: 🔥 F5-TTS v1 base model with better training and inference performance. [A few demos](https://swivid.github.io/F5-TTS_updates).
+- **2024/10/08**: F5-TTS & E2 TTS base models on [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS), [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), [🟣 Wisemodel](https://wisemodel.cn/models/SJTU_X-LANCE/F5-TTS_Emilia-ZH-EN).
+
+## Installation
+
+### Create a separate environment if needed
+
+```bash
+# Create a conda env with python_version>=3.10 (you could also use virtualenv)
+conda create -n f5-tts python=3.11
+conda activate f5-tts
+```
+
+### Install PyTorch matching your device
+
+<details>
+<summary>NVIDIA GPU</summary>
+
+> ```bash
+> # Install pytorch with your CUDA version, e.g.
+> pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124
+> ```
+
+</details>
+
+<details>
+<summary>AMD GPU</summary>
+
+> ```bash
+> # Install pytorch with your ROCm version (Linux only), e.g.
+> pip install torch==2.5.1+rocm6.2 torchaudio==2.5.1+rocm6.2 --extra-index-url https://download.pytorch.org/whl/rocm6.2
+> ```
+
+</details>
+
+<details>
+<summary>Intel GPU</summary>
+
+> ```bash
+> # Install pytorch with your XPU version, e.g.
+> # Intel® Deep Learning Essentials or Intel® oneAPI Base Toolkit must be installed
+> pip install torch torchaudio --index-url https://download.pytorch.org/whl/test/xpu
+>
+> # Intel GPU support is also available through IPEX (Intel® Extension for PyTorch)
+> # IPEX does not require the Intel® Deep Learning Essentials or Intel® oneAPI Base Toolkit
+> # See: https://pytorch-extension.intel.com/installation?request=platform
+> ```
+
+</details>
+
+<details>
+<summary>Apple Silicon</summary>
+
+> ```bash
+> # Install the stable pytorch, e.g.
+> pip install torch torchaudio
+> ```
+
+</details>
+
+### Then choose one of the following:
+
+> ### 1. As a pip package (if just for inference)
+>
+> ```bash
+> pip install f5-tts
+> ```
+>
+> ### 2. Local editable (if you also plan to do training or finetuning)
+>
+> ```bash
+> git clone https://github.com/SWivid/F5-TTS.git
+> cd F5-TTS
+> # git submodule update --init --recursive  # (optional, if using bigvgan as vocoder)
+> pip install -e .
+> ```
+
+### Docker usage is also available
+```bash
+# Build from Dockerfile
+docker build -t f5tts:v1 .
+
+# Run from GitHub Container Registry
+docker container run --rm -it --gpus=all --mount 'type=volume,source=f5-tts,target=/root/.cache/huggingface/hub/' -p 7860:7860 ghcr.io/swivid/f5-tts:main
+
+# Quickstart if you want to just run the web interface (not CLI)
+docker container run --rm -it --gpus=all --mount 'type=volume,source=f5-tts,target=/root/.cache/huggingface/hub/' -p 7860:7860 ghcr.io/swivid/f5-tts:main f5-tts_infer-gradio --host 0.0.0.0
+```
+
+### Runtime
+
+Deployment solution with Triton and TensorRT-LLM.
+
+#### Benchmark Results
+Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, with 16 NFE steps.
+
+| Model               | Concurrency    | Avg Latency | RTF    | Mode            |
+|---------------------|----------------|-------------|--------|-----------------|
+| F5-TTS Base (Vocos) | 2              | 253 ms      | 0.0394 | Client-Server   |
+| F5-TTS Base (Vocos) | 1 (Batch_size) | -           | 0.0402 | Offline TRT-LLM |
+| F5-TTS Base (Vocos) | 1 (Batch_size) | -           | 0.1467 | Offline PyTorch |
+
+See [detailed instructions](src/f5_tts/runtime/triton_trtllm/README.md) for more information.
+
+
+## Inference
+
+- To achieve the desired performance, take a moment to read the [detailed guidance](src/f5_tts/infer).
+- Searching the [issues](https://github.com/SWivid/F5-TTS/issues?q=is%3Aissue) for keywords related to a problem you encounter is very helpful.
+
+### 1. Gradio App
+
+Currently supported features:
+
+- Basic TTS with Chunk Inference
+- Multi-Style / Multi-Speaker Generation
+- Voice Chat powered by Qwen2.5-3B-Instruct
+- [Custom inference with more language support](src/f5_tts/infer/SHARED.md)
+
+```bash
+# Launch a Gradio app (web interface)
+f5-tts_infer-gradio
+
+# Specify the port/host
+f5-tts_infer-gradio --port 7860 --host 0.0.0.0
+
+# Launch a share link
+f5-tts_infer-gradio --share
+```
+
+<details>
+<summary>NVIDIA device docker compose file example</summary>
+
+```yaml
+services:
+  f5-tts:
+    image: ghcr.io/swivid/f5-tts:main
+    ports:
+      - "7860:7860"
+    environment:
+      GRADIO_SERVER_PORT: 7860
+    entrypoint: ["f5-tts_infer-gradio", "--port", "7860", "--host", "0.0.0.0"]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

+volumes:
+  f5-tts:
+    driver: local
+```
+
+</details>
+
+### 2. CLI Inference
+
+```bash
+# Run with flags
+# Leaving --ref_text "" will have the ASR model transcribe the reference audio (extra GPU memory usage)
+f5-tts_infer-cli --model F5TTS_v1_Base \
+--ref_audio "provide_prompt_wav_path_here.wav" \
+--ref_text "The content, subtitle or transcription of reference audio." \
+--gen_text "Some text you want TTS model generate for you."
+
+# Run with the default setting: src/f5_tts/infer/examples/basic/basic.toml
+f5-tts_infer-cli
+# Or with your own .toml file
+f5-tts_infer-cli -c custom.toml
+
+# Multi-voice. See src/f5_tts/infer/README.md
+f5-tts_infer-cli -c src/f5_tts/infer/examples/multi/story.toml
+```
+
+
+## Training
+
+### 1. With Hugging Face Accelerate
+
+Refer to [training & finetuning guidance](src/f5_tts/train) for best practices.
+
+### 2. With Gradio App
+
+```bash
+# Quick start with the Gradio web interface
+f5-tts_finetune-gradio
+```
+
+Read [training & finetuning guidance](src/f5_tts/train) for more instructions.
+
+
+## [Evaluation](src/f5_tts/eval)
+
+
+## Development
+
+Use pre-commit to ensure code quality (it will run linters and formatters automatically):
+
+```bash
+pip install pre-commit
+pre-commit install
+```
+
+When making a pull request, before each commit, run:
+
+```bash
+pre-commit run --all-files
+```
+
+Note: Some model components have linting exceptions for E722 to accommodate tensor notation.
+
+
+## Acknowledgements
+
+- [E2-TTS](https://arxiv.org/abs/2406.18009) brilliant work, simple and effective
+- [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763), [LibriTTS](https://arxiv.org/abs/1904.02882), [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) valuable datasets
+- [lucidrains](https://github.com/lucidrains) initial CFM structure, and [bfs18](https://github.com/bfs18) for discussion
+- [SD3](https://arxiv.org/abs/2403.03206) & [Hugging Face diffusers](https://github.com/huggingface/diffusers) DiT and MMDiT code structure
+- [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) and [BigVGAN](https://github.com/NVIDIA/BigVGAN) as vocoders
+- [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech), [SpeechMOS](https://github.com/tarepan/SpeechMOS) for evaluation tools
+- [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) for speech edit test
+- [mrfakename](https://x.com/realmrfakename) huggingface space demo ~
+- [f5-tts-mlx](https://github.com/lucasnewman/f5-tts-mlx/tree/main) Implementation with MLX framework by [Lucas Newman](https://github.com/lucasnewman)
+- [F5-TTS-ONNX](https://github.com/DakeQQ/F5-TTS-ONNX) ONNX Runtime version by [DakeQQ](https://github.com/DakeQQ)
+- [Yuekai Zhang](https://github.com/yuekaizhang) Triton and TensorRT-LLM support ~
+
+## Citation
+If our work and codebase are useful for you, please cite as:
+```
+@article{chen-etal-2024-f5tts,
+      title={F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching},
+      author={Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen},
+      journal={arXiv preprint arXiv:2410.06885},
+      year={2024},
+}
+```
+## License
+
+Our code is released under the MIT License. The pre-trained models are licensed under the CC-BY-NC license due to the training data Emilia, which is an in-the-wild dataset. Sorry for any inconvenience this may cause.

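A note on the RTF column in the benchmark table above: real-time factor is synthesis wall-clock time divided by the duration of the audio produced, so 0.0394 means roughly 25x faster than real time. Below is a sketch of the metric's definition only; `synthesize` is a hypothetical callable, and this is not the repo's benchmark harness.

```python
import time

def real_time_factor(synthesize, text: str, sample_rate: int = 24000) -> float:
    """Wall-clock synthesis time divided by duration of the generated audio."""
    start = time.perf_counter()
    wav = synthesize(text)  # hypothetical: returns a 1-D sequence of samples at `sample_rate`
    elapsed = time.perf_counter() - start
    return elapsed / (len(wav) / sample_rate)  # < 1.0 means faster than real time
```
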
F5-TTS/pyproject.toml
ADDED
@@ -0,0 +1,64 @@
+[build-system]
+requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "f5-tts"
+version = "1.1.9"
+description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
+readme = "README.md"
+license = {text = "MIT License"}
+classifiers = [
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+]
+dependencies = [
+    "accelerate>=0.33.0",
+    "bitsandbytes>0.37.0; platform_machine!='arm64' and platform_system!='Darwin'",
+    "cached_path",
+    "click",
+    "datasets",
+    "ema_pytorch>=0.5.2",
+    "gradio>=5.0.0",
+    "hydra-core>=1.3.0",
+    "jieba",
+    "librosa",
+    "matplotlib",
+    "numpy<=1.26.4; python_version<='3.10'",
+    "pydantic<=2.10.6",
+    "pydub",
+    "pypinyin",
+    "safetensors",
+    "soundfile",
+    "tomli",
+    "torch>=2.0.0",
+    "torchaudio>=2.0.0",
+    "torchdiffeq",
+    "tqdm>=4.65.0",
+    "transformers",
+    "transformers_stream_generator",
+    "unidecode",
+    "vocos",
+    "wandb",
+    "x_transformers>=1.31.14",
+]
+
+[project.optional-dependencies]
+eval = [
+    "faster_whisper==0.10.1",
+    "funasr",
+    "jiwer",
+    "modelscope",
+    "zhconv",
+    "zhon",
+]
+
+[project.urls]
+Homepage = "https://github.com/SWivid/F5-TTS"
+
+[project.scripts]
+"f5-tts_infer-cli" = "f5_tts.infer.infer_cli:main"
+"f5-tts_infer-gradio" = "f5_tts.infer.infer_gradio:main"
+"f5-tts_finetune-cli" = "f5_tts.train.finetune_cli:main"
+"f5-tts_finetune-gradio" = "f5_tts.train.finetune_gradio:main"

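The `[project.scripts]` table above is what provides the `f5-tts_*` commands used throughout the README: each key becomes a console executable that imports and calls the named `module:function`. A rough sketch of an equivalent manual invocation, assuming the entry-point function reads `sys.argv` itself, which is how such console scripts are typically wired (printing help exits the process):

```python
import sys

from f5_tts.infer.infer_cli import main  # target of the "f5-tts_infer-cli" entry point

sys.argv = ["f5-tts_infer-cli", "--help"]  # simulate the command line
main()  # parses sys.argv, prints usage, then exits
```
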
F5-TTS/ruff.toml
ADDED
@@ -0,0 +1,10 @@
+line-length = 120
+target-version = "py310"
+
+[lint]
+# Only ignore variables with names starting with "_".
+dummy-variable-rgx = "^_.*$"
+
+[lint.isort]
+force-single-line = false
+lines-after-imports = 2

F5-TTS/src/f5_tts/api.py
ADDED
@@ -0,0 +1,164 @@
+import random
+import sys
+from importlib.resources import files
+
+import soundfile as sf
+import tqdm
+from cached_path import cached_path
+from hydra.utils import get_class
+from omegaconf import OmegaConf
+
+from f5_tts.infer.utils_infer import (
+    infer_process,
+    load_model,
+    load_vocoder,
+    preprocess_ref_audio_text,
+    remove_silence_for_generated_wav,
+    save_spectrogram,
+    transcribe,
+)
+from f5_tts.model.utils import seed_everything
+
+
+class F5TTS:
+    def __init__(
+        self,
+        model="F5TTS_v1_Base",
+        ckpt_file="",
+        vocab_file="",
+        ode_method="euler",
+        use_ema=True,
+        vocoder_local_path=None,
+        device=None,
+        hf_cache_dir=None,
+    ):
+        model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
+        model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
+        model_arc = model_cfg.model.arch
+
+        self.mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
+        self.target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
+
+        self.ode_method = ode_method
+        self.use_ema = use_ema
+
+        if device is not None:
+            self.device = device
+        else:
+            import torch
+
+            self.device = (
+                "cuda"
+                if torch.cuda.is_available()
+                else "xpu"
+                if torch.xpu.is_available()
+                else "mps"
+                if torch.backends.mps.is_available()
+                else "cpu"
+            )
+
+        # Load models
+        self.vocoder = load_vocoder(
+            self.mel_spec_type, vocoder_local_path is not None, vocoder_local_path, self.device, hf_cache_dir
+        )
+
+        repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"
+
+        # override for previous models
+        if model == "F5TTS_Base":
+            if self.mel_spec_type == "vocos":
+                ckpt_step = 1200000
+            elif self.mel_spec_type == "bigvgan":
+                model = "F5TTS_Base_bigvgan"
+                ckpt_type = "pt"
+        elif model == "E2TTS_Base":
+            repo_name = "E2-TTS"
+            ckpt_step = 1200000
+
+        if not ckpt_file:
+            ckpt_file = str(
+                cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}", cache_dir=hf_cache_dir)
+            )
+        self.ema_model = load_model(
+            model_cls, model_arc, ckpt_file, self.mel_spec_type, vocab_file, self.ode_method, self.use_ema, self.device
+        )
+
+    def transcribe(self, ref_audio, language=None):
+        return transcribe(ref_audio, language)
+
+    def export_wav(self, wav, file_wave, remove_silence=False):
+        sf.write(file_wave, wav, self.target_sample_rate)
+
+        if remove_silence:
+            remove_silence_for_generated_wav(file_wave)
+
+    def export_spectrogram(self, spec, file_spec):
+        save_spectrogram(spec, file_spec)
+
+    def infer(
+        self,
+        ref_file,
+        ref_text,
+        gen_text,
+        show_info=print,
+        progress=tqdm,
+        target_rms=0.1,
+        cross_fade_duration=0.15,
+        sway_sampling_coef=-1,
+        cfg_strength=2,
+        nfe_step=32,
+        speed=1.0,
+        fix_duration=None,
+        remove_silence=False,
+        file_wave=None,
+        file_spec=None,
+        seed=None,
+    ):
+        if seed is None:
+            seed = random.randint(0, sys.maxsize)
+        seed_everything(seed)
+        self.seed = seed
+
+        ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text)
+
+        wav, sr, spec = infer_process(
+            ref_file,
+            ref_text,
+            gen_text,
+            self.ema_model,
+            self.vocoder,
+            self.mel_spec_type,
+            show_info=show_info,
+            progress=progress,
+            target_rms=target_rms,
+            cross_fade_duration=cross_fade_duration,
+            nfe_step=nfe_step,
+            cfg_strength=cfg_strength,
+            sway_sampling_coef=sway_sampling_coef,
+            speed=speed,
+            fix_duration=fix_duration,
+            device=self.device,
+        )
+
+        if file_wave is not None:
+            self.export_wav(wav, file_wave, remove_silence)
+
+        if file_spec is not None:
+            self.export_spectrogram(spec, file_spec)
+
+        return wav, sr, spec
+
+
+if __name__ == "__main__":
+    f5tts = F5TTS()
+
+    wav, sr, spec = f5tts.infer(
+        ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
+        ref_text="Some call me nature, others call me mother nature.",
+        gen_text="I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
+        file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
+        file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
+        seed=None,
+    )
+
+    print("seed :", f5tts.seed)

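One detail of `F5TTS.infer()` worth highlighting: when `seed=None`, a random seed is drawn, passed through `seed_everything`, and recorded on the instance as `self.seed`, so a generation can be replayed exactly. A minimal sketch, assuming the package and its bundled example prompt are installed:

```python
from importlib.resources import files

from f5_tts.api import F5TTS

f5tts = F5TTS()  # downloads the default F5TTS_v1_Base checkpoint on first use

ref_wav = str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav"))
ref_txt = "Some call me nature, others call me mother nature."

wav, sr, spec = f5tts.infer(ref_file=ref_wav, ref_text=ref_txt, gen_text="Hello there.", seed=None)
print("seed used:", f5tts.seed)

# Passing the recorded seed back in reproduces the same waveform.
wav2, _, _ = f5tts.infer(ref_file=ref_wav, ref_text=ref_txt, gen_text="Hello there.", seed=f5tts.seed)
```
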
F5-TTS/src/f5_tts/configs/E2TTS_Base.yaml
ADDED
@@ -0,0 +1,49 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # frame | sample
+  max_samples: 64  # max sequences per batch if using frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 11
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup updates
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: E2TTS_Base
+  tokenizer: pinyin
+  tokenizer_path: null  # if 'custom' tokenizer, define the path to the vocab.txt you want to use
+  backbone: UNetT
+  arch:
+    dim: 1024
+    depth: 24
+    heads: 16
+    ff_mult: 4
+    text_mask_padding: False
+    pe_attn_head: 1
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # vocos | bigvgan
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: null  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | null
+  log_samples: True  # infer a random sample per saved checkpoint. wip, normal to fail with extra long samples
+  save_per_updates: 50000  # save checkpoint per updates
+  keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+  last_per_updates: 5000  # save last checkpoint per updates
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}

F5-TTS/src/f5_tts/configs/E2TTS_Small.yaml
ADDED
@@ -0,0 +1,49 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # frame | sample
+  max_samples: 64  # max sequences per batch if using frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 11
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup updates
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0
+  bnb_optimizer: False
+
+model:
+  name: E2TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: null  # if 'custom' tokenizer, define the path to the vocab.txt you want to use
+  backbone: UNetT
+  arch:
+    dim: 768
+    depth: 20
+    heads: 12
+    ff_mult: 4
+    text_mask_padding: False
+    pe_attn_head: 1
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # vocos | bigvgan
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: null  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | null
+  log_samples: True  # infer a random sample per saved checkpoint. wip, normal to fail with extra long samples
+  save_per_updates: 50000  # save checkpoint per updates
+  keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+  last_per_updates: 5000  # save last checkpoint per updates
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}

F5-TTS/src/f5_tts/configs/F5TTS_Base.yaml
ADDED
@@ -0,0 +1,54 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # frame | sample
+  max_samples: 64  # max sequences per batch if using frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 11
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup updates
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: F5TTS_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: null  # if 'custom' tokenizer, define the path to the vocab.txt you want to use
+  backbone: DiT
+  arch:
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    text_mask_padding: False
+    conv_layers: 4
+    pe_attn_head: 1
+    attn_backend: torch  # torch | flash_attn
+    attn_mask_enabled: False
+    checkpoint_activations: False  # recompute activations and save memory for extra compute
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # vocos | bigvgan
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: null  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | null
+  log_samples: True  # infer a random sample per saved checkpoint. wip, normal to fail with extra long samples
+  save_per_updates: 50000  # save checkpoint per updates
+  keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+  last_per_updates: 5000  # save last checkpoint per updates
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}

F5-TTS/src/f5_tts/configs/F5TTS_Small.yaml
ADDED
@@ -0,0 +1,54 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # frame | sample
+  max_samples: 64  # max sequences per batch if using frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 11  # only suitable for Emilia; to train on LibriTTS, set epochs to 686
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup updates
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: F5TTS_Small
+  tokenizer: pinyin
+  tokenizer_path: null  # if 'custom' tokenizer, define the path to the vocab.txt you want to use
+  backbone: DiT
+  arch:
+    dim: 768
+    depth: 18
+    heads: 12
+    ff_mult: 2
+    text_dim: 512
+    text_mask_padding: False
+    conv_layers: 4
+    pe_attn_head: 1
+    attn_backend: torch  # torch | flash_attn
+    attn_mask_enabled: False
+    checkpoint_activations: False  # recompute activations and save memory for extra compute
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # vocos | bigvgan
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: null  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | null
+  log_samples: True  # infer a random sample per saved checkpoint. wip, normal to fail with extra long samples
+  save_per_updates: 50000  # save checkpoint per updates
+  keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+  last_per_updates: 5000  # save last checkpoint per updates
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}

F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml
ADDED
@@ -0,0 +1,55 @@
+hydra:
+  run:
+    dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+datasets:
+  name: Emilia_ZH_EN  # dataset name
+  batch_size_per_gpu: 38400  # 8 GPUs, 8 * 38400 = 307200
+  batch_size_type: frame  # frame | sample
+  max_samples: 64  # max sequences per batch if using frame-wise batch_size. we set 32 for small models, 64 for base models
+  num_workers: 16
+
+optim:
+  epochs: 11
+  learning_rate: 7.5e-5
+  num_warmup_updates: 20000  # warmup updates
+  grad_accumulation_steps: 1  # note: updates = steps / grad_accumulation_steps
+  max_grad_norm: 1.0  # gradient clipping
+  bnb_optimizer: False  # use bnb 8bit AdamW optimizer or not
+
+model:
+  name: F5TTS_v1_Base  # model name
+  tokenizer: pinyin  # tokenizer type
+  tokenizer_path: null  # if 'custom' tokenizer, define the path to the vocab.txt you want to use
+  backbone: DiT
+  arch:
+    dim: 1024
+    depth: 22
+    heads: 16
+    ff_mult: 2
+    text_dim: 512
+    text_mask_padding: True
+    qk_norm: null  # null | rms_norm
+    conv_layers: 4
+    pe_attn_head: null
+    attn_backend: torch  # torch | flash_attn
+    attn_mask_enabled: False
+    checkpoint_activations: False  # recompute activations and save memory for extra compute
+  mel_spec:
+    target_sample_rate: 24000
+    n_mel_channels: 100
+    hop_length: 256
+    win_length: 1024
+    n_fft: 1024
+    mel_spec_type: vocos  # vocos | bigvgan
+  vocoder:
+    is_local: False  # use local offline ckpt or not
+    local_path: null  # local vocoder path
+
+ckpts:
+  logger: wandb  # wandb | tensorboard | null
+  log_samples: True  # infer a random sample per saved checkpoint. wip, normal to fail with extra long samples
+  save_per_updates: 50000  # save checkpoint per updates
+  keep_last_n_checkpoints: -1  # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+  last_per_updates: 5000  # save last checkpoint per updates
+  save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}

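A note on the batch settings shared by all the configs above: with `batch_size_type: frame`, `batch_size_per_gpu` counts mel frames rather than utterances. A back-of-envelope check of what 38400 frames mean in audio time, using the mel parameters from these files (this is just arithmetic, not code from the repo):

```python
hop_length = 256       # samples advanced per mel frame
sample_rate = 24000    # Hz
frames_per_gpu = 38400
num_gpus = 8

seconds_per_frame = hop_length / sample_rate        # ~0.0107 s of audio per frame
audio_per_gpu = frames_per_gpu * seconds_per_frame  # 409.6 s of audio per GPU per batch
audio_per_update = audio_per_gpu * num_gpus         # 3276.8 s (~55 min) per optimizer update
print(f"{audio_per_gpu:.1f} s per GPU, {audio_per_update:.0f} s per update")
```
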
F5-TTS/src/f5_tts/eval/README.md
ADDED
@@ -0,0 +1,52 @@
+
+# Evaluation
+
+Install the packages needed for evaluation:
+
+```bash
+pip install -e .[eval]
+```
+
+## Generating Samples for Evaluation
+
+### Prepare Test Datasets
+
+1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
+2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
+3. Unzip the downloaded datasets and place them in the `data/` directory.
+4. Update the path to the *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`.
+5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`
+
+### Batch Inference for Test Set
+
+To run batch inference for evaluations, execute the following commands:
+
+```bash
+# batch inference for evaluations
+accelerate config  # if not set before
+bash src/f5_tts/eval/eval_infer_batch.sh
+```
+
+## Objective Evaluation on Generated Results
+
+### Download Evaluation Model Checkpoints
+
+1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
+2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
+3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
+
+Then update the following scripts with the paths where you placed the evaluation model checkpoints.
+
+### Objective Evaluation
+
+Update the path to your batch-inferenced results, and carry out WER / SIM / UTMOS evaluations:
+```bash
+# Evaluation [WER] for Seed-TTS test [ZH] set
+python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAV_DIR> --gpu_nums 8
+
+# Evaluation [SIM] for LibriSpeech-PC test-clean (cross-sentence)
+python src/f5_tts/eval/eval_librispeech_test_clean.py --eval_task sim --gen_wav_dir <GEN_WAV_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
+
+# Evaluation [UTMOS]. --ext: Audio extension
+python src/f5_tts/eval/eval_utmos.py --audio_dir <WAV_DIR> --ext wav
+```

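For reference, the WER task above is the word-level edit distance between the ASR transcript of the generated audio and the target text, normalized by the reference word count. A minimal illustration with `jiwer` (installed via the `eval` extra); the repo's scripts additionally handle text normalization and the Chinese/English specifics:

```python
import jiwer

reference = "some call me nature and others call me mother nature"   # 10 words
hypothesis = "some call me nature and others call me other nature"   # one substitution

# WER = (substitutions + deletions + insertions) / reference word count
print(jiwer.wer(reference, hypothesis))  # 0.1
```
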
F5-TTS/src/f5_tts/eval/ecapa_tdnn.py
ADDED
@@ -0,0 +1,331 @@
+# just for speaker similarity evaluation, third-party code
+
+# From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
+# part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
+
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+""" Res2Conv1d + BatchNorm1d + ReLU
+"""
+
+
+class Res2Conv1dReluBn(nn.Module):
+    """
+    in_channels == out_channels == channels
+    """
+
+    def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
+        super().__init__()
+        assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
+        self.scale = scale
+        self.width = channels // scale
+        self.nums = scale if scale == 1 else scale - 1
+
+        self.convs = []
+        self.bns = []
+        for i in range(self.nums):
+            self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
+            self.bns.append(nn.BatchNorm1d(self.width))
+        self.convs = nn.ModuleList(self.convs)
+        self.bns = nn.ModuleList(self.bns)
+
+    def forward(self, x):
+        out = []
+        spx = torch.split(x, self.width, 1)
+        for i in range(self.nums):
+            if i == 0:
+                sp = spx[i]
+            else:
+                sp = sp + spx[i]
+            # Order: conv -> relu -> bn
+            sp = self.convs[i](sp)
+            sp = self.bns[i](F.relu(sp))
+            out.append(sp)
+        if self.scale != 1:
+            out.append(spx[self.nums])
+        out = torch.cat(out, dim=1)
+
+        return out
+
+
+""" Conv1d + BatchNorm1d + ReLU
+"""
+
+
+class Conv1dReluBn(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
+        super().__init__()
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
+        self.bn = nn.BatchNorm1d(out_channels)
+
+    def forward(self, x):
+        return self.bn(F.relu(self.conv(x)))
+
+
+""" The SE connection of 1D case.
+"""
+
+
+class SE_Connect(nn.Module):
+    def __init__(self, channels, se_bottleneck_dim=128):
+        super().__init__()
+        self.linear1 = nn.Linear(channels, se_bottleneck_dim)
+        self.linear2 = nn.Linear(se_bottleneck_dim, channels)
+
+    def forward(self, x):
+        out = x.mean(dim=2)
+        out = F.relu(self.linear1(out))
+        out = torch.sigmoid(self.linear2(out))
+        out = x * out.unsqueeze(2)
+
+        return out
+
+
+""" SE-Res2Block of the ECAPA-TDNN architecture.
+"""
+
+# def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
+#     return nn.Sequential(
+#         Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
+#         Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
+#         Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
+#         SE_Connect(channels)
+#     )
+
+
+class SE_Res2Block(nn.Module):
+    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
+        super().__init__()
+        self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
+        self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
+        self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
+
+        self.shortcut = None
+        if in_channels != out_channels:
+            self.shortcut = nn.Conv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=1,
+            )
+
+    def forward(self, x):
+        residual = x
+        if self.shortcut:
+            residual = self.shortcut(x)
+
+        x = self.Conv1dReluBn1(x)
+        x = self.Res2Conv1dReluBn(x)
+        x = self.Conv1dReluBn2(x)
+        x = self.SE_Connect(x)
+
+        return x + residual
+
+
+""" Attentive weighted mean and standard deviation pooling.
+"""
+
+
+class AttentiveStatsPool(nn.Module):
+    def __init__(self, in_dim, attention_channels=128, global_context_att=False):
+        super().__init__()
+        self.global_context_att = global_context_att
+
+        # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
+        if global_context_att:
+            self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1)  # equals W and b in the paper
+        else:
+            self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1)  # equals W and b in the paper
+        self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1)  # equals V and k in the paper
+
+    def forward(self, x):
+        if self.global_context_att:
+            context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
+            context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
+            x_in = torch.cat((x, context_mean, context_std), dim=1)
+        else:
+            x_in = x
+
+        # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
+        alpha = torch.tanh(self.linear1(x_in))
+        # alpha = F.relu(self.linear1(x_in))
+        alpha = torch.softmax(self.linear2(alpha), dim=2)
+        mean = torch.sum(alpha * x, dim=2)
+        residuals = torch.sum(alpha * (x**2), dim=2) - mean**2
+        std = torch.sqrt(residuals.clamp(min=1e-9))
+        return torch.cat([mean, std], dim=1)
+
+
+class ECAPA_TDNN(nn.Module):
+    def __init__(
+        self,
+        feat_dim=80,
+        channels=512,
+        emb_dim=192,
+        global_context_att=False,
+        feat_type="wavlm_large",
+        sr=16000,
+        feature_selection="hidden_states",
+        update_extract=False,
+        config_path=None,
+    ):
+        super().__init__()
+
+        self.feat_type = feat_type
+        self.feature_selection = feature_selection
+        self.update_extract = update_extract
+        self.sr = sr
+
+        torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
+        try:
+            local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
+            self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
+        except:  # noqa: E722
+            self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)
+
+        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
+            self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"
+        ):
+            self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
+        if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
+            self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"
+        ):
+            self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
+
+        self.feat_num = self.get_feat_num()
+        self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
+
+        if feat_type != "fbank" and feat_type != "mfcc":
+            freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
+            for name, param in self.feature_extract.named_parameters():
+                for freeze_val in freeze_list:
+                    if freeze_val in name:
+                        param.requires_grad = False
+                        break
+
+        if not self.update_extract:
+            for param in self.feature_extract.parameters():
+                param.requires_grad = False
+
+        self.instance_norm = nn.InstanceNorm1d(feat_dim)
+        # self.channels = [channels] * 4 + [channels * 3]
+        self.channels = [channels] * 4 + [1536]
+
+        self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
+        self.layer2 = SE_Res2Block(
+            self.channels[0],
+            self.channels[1],
+            kernel_size=3,
+            stride=1,
+            padding=2,
+            dilation=2,
+            scale=8,
+            se_bottleneck_dim=128,
+        )
+        self.layer3 = SE_Res2Block(
+            self.channels[1],
+            self.channels[2],
+            kernel_size=3,
+            stride=1,
+            padding=3,
+            dilation=3,
+            scale=8,
+            se_bottleneck_dim=128,
+        )
+        self.layer4 = SE_Res2Block(
+            self.channels[2],
+            self.channels[3],
+            kernel_size=3,
+            stride=1,
+            padding=4,
+            dilation=4,
+            scale=8,
+            se_bottleneck_dim=128,
+        )
+
+        # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
+        cat_channels = channels * 3
+        self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
+        self.pooling = AttentiveStatsPool(
+            self.channels[-1], attention_channels=128, global_context_att=global_context_att
+        )
+        self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
+        self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
+
+    def get_feat_num(self):
+        self.feature_extract.eval()
+        wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
+        with torch.no_grad():
+            features = self.feature_extract(wav)
+        select_feature = features[self.feature_selection]
+        if isinstance(select_feature, (list, tuple)):
+            return len(select_feature)
+        else:
+            return 1
+
+    def get_feat(self, x):
+        if self.update_extract:
+            x = self.feature_extract([sample for sample in x])
+        else:
+            with torch.no_grad():
+                if self.feat_type == "fbank" or self.feat_type == "mfcc":
+                    x = self.feature_extract(x) + 1e-6  # B x feat_dim x time_len
+                else:
+                    x = self.feature_extract([sample for sample in x])
+
+        if self.feat_type == "fbank":
+            x = x.log()
+
+        if self.feat_type != "fbank" and self.feat_type != "mfcc":
+            x = x[self.feature_selection]
+            if isinstance(x, (list, tuple)):
+                x = torch.stack(x, dim=0)
+            else:
+                x = x.unsqueeze(0)
+            norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
+            x = (norm_weights * x).sum(dim=0)
+            x = torch.transpose(x, 1, 2) + 1e-6
+
+        x = self.instance_norm(x)
+        return x
+
+    def forward(self, x):
+        x = self.get_feat(x)
+
+        out1 = self.layer1(x)
+        out2 = self.layer2(out1)
+        out3 = self.layer3(out2)
+        out4 = self.layer4(out3)
+
+        out = torch.cat([out2, out3, out4], dim=1)
+        out = F.relu(self.conv(out))
+        out = self.bn(self.pooling(out))
+        out = self.linear(out)
+
+        return out
+
+
+def ECAPA_TDNN_SMALL(
+    feat_dim,
+    emb_dim=256,
+    feat_type="wavlm_large",
+    sr=16000,
+    feature_selection="hidden_states",
+    update_extract=False,
+    config_path=None,
+):
+    return ECAPA_TDNN(
+        feat_dim=feat_dim,
+        channels=512,
+        emb_dim=emb_dim,
+        feat_type=feat_type,
+        sr=sr,
+        feature_selection=feature_selection,
+        update_extract=update_extract,
+        config_path=config_path,
+    )

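To make the intended use of this module concrete: the SIM evaluation embeds the generated and reference utterances with this model and scores them by cosine similarity. A minimal sketch, assuming the WavLM-large checkpoint from the eval README is saved locally as `wavlm_large_finetune.pth` and follows the usual UniSpeech state-dict layout (both assumptions; the repo's eval utilities handle the real loading and resampling):

```python
import torch
import torch.nn.functional as F

from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL

# Speaker encoder on top of frozen WavLM-large features (s3prl is fetched via torch.hub).
model = ECAPA_TDNN_SMALL(feat_dim=1024, emb_dim=256, feat_type="wavlm_large")
state = torch.load("wavlm_large_finetune.pth", map_location="cpu")  # path/key layout assumed
model.load_state_dict(state["model"], strict=False)
model.eval()

# Two 16 kHz waveforms (random placeholders here) -> one embedding each.
wav1, wav2 = torch.randn(1, 16000), torch.randn(1, 16000)
with torch.no_grad():
    emb1, emb2 = model(wav1), model(wav2)

sim = F.cosine_similarity(emb1, emb2).item()  # SIM score in [-1, 1]
print(f"SIM: {sim:.3f}")
```
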
F5-TTS/src/f5_tts/eval/eval_infer_batch.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os
import sys


sys.path.append(os.getcwd())

import argparse
import time
from importlib.resources import files

import torch
import torchaudio
from accelerate import Accelerator
from hydra.utils import get_class
from omegaconf import OmegaConf
from tqdm import tqdm

from f5_tts.eval.utils_eval import (
    get_inference_prompt,
    get_librispeech_test_clean_metainfo,
    get_seedtts_testset_metainfo,
)
from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
from f5_tts.model import CFM
from f5_tts.model.utils import get_tokenizer


accelerator = Accelerator()
device = f"cuda:{accelerator.process_index}"


use_ema = True
target_rms = 0.1


rel_path = str(files("f5_tts").joinpath("../../"))


def main():
    parser = argparse.ArgumentParser(description="batch inference")

    parser.add_argument("-s", "--seed", default=None, type=int)
    parser.add_argument("-n", "--expname", required=True)
    parser.add_argument("-c", "--ckptstep", default=1250000, type=int)

    parser.add_argument("-nfe", "--nfestep", default=32, type=int)
    parser.add_argument("-o", "--odemethod", default="euler")
    parser.add_argument("-ss", "--swaysampling", default=-1, type=float)

    parser.add_argument("-t", "--testset", required=True)

    args = parser.parse_args()

    seed = args.seed
    exp_name = args.expname
    ckpt_step = args.ckptstep

    nfe_step = args.nfestep
    ode_method = args.odemethod
    sway_sampling_coef = args.swaysampling

    testset = args.testset

    infer_batch_size = 1  # max frames. 1 for ddp single inference (recommended)
    cfg_strength = 2.0
    speed = 1.0
    use_truth_duration = False
    no_ref_audio = False

    model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{exp_name}.yaml")))
    model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
    model_arc = model_cfg.model.arch

    dataset_name = model_cfg.datasets.name
    tokenizer = model_cfg.model.tokenizer

    mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
    target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
    n_mel_channels = model_cfg.model.mel_spec.n_mel_channels
    hop_length = model_cfg.model.mel_spec.hop_length
    win_length = model_cfg.model.mel_spec.win_length
    n_fft = model_cfg.model.mel_spec.n_fft

    if testset == "ls_pc_test_clean":
        metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
        librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
        metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)

    elif testset == "seedtts_test_zh":
        metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
        metainfo = get_seedtts_testset_metainfo(metalst)

    elif testset == "seedtts_test_en":
        metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
        metainfo = get_seedtts_testset_metainfo(metalst)

    # path to save generated wavs
    output_dir = (
        f"{rel_path}/"
        f"results/{exp_name}_{ckpt_step}/{testset}/"
        f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
        f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
        f"_cfg{cfg_strength}_speed{speed}"
        f"{'_gt-dur' if use_truth_duration else ''}"
        f"{'_no-ref-audio' if no_ref_audio else ''}"
    )

    # -------------------------------------------------#

    prompts_all = get_inference_prompt(
        metainfo,
        speed=speed,
        tokenizer=tokenizer,
        target_sample_rate=target_sample_rate,
        n_mel_channels=n_mel_channels,
        hop_length=hop_length,
        mel_spec_type=mel_spec_type,
        target_rms=target_rms,
        use_truth_duration=use_truth_duration,
        infer_batch_size=infer_batch_size,
    )

    # Vocoder model
    local = False
    if mel_spec_type == "vocos":
        vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
    elif mel_spec_type == "bigvgan":
        vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
    vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)

    # Tokenizer
    vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

    # Model
    model = CFM(
        transformer=model_cls(**model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
        mel_spec_kwargs=dict(
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            n_mel_channels=n_mel_channels,
            target_sample_rate=target_sample_rate,
            mel_spec_type=mel_spec_type,
        ),
        odeint_kwargs=dict(
            method=ode_method,
        ),
        vocab_char_map=vocab_char_map,
    ).to(device)

    ckpt_prefix = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}"
    if os.path.exists(ckpt_prefix + ".pt"):
        ckpt_path = ckpt_prefix + ".pt"
    elif os.path.exists(ckpt_prefix + ".safetensors"):
        ckpt_path = ckpt_prefix + ".safetensors"
    else:
        print("Loading from self-organized training checkpoints rather than released pretrained.")
        ckpt_path = rel_path + f"/{model_cfg.ckpts.save_dir}/model_{ckpt_step}.pt"

    dtype = torch.float32 if mel_spec_type == "bigvgan" else None
    model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)

    if not os.path.exists(output_dir) and accelerator.is_main_process:
        os.makedirs(output_dir)

    # start batch inference
    accelerator.wait_for_everyone()
    start = time.time()

    with accelerator.split_between_processes(prompts_all) as prompts:
        for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
            utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
            ref_mels = ref_mels.to(device)
            ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
            total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)

            # Inference
            with torch.inference_mode():
                generated, _ = model.sample(
                    cond=ref_mels,
                    text=final_text_list,
                    duration=total_mel_lens,
                    lens=ref_mel_lens,
                    steps=nfe_step,
                    cfg_strength=cfg_strength,
                    sway_sampling_coef=sway_sampling_coef,
                    no_ref_audio=no_ref_audio,
                    seed=seed,
                )
            # Final result
            for i, gen in enumerate(generated):
                gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
                gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
                if mel_spec_type == "vocos":
                    generated_wave = vocoder.decode(gen_mel_spec).cpu()
                elif mel_spec_type == "bigvgan":
                    generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()

                if ref_rms_list[i] < target_rms:
                    generated_wave = generated_wave * ref_rms_list[i] / target_rms
                torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        timediff = time.time() - start
        print(f"Done batch inference in {timediff / 60:.2f} minutes.")


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/eval/eval_infer_batch.sh
ADDED
|
@@ -0,0 +1,18 @@
#!/bin/bash

# e.g. F5-TTS, 16 NFE
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_zh" -nfe 16
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_en" -nfe 16
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "ls_pc_test_clean" -nfe 16

# e.g. Vanilla E2 TTS, 32 NFE
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_zh" -o "midpoint" -ss 0
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_en" -o "midpoint" -ss 0
accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "ls_pc_test_clean" -o "midpoint" -ss 0

# e.g. evaluate F5-TTS 32 NFE result on Seed-TTS test-zh
python src/f5_tts/eval/eval_seedtts_testset.py -e wer -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
python src/f5_tts/eval/eval_seedtts_testset.py -e sim -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
python src/f5_tts/eval/eval_utmos.py --audio_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0

# etc.
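As an aside, the result directory names used above are composed by `eval_infer_batch.py`; a small Python sketch of that naming logic (values illustrative only):

```python
seed, ode_method, nfe_step, mel_spec_type = 0, "euler", 32, "vocos"
sway_sampling_coef, cfg_strength, speed = -1, 2.0, 1.0

# mirrors the output_dir f-string in eval_infer_batch.py
name = (
    f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
    f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
    f"_cfg{cfg_strength}_speed{speed}"
)
print(name)  # seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0
```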
F5-TTS/src/f5_tts/eval/eval_librispeech_test_clean.py
ADDED
|
@@ -0,0 +1,89 @@
# Evaluate with LibriSpeech test-clean, ~3s prompt to generate 4-10s audio (the way of valle/voicebox evaluation)

import argparse
import json
import os
import sys


sys.path.append(os.getcwd())

import multiprocessing as mp
from importlib.resources import files

import numpy as np

from f5_tts.eval.utils_eval import get_librispeech_test, run_asr_wer, run_sim


rel_path = str(files("f5_tts").joinpath("../../"))


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
    parser.add_argument("-l", "--lang", type=str, default="en")
    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
    parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
    return parser.parse_args()


def main():
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    librispeech_test_clean_path = args.librispeech_test_clean_path  # test-clean path
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"

    gpus = list(range(args.gpu_nums))
    test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)

    ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
    ## leading to a low similarity for the ground truth in some cases.
    # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True)  # eval ground truth

    local = args.local
    if local:  # use local custom checkpoint dir
        asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------------------------------------------------------

    full_results = []
    metrics = []

    if eval_task == "wer":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_asr_wer, args)
            for r in results:
                full_results.extend(r)
    elif eval_task == "sim":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_sim, args)
            for r in results:
                full_results.extend(r)
    else:
        raise ValueError(f"Unknown metric type: {eval_task}")

    result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
    with open(result_path, "w") as f:
        for line in full_results:
            metrics.append(line[eval_task])
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        metric = round(np.mean(metrics), 5)
        f.write(f"\n{eval_task.upper()}: {metric}\n")

    print(f"\nTotal {len(metrics)} samples")
    print(f"{eval_task.upper()}: {metric}")
    print(f"{eval_task.upper()} results saved to {result_path}")


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/eval/eval_seedtts_testset.py
ADDED
|
@@ -0,0 +1,88 @@
# Evaluate with Seed-TTS testset

import argparse
import json
import os
import sys


sys.path.append(os.getcwd())

import multiprocessing as mp
from importlib.resources import files

import numpy as np

from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim


rel_path = str(files("f5_tts").joinpath("../../"))


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
    parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
    parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
    parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
    parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
    return parser.parse_args()


def main():
    args = get_args()
    eval_task = args.eval_task
    lang = args.lang
    gen_wav_dir = args.gen_wav_dir
    metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset

    # NOTE: paraformer-zh results differ slightly with the number of GPUs, because the per-worker batch size differs
    # zh 1.254 seems to be the result of a 4-worker wer_seed_tts run
    gpus = list(range(args.gpu_nums))
    test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)

    local = args.local
    if local:  # use local custom checkpoint dir
        if lang == "zh":
            asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
        elif lang == "en":
            asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
    else:
        asr_ckpt_dir = ""  # auto download to cache dir
    wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"

    # --------------------------------------------------------------------------

    full_results = []
    metrics = []

    if eval_task == "wer":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_asr_wer, args)
            for r in results:
                full_results.extend(r)
    elif eval_task == "sim":
        with mp.Pool(processes=len(gpus)) as pool:
            args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
            results = pool.map(run_sim, args)
            for r in results:
                full_results.extend(r)
    else:
        raise ValueError(f"Unknown metric type: {eval_task}")

    result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
    with open(result_path, "w") as f:
        for line in full_results:
            metrics.append(line[eval_task])
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        metric = round(np.mean(metrics), 5)
        f.write(f"\n{eval_task.upper()}: {metric}\n")

    print(f"\nTotal {len(metrics)} samples")
    print(f"{eval_task.upper()}: {metric}")
    print(f"{eval_task.upper()} results saved to {result_path}")


if __name__ == "__main__":
    main()
F5-TTS/src/f5_tts/eval/eval_utmos.py
ADDED
|
@@ -0,0 +1,42 @@
import argparse
import json
from pathlib import Path

import librosa
import torch
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser(description="UTMOS Evaluation")
    parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
    parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu"

    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    predictor = predictor.to(device)

    audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
    utmos_score = 0

    utmos_result_path = Path(args.audio_dir) / "_utmos_results.jsonl"
    with open(utmos_result_path, "w", encoding="utf-8") as f:
        for audio_path in tqdm(audio_paths, desc="Processing"):
            wav, sr = librosa.load(audio_path, sr=None, mono=True)
            wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
            score = predictor(wav_tensor, sr)
            line = {}
            line["wav"], line["utmos"] = str(audio_path.stem), score.item()
            utmos_score += score.item()
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        avg_score = utmos_score / len(audio_paths) if len(audio_paths) > 0 else 0
        f.write(f"\nUTMOS: {avg_score:.4f}\n")

    print(f"UTMOS: {avg_score:.4f}")
    print(f"UTMOS results saved to {utmos_result_path}")


if __name__ == "__main__":
    main()
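For a quick single-file check outside the batch script above, the same torch.hub predictor can be called directly (a sketch; the wav path is a placeholder):

```python
import librosa
import torch

predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)

wav, sr = librosa.load("sample.wav", sr=None, mono=True)  # placeholder path
score = predictor(torch.from_numpy(wav).unsqueeze(0), sr)  # the predictor takes (waveform, sample_rate), as in eval_utmos.py
print(f"UTMOS: {score.item():.4f}")
```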
F5-TTS/src/f5_tts/eval/utils_eval.py
ADDED
|
@@ -0,0 +1,419 @@
import math
import os
import random
import string
from pathlib import Path

import torch
import torch.nn.functional as F
import torchaudio
from tqdm import tqdm

from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
from f5_tts.model.modules import MelSpec
from f5_tts.model.utils import convert_char_to_pinyin


# seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
def get_seedtts_testset_metainfo(metalst):
    f = open(metalst)
    lines = f.readlines()
    f.close()
    metainfo = []
    for line in lines:
        if len(line.strip().split("|")) == 5:
            utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
        elif len(line.strip().split("|")) == 4:
            utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
            gt_wav = os.path.join(os.path.dirname(metalst), "wavs", utt + ".wav")
        if not os.path.isabs(prompt_wav):
            prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
        metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
    return metainfo


# librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
    f = open(metalst)
    lines = f.readlines()
    f.close()
    metainfo = []
    for line in lines:
        ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")

        # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
        ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
        ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")

        # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
        gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
        gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")

        metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))

    return metainfo


# padded to max length mel batch
def padded_mel_batch(ref_mels):
    max_mel_length = torch.LongTensor([mel.shape[-1] for mel in ref_mels]).amax()
    padded_ref_mels = []
    for mel in ref_mels:
        padded_ref_mel = F.pad(mel, (0, max_mel_length - mel.shape[-1]), value=0)
        padded_ref_mels.append(padded_ref_mel)
    padded_ref_mels = torch.stack(padded_ref_mels)
    padded_ref_mels = padded_ref_mels.permute(0, 2, 1)
    return padded_ref_mels


# get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav


def get_inference_prompt(
    metainfo,
    speed=1.0,
    tokenizer="pinyin",
    polyphone=True,
    target_sample_rate=24000,
    n_fft=1024,
    win_length=1024,
    n_mel_channels=100,
    hop_length=256,
    mel_spec_type="vocos",
    target_rms=0.1,
    use_truth_duration=False,
    infer_batch_size=1,
    num_buckets=200,
    min_secs=3,
    max_secs=40,
):
    prompts_all = []

    min_tokens = min_secs * target_sample_rate // hop_length
    max_tokens = max_secs * target_sample_rate // hop_length

    batch_accum = [0] * num_buckets
    utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = (
        [[] for _ in range(num_buckets)] for _ in range(6)
    )

    mel_spectrogram = MelSpec(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        n_mel_channels=n_mel_channels,
        target_sample_rate=target_sample_rate,
        mel_spec_type=mel_spec_type,
    )

    for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
        # Audio
        ref_audio, ref_sr = torchaudio.load(prompt_wav)
        ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
        if ref_rms < target_rms:
            ref_audio = ref_audio * target_rms / ref_rms
        assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
        if ref_sr != target_sample_rate:
            resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
            ref_audio = resampler(ref_audio)

        # Text
        if len(prompt_text[-1].encode("utf-8")) == 1:
            prompt_text = prompt_text + " "
        text = [prompt_text + gt_text]
        if tokenizer == "pinyin":
            text_list = convert_char_to_pinyin(text, polyphone=polyphone)
        else:
            text_list = text

        # to mel spectrogram
        ref_mel = mel_spectrogram(ref_audio)
        ref_mel = ref_mel.squeeze(0)

        # Duration, mel frame length
        ref_mel_len = ref_mel.shape[-1]

        if use_truth_duration:
            gt_audio, gt_sr = torchaudio.load(gt_wav)
            if gt_sr != target_sample_rate:
                resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
                gt_audio = resampler(gt_audio)
            total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)

            # # test vocoder resynthesis
            # ref_audio = gt_audio
        else:
            # estimate generated length from the byte-length ratio of gen vs ref text
            ref_text_len = len(prompt_text.encode("utf-8"))
            gen_text_len = len(gt_text.encode("utf-8"))
            total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)

        # deal with batch
        assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
        assert min_tokens <= total_mel_len <= max_tokens, (
            f"Audio {utt} has duration {total_mel_len * hop_length // target_sample_rate}s out of range [{min_secs}, {max_secs}]."
        )
        bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)

        utts[bucket_i].append(utt)
        ref_rms_list[bucket_i].append(ref_rms)
        ref_mels[bucket_i].append(ref_mel)
        ref_mel_lens[bucket_i].append(ref_mel_len)
        total_mel_lens[bucket_i].append(total_mel_len)
        final_text_list[bucket_i].extend(text_list)

        batch_accum[bucket_i] += total_mel_len

        if batch_accum[bucket_i] >= infer_batch_size:
            # print(f"\n{len(ref_mels[bucket_i][0][0])}\n{ref_mel_lens[bucket_i]}\n{total_mel_lens[bucket_i]}")
            prompts_all.append(
                (
                    utts[bucket_i],
                    ref_rms_list[bucket_i],
                    padded_mel_batch(ref_mels[bucket_i]),
                    ref_mel_lens[bucket_i],
                    total_mel_lens[bucket_i],
                    final_text_list[bucket_i],
                )
            )
            batch_accum[bucket_i] = 0
            (
                utts[bucket_i],
                ref_rms_list[bucket_i],
                ref_mels[bucket_i],
                ref_mel_lens[bucket_i],
                total_mel_lens[bucket_i],
                final_text_list[bucket_i],
            ) = [], [], [], [], [], []

    # add residual
    for bucket_i, bucket_frames in enumerate(batch_accum):
        if bucket_frames > 0:
            prompts_all.append(
                (
                    utts[bucket_i],
                    ref_rms_list[bucket_i],
                    padded_mel_batch(ref_mels[bucket_i]),
                    ref_mel_lens[bucket_i],
                    total_mel_lens[bucket_i],
                    final_text_list[bucket_i],
                )
            )
    # shuffle batches so the last workers are not left with only the easy (short) ones
    random.seed(666)
    random.shuffle(prompts_all)

    return prompts_all


# get wav_res_ref_text of seed-tts test metalst
# https://github.com/BytedanceSpeech/seed-tts-eval


def get_seed_tts_test(metalst, gen_wav_dir, gpus):
    f = open(metalst)
    lines = f.readlines()
    f.close()

    test_set_ = []
    for line in tqdm(lines):
        if len(line.strip().split("|")) == 5:
            utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
        elif len(line.strip().split("|")) == 4:
            utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")

        if not os.path.exists(os.path.join(gen_wav_dir, utt + ".wav")):
            continue
        gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
        if not os.path.isabs(prompt_wav):
            prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)

        test_set_.append((gen_wav, prompt_wav, gt_text))

    num_jobs = len(gpus)
    if num_jobs == 1:
        return [(gpus[0], test_set_)]

    wav_per_job = len(test_set_) // num_jobs + 1
    test_set = []
    for i in range(num_jobs):
        test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))

    return test_set


# get librispeech test-clean cross sentence test


def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
    f = open(metalst)
    lines = f.readlines()
    f.close()

    test_set_ = []
    for line in tqdm(lines):
        ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")

        if eval_ground_truth:
            gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
            gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
        else:
            if not os.path.exists(os.path.join(gen_wav_dir, gen_utt + ".wav")):
                raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
            gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")

        ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
        ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")

        test_set_.append((gen_wav, ref_wav, gen_txt))

    num_jobs = len(gpus)
    if num_jobs == 1:
        return [(gpus[0], test_set_)]

    wav_per_job = len(test_set_) // num_jobs + 1
    test_set = []
    for i in range(num_jobs):
        test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))

    return test_set


# load asr model


def load_asr_model(lang, ckpt_dir=""):
    if lang == "zh":
        from funasr import AutoModel

        model = AutoModel(
            model=os.path.join(ckpt_dir, "paraformer-zh"),
            # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
            # punc_model = os.path.join(ckpt_dir, "ct-punc"),
            # spk_model = os.path.join(ckpt_dir, "cam++"),
            disable_update=True,
        )  # following seed-tts setting
    elif lang == "en":
        from faster_whisper import WhisperModel

        model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
        model = WhisperModel(model_size, device="cuda", compute_type="float16")
    return model


# WER Evaluation, the way Seed-TTS does


def run_asr_wer(args):
    rank, lang, test_set, ckpt_dir = args

    if lang == "zh":
        import zhconv

        torch.cuda.set_device(rank)
    elif lang == "en":
        os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    else:
        raise NotImplementedError(
            "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
        )

    asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)

    from zhon.hanzi import punctuation

    punctuation_all = punctuation + string.punctuation
    wer_results = []

    from jiwer import compute_measures

    for gen_wav, prompt_wav, truth in tqdm(test_set):
        if lang == "zh":
            res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
            hypo = res[0]["text"]
            hypo = zhconv.convert(hypo, "zh-cn")
        elif lang == "en":
            segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
            hypo = ""
            for segment in segments:
                hypo = hypo + " " + segment.text

        raw_truth = truth
        raw_hypo = hypo

        # strip all punctuation (both Chinese and ASCII) before scoring
        for x in punctuation_all:
            truth = truth.replace(x, "")
            hypo = hypo.replace(x, "")

        truth = truth.replace("  ", " ")
        hypo = hypo.replace("  ", " ")

        if lang == "zh":
            truth = " ".join([x for x in truth])  # character-level WER for zh
            hypo = " ".join([x for x in hypo])
        elif lang == "en":
            truth = truth.lower()
            hypo = hypo.lower()

        measures = compute_measures(truth, hypo)
        wer = measures["wer"]

        # ref_list = truth.split(" ")
        # subs = measures["substitutions"] / len(ref_list)
        # dele = measures["deletions"] / len(ref_list)
        # inse = measures["insertions"] / len(ref_list)

        wer_results.append(
            {
                "wav": Path(gen_wav).stem,
                "truth": raw_truth,
                "hypo": raw_hypo,
                "wer": wer,
            }
        )

    return wer_results


# SIM Evaluation


def run_sim(args):
    rank, test_set, ckpt_dir = args
    device = f"cuda:{rank}"

    model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
    state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
    model.load_state_dict(state_dict["model"], strict=False)

    use_gpu = True if torch.cuda.is_available() else False
    if use_gpu:
        model = model.cuda(device)
    model.eval()

    sim_results = []
    for gen_wav, prompt_wav, truth in tqdm(test_set):
        wav1, sr1 = torchaudio.load(gen_wav)
        wav2, sr2 = torchaudio.load(prompt_wav)

        resample1 = torchaudio.transforms.Resample(orig_freq=sr1, new_freq=16000)
        resample2 = torchaudio.transforms.Resample(orig_freq=sr2, new_freq=16000)
        wav1 = resample1(wav1)
        wav2 = resample2(wav2)

        if use_gpu:
            wav1 = wav1.cuda(device)
            wav2 = wav2.cuda(device)
        with torch.no_grad():
            emb1 = model(wav1)
            emb2 = model(wav2)

        sim = F.cosine_similarity(emb1, emb2)[0].item()
        # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
        sim_results.append(
            {
                "wav": Path(gen_wav).stem,
                "sim": sim,
            }
        )

    return sim_results
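A worked instance of the duration estimate in `get_inference_prompt` above (the `use_truth_duration=False` branch): the generated portion is scaled from the reference mel length by the UTF-8 byte-length ratio of the two texts. The numbers here are illustrative only:

```python
hop_length, target_sample_rate, speed = 256, 24000, 1.0

ref_mel_len = 300  # ~3.2 s of reference audio: 300 * 256 / 24000
ref_text_len = len("Some call me nature.".encode("utf-8"))                 # 20 bytes
gen_text_len = len("Others call me mother nature here.".encode("utf-8"))  # 34 bytes

total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
print(total_mel_len)  # 300 + int(300 / 20 * 34) = 810 frames, ~8.6 s total
```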
F5-TTS/src/f5_tts/infer/README.md
ADDED
|
@@ -0,0 +1,177 @@
# Inference

The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.

**More checkpoints, built with community-wide efforts and supporting more languages, can be found in [SHARED.md](SHARED.md).**

A single generation currently supports up to **30s**, which is the **total length** (the same logic applies if `fix_duration` is used) including both the prompt and the output audio. However, `infer_cli` and `infer_gradio` will automatically chunk the generation for longer text. Long reference audio will be **clipped to ~12s**.

To avoid possible inference failures, make sure you have read through the following instructions.

- Use reference audio <12s and leave proper silence space (e.g. 1s) at the end. Otherwise there is a risk of truncating in the middle of a word, leading to suboptimal generation.
- <ins>Uppercased letters</ins> (best in a form like K.F.C.) will be uttered letter by letter, while lowercased letters are used for common words.
- Add some spaces (blank: " ") or punctuation (e.g. "," ".") <ins>to explicitly introduce some pauses</ins>.
- If English punctuation marks the end of a sentence, make sure there is a space " " after it. Otherwise it is not regarded as a sentence boundary when chunking.
- <ins>Preprocess numbers</ins> into Chinese characters if you want them read in Chinese, otherwise they are read in English.
- If the generation output is blank (pure silence), <ins>check for FFmpeg installation</ins>.
- Try <ins>turning off `use_ema` if using an early-stage</ins> finetuned checkpoint (one trained for just a few updates).


## Gradio App

Currently supported features:

- Basic TTS with Chunk Inference
- Multi-Style / Multi-Speaker Generation
- Voice Chat powered by Qwen2.5-3B-Instruct
- [Custom inference with more language support](SHARED.md)

The CLI command `f5-tts_infer-gradio` is equivalent to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio app (web interface) for inference.

The script will load model checkpoints from Hugging Face. You can also manually download files and update the path passed to `load_model()` in `infer_gradio.py`. Only the TTS models are loaded at first; the ASR model is loaded to do transcription if `ref_text` is not provided, and the LLM is loaded if Voice Chat is used.

More flag options:

```bash
# Automatically launch the interface in the default web browser
f5-tts_infer-gradio --inbrowser

# Set the root path of the application, if it's not served from the root ("/") of the domain
# For example, if the application is served at "https://example.com/myapp"
f5-tts_infer-gradio --root_path "/myapp"
```

It could also be used as a component of a larger application:
```python
import gradio as gr
from f5_tts.infer.infer_gradio import app

with gr.Blocks() as main_app:
    gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")

    # ... other Gradio components

    app.render()

main_app.launch()
```


## CLI Inference

The CLI command `f5-tts_infer-cli` is equivalent to `python src/f5_tts/infer/infer_cli.py`, a command-line tool for inference.

The script will load model checkpoints from Hugging Face. You can also manually download files and use `--ckpt_file` to specify the model you want to load, or directly update the path in `infer_cli.py`.

To change the vocabulary, use `--vocab_file` to provide your own `vocab.txt` file.

Basically, you can run inference with flags:
```bash
# Leaving --ref_text "" will have the ASR model transcribe (extra GPU memory usage)
f5-tts_infer-cli \
--model F5TTS_v1_Base \
--ref_audio "ref_audio.wav" \
--ref_text "The content, subtitle or transcription of reference audio." \
--gen_text "Some text you want TTS model generate for you."

# Use BigVGAN as the vocoder. Currently only F5TTS_Base is supported.
f5-tts_infer-cli --model F5TTS_Base --vocoder_name bigvgan --load_vocoder_from_local

# Use a custom-path checkpoint, e.g.
f5-tts_infer-cli --ckpt_file ckpts/F5TTS_v1_Base/model_1250000.safetensors

# More instructions
f5-tts_infer-cli --help
```

A `.toml` file allows more flexible usage.

```bash
f5-tts_infer-cli -c custom.toml
```

For example, you can use a `.toml` file to pass in variables, refer to `src/f5_tts/infer/examples/basic/basic.toml`:

```toml
# F5TTS_v1_Base | E2TTS_Base
model = "F5TTS_v1_Base"
ref_audio = "infer/examples/basic/basic_ref_en.wav"
# If an empty "", transcribes the reference audio automatically.
ref_text = "Some call me nature, others call me mother nature."
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
# File with text to generate. Ignores the text above.
gen_file = ""
remove_silence = false
output_dir = "tests"
```

You can also leverage a `.toml` file to do multi-style generation, refer to `src/f5_tts/infer/examples/multi/story.toml`.

```toml
# F5TTS_v1_Base | E2TTS_Base
model = "F5TTS_v1_Base"
ref_audio = "infer/examples/multi/main.flac"
# If an empty "", transcribes the reference audio automatically.
ref_text = ""
gen_text = ""
# File with text to generate. Ignores the text above.
gen_file = "infer/examples/multi/story.txt"
remove_silence = true
output_dir = "tests"

[voices.town]
ref_audio = "infer/examples/multi/town.flac"
ref_text = ""

[voices.country]
ref_audio = "infer/examples/multi/country.flac"
ref_text = ""
```
Mark the voice with `[main]` `[town]` `[country]` whenever you want to change voice, refer to `src/f5_tts/infer/examples/multi/story.txt` (a hypothetical snippet is sketched below).

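For illustration only (the real example lives at `src/f5_tts/infer/examples/multi/story.txt`), a hypothetical text using these voice tags might look like:

```
[main] A traveler stopped where two roads met.
[town] "Welcome to our town," called the mayor.
[country] "You'd rest easier out here in the country," said the farmer.
[main] The traveler thanked them both and walked on.
```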
## API Usage

```python
from importlib.resources import files
from f5_tts.api import F5TTS

f5tts = F5TTS()
wav, sr, spec = f5tts.infer(
    ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
    ref_text="some call me nature, others call me mother nature.",
    gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
    file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
    file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
    seed=None,
)
```
Check [api.py](../api.py) for more details.

## TensorRT-LLM Deployment

See [detailed instructions](../runtime/triton_trtllm/README.md) for more information.

## Socket Real-time Service

Real-time voice output with chunk stream:

```bash
# Start socket server
python src/f5_tts/socket_server.py

# If PyAudio not installed
sudo apt-get install portaudio19-dev
pip install pyaudio

# Communicate with socket client
python src/f5_tts/socket_client.py
```

## Speech Editing

To test speech editing capabilities, use the following command:

```bash
python src/f5_tts/infer/speech_edit.py
```
F5-TTS/src/f5_tts/infer/SHARED.md
ADDED
|
@@ -0,0 +1,193 @@
<!-- omit in toc -->
# Shared Model Cards

<!-- omit in toc -->
### **Prerequisites of using**
- This document serves as a quick lookup table for community training/finetuning results, with support for various languages.
- The models in this repository are open source and are based on voluntary contributions from contributors.
- Use of the models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.

<!-- omit in toc -->
### **Welcome to share here**
- Have a pretrained/finetuned result: a model checkpoint (best pruned to facilitate inference, i.e. leaving only `ema_model_state_dict`) and the corresponding vocab file (for tokenization).
- Host a public [Hugging Face model repository](https://huggingface.co/new) and upload the model-related files.
- Make a pull request adding a model card to the current page, i.e. `src/f5_tts/infer/SHARED.md` (a card template is sketched below).

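A hypothetical card template, following the format of the entries below; every name, link, and config value here is a placeholder to replace with your own:

#### F5-TTS Base @ xx @ YourName
|Model|🤗Hugging Face|Data (Hours)|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/YourName/F5-TTS-YourLang)|YourDataset|cc-by-nc-4.0|

```bash
Model: hf://YourName/F5-TTS-YourLang/model_xxxxxx.safetensors
Vocab: hf://YourName/F5-TTS-YourLang/vocab.txt
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
```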
| 16 |
+
<!-- omit in toc -->
|
| 17 |
+
### Supported Languages
|
| 18 |
+
- [Multilingual](#multilingual)
|
| 19 |
+
- [F5-TTS v1 v0 Base @ zh \& en @ F5-TTS](#f5-tts-v1-v0-base--zh--en--f5-tts)
|
| 20 |
+
- [English](#english)
|
| 21 |
+
- [Finnish](#finnish)
|
| 22 |
+
- [F5-TTS Base @ fi @ AsmoKoskinen](#f5-tts-base--fi--asmokoskinen)
|
| 23 |
+
- [French](#french)
|
| 24 |
+
- [F5-TTS Base @ fr @ RASPIAUDIO](#f5-tts-base--fr--raspiaudio)
|
| 25 |
+
- [German](#german)
|
| 26 |
+
- [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
|
| 27 |
+
- [Hindi](#hindi)
|
| 28 |
+
- [F5-TTS Small @ hi @ SPRINGLab](#f5-tts-small--hi--springlab)
|
| 29 |
+
- [Italian](#italian)
|
| 30 |
+
- [F5-TTS Base @ it @ alien79](#f5-tts-base--it--alien79)
|
| 31 |
+
- [Japanese](#japanese)
|
| 32 |
+
- [F5-TTS Base @ ja @ Jmica](#f5-tts-base--ja--jmica)
|
| 33 |
+
- [Mandarin](#mandarin)
|
| 34 |
+
- [Russian](#russian)
|
| 35 |
+
- [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
|
| 36 |
+
- [Spanish](#spanish)
|
| 37 |
+
- [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
## Multilingual
|
| 41 |
+
|
| 42 |
+
#### F5-TTS v1 v0 Base @ zh & en @ F5-TTS
|
| 43 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 44 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 45 |
+
|F5-TTS v1 Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_v1_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
Model: hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors
|
| 49 |
+
# A Variant Model: hf://SWivid/F5-TTS/F5TTS_v1_Base_no_zero_init/model_1250000.safetensors
|
| 50 |
+
Vocab: hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt
|
| 51 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 55 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 56 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
Model: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
|
| 60 |
+
Vocab: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
|
| 61 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
*Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
## English
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
## Finnish
|
| 71 |
+
|
| 72 |
+
#### F5-TTS Base @ fi @ AsmoKoskinen
|
| 73 |
+
|Model|🤗Hugging Face|Data|Model License|
|
| 74 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 75 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/AsmoKoskinen/F5-TTS_Finnish_Model)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0), [Vox Populi](https://huggingface.co/datasets/facebook/voxpopuli)|cc-by-nc-4.0|
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
Model: hf://AsmoKoskinen/F5-TTS_Finnish_Model/model_common_voice_fi_vox_populi_fi_20241206.safetensors
|
| 79 |
+
Vocab: hf://AsmoKoskinen/F5-TTS_Finnish_Model/vocab.txt
|
| 80 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
## French
|
| 85 |
+
|
| 86 |
+
#### F5-TTS Base @ fr @ RASPIAUDIO
|
| 87 |
+
|Model|🤗Hugging Face|Data (Hours)|Model License|
|
| 88 |
+
|:---:|:------------:|:-----------:|:-------------:|
|
| 89 |
+
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|
|
| 90 |
+
|
| 91 |
+
```bash
|
| 92 |
+
Model: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
|
| 93 |
+
Vocab: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
|
| 94 |
+
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
- [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
|
| 98 |
+
- [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
|
| 99 |
+
- [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
## German

#### F5-TTS Base @ de @ hvoss-techfak
|Model|🤗Hugging Face|Data (Hours)|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours crowdsourced|cc-by-nc-4.0|

```bash
Model: hf://hvoss-techfak/F5-TTS-German/model_f5tts_german.pt
Vocab: hf://hvoss-techfak/F5-TTS-German/vocab.txt
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
```

- Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)


## Hindi

#### F5-TTS Small @ hi @ SPRINGLab
|Model|🤗Hugging Face|Data (Hours)|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi)|cc-by-4.0|

```bash
Model: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
Vocab: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
Config: {"dim": 768, "depth": 18, "heads": 12, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
```

- Authors: SPRING Lab, Indian Institute of Technology, Madras
- Website: https://asr.iitm.ac.in/
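
Note that this checkpoint uses the smaller architecture (`dim` 768, `depth` 18), so the model name passed to the CLI must select the matching config. A hypothetical invocation, assuming an `F5TTS_Small` config is resolvable by the CLI the same way the Base configs are:

```bash
# Hypothetical: select the Small architecture so the config matches this checkpoint.
f5-tts_infer-cli \
    --model F5TTS_Small \
    --ckpt_file "hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors" \
    --vocab_file "hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt" \
    --ref_audio "your_ref.wav" \
    --ref_text "Transcription of the reference audio." \
    --gen_text "Text to synthesize."
```
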
## Italian

#### F5-TTS Base @ it @ alien79
|Model|🤗Hugging Face|Data|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/alien79/F5-TTS-italian)|[ylacombe/cml-tts](https://huggingface.co/datasets/ylacombe/cml-tts)|cc-by-nc-4.0|

```bash
Model: hf://alien79/F5-TTS-italian/model_159600.safetensors
Vocab: hf://alien79/F5-TTS-italian/vocab.txt
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
```

- Trained by [Mithril Man](https://github.com/MithrilMan)
- Model details on the [HF project page](https://huggingface.co/alien79/F5-TTS-italian)
- Open to collaborations to further improve the model


## Japanese

#### F5-TTS Base @ ja @ Jmica
|Model|🤗Hugging Face|Data (Hours)|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_21999120)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|

```bash
Model: hf://Jmica/F5TTS/JA_21999120/model_21999120.pt
Vocab: hf://Jmica/F5TTS/JA_21999120/vocab_japanese.txt
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
```


## Mandarin


## Russian

#### F5-TTS Base @ ru @ HotDro4illa
|Model|🤗Hugging Face|Data (Hours)|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/hotstone228/F5-TTS-Russian)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0)|cc-by-nc-4.0|

```bash
Model: hf://hotstone228/F5-TTS-Russian/model_last.safetensors
Vocab: hf://hotstone228/F5-TTS-Russian/vocab.txt
Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
```

- Finetuned by [HotDro4illa](https://github.com/HotDro4illa)
- Any improvements are welcome


## Spanish

#### F5-TTS Base @ es @ jpgallegoar
|Model|🤗Hugging Face|Data (Hours)|Model License|
|:---:|:------------:|:-----------:|:-------------:|
|F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|

- @jpgallegoar's [GitHub repo](https://github.com/jpgallegoar/Spanish-F5) provides a Jupyter notebook and Gradio usage for the Spanish model.

F5-TTS/src/f5_tts/infer/examples/basic/basic.toml
ADDED
@@ -0,0 +1,11 @@
# F5TTS_v1_Base | E2TTS_Base
model = "F5TTS_v1_Base"
ref_audio = "infer/examples/basic/basic_ref_en.wav"
# If ref_text is an empty string "", the reference audio is transcribed automatically.
ref_text = "Some call me nature, others call me mother nature."
gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
# Optional file of text to generate; if set, it takes precedence over gen_text above.
gen_file = ""
remove_silence = false
output_dir = "tests"
output_file = "infer_cli_basic.wav"
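
A TOML file like this can be handed to the inference CLI in one go; a minimal sketch, assuming the `-c`/`--config` flag and that flags given on the command line override values from the config:

```bash
# Run inference with every option taken from the TOML file above.
f5-tts_infer-cli -c src/f5_tts/infer/examples/basic/basic.toml

# Command-line flags are assumed to take precedence over the config, e.g.:
f5-tts_infer-cli -c src/f5_tts/infer/examples/basic/basic.toml --gen_text "Hello there."
```
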
F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b0e22048e72414fcc1e6b6342e47a774d748a195ed34e4a5b3fcf416707f2b71
size 256018
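
Three-line blocks like the one above are Git LFS pointer files, not the audio itself: the repository stores only the pointer (spec version, content hash, byte size), and the binary is fetched from LFS storage separately. A sketch of pulling the real audio after cloning, assuming `git-lfs` is installed:

```bash
# One-time setup: register the Git LFS filters for this user.
git lfs install

# Download and check out the LFS-tracked example audio files.
git lfs pull --include="F5-TTS/src/f5_tts/infer/examples/**"
```
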
F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:96724a113240d1f82c6ded1334122f0176b96c9226ccd3c919e625bcfd2a3ede
size 324558

F5-TTS/src/f5_tts/infer/examples/multi/country.flac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb15708b4b3875e37beec46591a5d89e1a9a63fdad3b8fe4a5c8738f4f554400
size 180321

F5-TTS/src/f5_tts/infer/examples/multi/main.flac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4abb1107771ce7e14926fde879b959dde6db6e572476b98684f04e45e978ab19
size 279219