foxiis committed on
Commit 7fa2003 (verified)
1 Parent(s): 231e57a

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete set of changes.
Files changed (50)
  1. .gitattributes +10 -0
  2. .github/ISSUE_TEMPLATE/bug_report.yml +50 -0
  3. .github/ISSUE_TEMPLATE/config.yml +1 -0
  4. .github/ISSUE_TEMPLATE/feature_request.yml +62 -0
  5. .github/ISSUE_TEMPLATE/help_wanted.yml +54 -0
  6. .github/ISSUE_TEMPLATE/question.yml +26 -0
  7. .github/workflows/pre-commit.yaml +14 -0
  8. .github/workflows/publish-docker-image.yaml +60 -0
  9. .github/workflows/publish-pypi.yaml +66 -0
  10. .gitignore +171 -0
  11. .gitmodules +3 -0
  12. .pre-commit-config.yaml +17 -0
  13. Dockerfile +30 -0
  14. F5-TTS/.github/ISSUE_TEMPLATE/bug_report.yml +50 -0
  15. F5-TTS/.github/ISSUE_TEMPLATE/config.yml +1 -0
  16. F5-TTS/.github/ISSUE_TEMPLATE/feature_request.yml +62 -0
  17. F5-TTS/.github/ISSUE_TEMPLATE/help_wanted.yml +54 -0
  18. F5-TTS/.github/ISSUE_TEMPLATE/question.yml +26 -0
  19. F5-TTS/.github/workflows/pre-commit.yaml +14 -0
  20. F5-TTS/.github/workflows/publish-docker-image.yaml +60 -0
  21. F5-TTS/.github/workflows/publish-pypi.yaml +66 -0
  22. F5-TTS/.gitignore +171 -0
  23. F5-TTS/.gitmodules +3 -0
  24. F5-TTS/.pre-commit-config.yaml +17 -0
  25. F5-TTS/Dockerfile +30 -0
  26. F5-TTS/LICENSE +21 -0
  27. F5-TTS/README.md +262 -0
  28. F5-TTS/pyproject.toml +64 -0
  29. F5-TTS/ruff.toml +10 -0
  30. F5-TTS/src/f5_tts/api.py +164 -0
  31. F5-TTS/src/f5_tts/configs/E2TTS_Base.yaml +49 -0
  32. F5-TTS/src/f5_tts/configs/E2TTS_Small.yaml +49 -0
  33. F5-TTS/src/f5_tts/configs/F5TTS_Base.yaml +54 -0
  34. F5-TTS/src/f5_tts/configs/F5TTS_Small.yaml +54 -0
  35. F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml +55 -0
  36. F5-TTS/src/f5_tts/eval/README.md +52 -0
  37. F5-TTS/src/f5_tts/eval/ecapa_tdnn.py +331 -0
  38. F5-TTS/src/f5_tts/eval/eval_infer_batch.py +210 -0
  39. F5-TTS/src/f5_tts/eval/eval_infer_batch.sh +18 -0
  40. F5-TTS/src/f5_tts/eval/eval_librispeech_test_clean.py +89 -0
  41. F5-TTS/src/f5_tts/eval/eval_seedtts_testset.py +88 -0
  42. F5-TTS/src/f5_tts/eval/eval_utmos.py +42 -0
  43. F5-TTS/src/f5_tts/eval/utils_eval.py +419 -0
  44. F5-TTS/src/f5_tts/infer/README.md +177 -0
  45. F5-TTS/src/f5_tts/infer/SHARED.md +193 -0
  46. F5-TTS/src/f5_tts/infer/examples/basic/basic.toml +11 -0
  47. F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav +3 -0
  48. F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav +3 -0
  49. F5-TTS/src/f5_tts/infer/examples/multi/country.flac +3 -0
  50. F5-TTS/src/f5_tts/infer/examples/multi/main.flac +3 -0
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
37
+ F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
38
+ F5-TTS/src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
39
+ F5-TTS/src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
40
+ F5-TTS/src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
41
+ src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
42
+ src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
43
+ src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
44
+ src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
45
+ src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
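Note: LFS rules like the ones added above are normally generated with `git lfs track` rather than written by hand. A minimal sketch of how the example audio files could be (re)tracked and verified, assuming Git LFS is installed (the paths are taken from the diff above):

```bash
# Append LFS filter rules for the example reference audio to .gitattributes
git lfs track "src/f5_tts/infer/examples/basic/*.wav"
git lfs track "src/f5_tts/infer/examples/multi/*.flac"

# Confirm a given file will be handled by the LFS filter
git check-attr filter -- src/f5_tts/infer/examples/multi/town.flac
```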
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,50 @@
1
+ name: "Bug Report"
2
+ description: |
3
+ Please provide as much details to help address the issue more efficiently, including input, output, logs and screenshots.
4
+ labels:
5
+ - bug
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To ensure timely help, please confirm the following:"
11
+ options:
12
+ - label: This template is only for bug reports, usage problems go with 'Help Wanted'.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Environment Details
23
+ description: "Provide details including OS, GPU info, Python version, any relevant software or dependencies, and trainer setting."
24
+ placeholder: e.g., CentOS Linux 7, 4 * RTX 3090, Python 3.10, torch==2.3.0+cu118, cuda 11.8, config yaml is ...
25
+ validations:
26
+ required: true
27
+ - type: textarea
28
+ attributes:
29
+ label: Steps to Reproduce
30
+ description: |
31
+ Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
32
+ placeholder: |
33
+ 1. Create a new conda environment.
34
+ 2. Clone the repository, install as local editable and properly set up.
35
+ 3. Run the command: `accelerate launch src/f5_tts/train/train.py`.
36
+ 4. Have following error message... (attach logs).
37
+ validations:
38
+ required: true
39
+ - type: textarea
40
+ attributes:
41
+ label: ✔️ Expected Behavior
42
+ placeholder: Describe in detail what you expected to happen.
43
+ validations:
44
+ required: false
45
+ - type: textarea
46
+ attributes:
47
+ label: ❌ Actual Behavior
48
+ placeholder: Describe in detail what actually happened.
49
+ validations:
50
+ required: false
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
1
+ blank_issues_enabled: false
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,62 @@
1
+ name: "Feature Request"
2
+ description: |
3
+ Some constructive suggestions and new ideas regarding current repo.
4
+ labels:
5
+ - enhancement
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To help us grasp quickly, please confirm the following:"
11
+ options:
12
+ - label: This template is only for feature request.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation but couldn't find any relevant information that meets my needs.
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, and found not discussion yet.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: 1. Is this request related to a challenge you're experiencing? Tell us your story.
23
+ description: |
24
+ Describe the specific problem or scenario you're facing in detail. For example:
25
+ *"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because...."*
26
+ placeholder: Please describe the situation in as much detail as possible.
27
+ validations:
28
+ required: true
29
+
30
+ - type: textarea
31
+ attributes:
32
+ label: 2. What is your suggested solution?
33
+ description: |
34
+ Provide a clear description of the feature or enhancement you'd like to propose.
35
+ How would this feature solve your issue or improve the project?
36
+ placeholder: Describe your idea or proposed solution here.
37
+ validations:
38
+ required: true
39
+
40
+ - type: textarea
41
+ attributes:
42
+ label: 3. Additional context or comments
43
+ description: |
44
+ Any other relevant information, links, documents, or screenshots that provide clarity.
45
+ Use this section for anything not covered above.
46
+ placeholder: Add any extra details here.
47
+ validations:
48
+ required: false
49
+
50
+ - type: checkboxes
51
+ attributes:
52
+ label: 4. Can you help us with this feature?
53
+ description: |
54
+ Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.
55
+ options:
56
+ - label: I am interested in contributing to this feature.
57
+ required: false
58
+
59
+ - type: markdown
60
+ attributes:
61
+ value: |
62
+ **Note:** Please submit only one request per issue to keep discussions focused and manageable.
.github/ISSUE_TEMPLATE/help_wanted.yml ADDED
@@ -0,0 +1,54 @@
1
+ name: "Help Wanted"
2
+ description: |
3
+ Please provide as much details to help address the issue more efficiently, including input, output, logs and screenshots.
4
+ labels:
5
+ - help wanted
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To ensure timely help, please confirm the following:"
11
+ options:
12
+ - label: This template is only for usage issues encountered.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Environment Details
23
+ description: "Provide details such as OS, Python version, and any relevant software or dependencies."
24
+ placeholder: |
25
+ e.g., macOS 13.5, Python 3.10, torch==2.3.0, Gradio 4.44.1
26
+ If training or finetuning related, provide detailed configuration including GPU info and training setup.
27
+ validations:
28
+ required: true
29
+ - type: textarea
30
+ attributes:
31
+ label: Steps to Reproduce
32
+ description: |
33
+ Include detailed steps, screenshots, and logs. Provide used prompt wav and text. Use the correct markdown syntax for code blocks.
34
+ placeholder: |
35
+ 1. Create a new conda environment.
36
+ 2. Clone the repository and install as pip package.
37
+ 3. Run the command: `f5-tts_infer-gradio` with no ref_text provided.
38
+ 4. Stuck there with the following message... (attach logs and also error msg e.g. after ctrl-c).
39
+ 5. Prompt & generated wavs are [change suffix to .mp4 to enable direct upload or pack all to .zip].
40
+ 6. Reference audio's transcription or provided ref_text is `xxx`, and text to generate is `xxx`.
41
+ validations:
42
+ required: true
43
+ - type: textarea
44
+ attributes:
45
+ label: ✔️ Expected Behavior
46
+ placeholder: Describe what you expected to happen in detail, e.g. output a generated audio.
47
+ validations:
48
+ required: false
49
+ - type: textarea
50
+ attributes:
51
+ label: ❌ Actual Behavior
52
+ placeholder: Describe what actually happened in detail, failure messages, etc.
53
+ validations:
54
+ required: false
.github/ISSUE_TEMPLATE/question.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: "Question"
2
+ description: |
3
+ Research question or pure inquiry about the project, usage issue goes with "help wanted".
4
+ labels:
5
+ - question
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To help us grasp quickly, please confirm the following:"
11
+ options:
12
+ - label: This template is only for research question, not usage problems, feature requests or bug reports.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation and read the related paper(s).
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, no similar questions.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Question details
23
+ description: |
24
+ Question details, clearly stated using proper markdown syntax.
25
+ validations:
26
+ required: true
.github/workflows/pre-commit.yaml ADDED
@@ -0,0 +1,14 @@
1
+ name: pre-commit
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main]
7
+
8
+ jobs:
9
+ pre-commit:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v3
13
+ - uses: actions/setup-python@v3
14
+ - uses: pre-commit/action@v3.0.1
.github/workflows/publish-docker-image.yaml ADDED
@@ -0,0 +1,60 @@
1
+ name: Create and publish a Docker image
2
+
3
+ # Configures this workflow to run every time a change is pushed to the branch called `release`.
4
+ on:
5
+ push:
6
+ branches: ['main']
7
+
8
+ # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
9
+ env:
10
+ REGISTRY: ghcr.io
11
+ IMAGE_NAME: ${{ github.repository }}
12
+
13
+ # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
14
+ jobs:
15
+ build-and-push-image:
16
+ runs-on: ubuntu-latest
17
+ # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
18
+ permissions:
19
+ contents: read
20
+ packages: write
21
+ #
22
+ steps:
23
+ - name: Checkout repository
24
+ uses: actions/checkout@v4
25
+ - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
26
+ uses: jlumbroso/free-disk-space@main
27
+ with:
28
+ # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
29
+ tool-cache: false
30
+
31
+ # All of these default to true, but feel free to set to "false" if necessary for your workflow
32
+ android: true
33
+ dotnet: true
34
+ haskell: true
35
+ large-packages: false
36
+ swap-storage: false
37
+ docker-images: false
38
+ # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
39
+ - name: Log in to the Container registry
40
+ uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
41
+ with:
42
+ registry: ${{ env.REGISTRY }}
43
+ username: ${{ github.actor }}
44
+ password: ${{ secrets.GITHUB_TOKEN }}
45
+ # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
46
+ - name: Extract metadata (tags, labels) for Docker
47
+ id: meta
48
+ uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
49
+ with:
50
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
51
+ # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
52
+ # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
53
+ # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
54
+ - name: Build and push Docker image
55
+ uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
56
+ with:
57
+ context: .
58
+ push: true
59
+ tags: ${{ steps.meta.outputs.tags }}
60
+ labels: ${{ steps.meta.outputs.labels }}
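Once this workflow runs on `main`, the image lands in the GitHub Container Registry under the repository name. A quick local check, assuming the default tagging of `docker/metadata-action` (branch name as tag) and the image path used in the README:

```bash
# Pull the image published by the workflow above
docker pull ghcr.io/swivid/f5-tts:main

# Inspect the labels attached by docker/metadata-action
docker inspect --format '{{json .Config.Labels}}' ghcr.io/swivid/f5-tts:main
```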
.github/workflows/publish-pypi.yaml ADDED
@@ -0,0 +1,66 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+
6
+ # GitHub recommends pinning actions to a commit SHA.
7
+ # To get a newer version, you will need to update the SHA.
8
+ # You can also reference a tag or branch, but the action may change without warning.
9
+
10
+ name: Upload Python Package
11
+
12
+ on:
13
+ release:
14
+ types: [published]
15
+
16
+ permissions:
17
+ contents: read
18
+
19
+ jobs:
20
+ release-build:
21
+ runs-on: ubuntu-latest
22
+
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.x"
29
+
30
+ - name: Build release distributions
31
+ run: |
32
+ # NOTE: put your own distribution build steps here.
33
+ python -m pip install build
34
+ python -m build
35
+
36
+ - name: Upload distributions
37
+ uses: actions/upload-artifact@v4
38
+ with:
39
+ name: release-dists
40
+ path: dist/
41
+
42
+ pypi-publish:
43
+ runs-on: ubuntu-latest
44
+
45
+ needs:
46
+ - release-build
47
+
48
+ permissions:
49
+ # IMPORTANT: this permission is mandatory for trusted publishing
50
+ id-token: write
51
+
52
+ # Dedicated environments with protections for publishing are strongly recommended.
53
+ environment:
54
+ name: pypi
55
+ # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
56
+ # url: https://pypi.org/p/YOURPROJECT
57
+
58
+ steps:
59
+ - name: Retrieve release distributions
60
+ uses: actions/download-artifact@v4
61
+ with:
62
+ name: release-dists
63
+ path: dist/
64
+
65
+ - name: Publish release distributions to PyPI
66
+ uses: pypa/gh-action-pypi-publish@release/v1
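The `release-build` job above can be reproduced locally before tagging a release; a sketch (the `twine check` step is an optional extra not part of the workflow):

```bash
# Same commands as the "Build release distributions" step
python -m pip install build
python -m build                     # writes sdist and wheel into dist/

# Optional local sanity check of the artifacts (assumes twine may be installed)
python -m pip install twine
python -m twine check dist/*
```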
.gitignore ADDED
@@ -0,0 +1,171 @@
1
+ # Customed
2
+ .vscode/
3
+ tests/
4
+ runs/
5
+ data/
6
+ ckpts/
7
+ wandb/
8
+ results/
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ share/python-wheels/
33
+ *.egg-info/
34
+ .installed.cfg
35
+ *.egg
36
+ MANIFEST
37
+
38
+ # PyInstaller
39
+ # Usually these files are written by a python script from a template
40
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
41
+ *.manifest
42
+ *.spec
43
+
44
+ # Installer logs
45
+ pip-log.txt
46
+ pip-delete-this-directory.txt
47
+
48
+ # Unit test / coverage reports
49
+ htmlcov/
50
+ .tox/
51
+ .nox/
52
+ .coverage
53
+ .coverage.*
54
+ .cache
55
+ nosetests.xml
56
+ coverage.xml
57
+ *.cover
58
+ *.py,cover
59
+ .hypothesis/
60
+ .pytest_cache/
61
+ cover/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ .pybuilder/
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ # For a library or package, you might want to ignore these files since the code is
96
+ # intended to run in multiple environments; otherwise, check them in:
97
+ # .python-version
98
+
99
+ # pipenv
100
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
102
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
103
+ # install all needed dependencies.
104
+ #Pipfile.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ #pdm.lock
116
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117
+ # in version control.
118
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119
+ .pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "src/third_party/BigVGAN"]
2
+ path = src/third_party/BigVGAN
3
+ url = https://github.com/NVIDIA/BigVGAN.git
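The BigVGAN submodule declared here is only needed when BigVGAN is used as the vocoder. A sketch of fetching it after cloning, matching the commands in the Dockerfile and README included in this commit:

```bash
git clone https://github.com/SWivid/F5-TTS.git
cd F5-TTS
# Pulls src/third_party/BigVGAN as declared in .gitmodules
git submodule update --init --recursive
```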
.pre-commit-config.yaml ADDED
@@ -0,0 +1,17 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.11.2
5
+ hooks:
6
+ - id: ruff
7
+ name: ruff linter
8
+ args: [--fix]
9
+ - id: ruff-format
10
+ name: ruff formatter
11
+ - id: ruff
12
+ name: ruff sorter
13
+ args: [--select, I, --fix]
14
+ - repo: https://github.com/pre-commit/pre-commit-hooks
15
+ rev: v5.0.0
16
+ hooks:
17
+ - id: check-yaml
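To apply this hook configuration locally (the same commands the bundled README recommends for development):

```bash
pip install pre-commit
pre-commit install             # run ruff lint/format and check-yaml on every commit
pre-commit run --all-files     # one-off pass over the whole repo, e.g. before a pull request
```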
Dockerfile ADDED
@@ -0,0 +1,30 @@
1
+ FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-devel
2
+
3
+ USER root
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ LABEL github_repo="https://github.com/SWivid/F5-TTS"
8
+
9
+ RUN set -x \
10
+ && apt-get update \
11
+ && apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim \
12
+ && apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
13
+ && apt-get install -y librdmacm1 libibumad3 librdmacm-dev libibverbs1 libibverbs-dev ibverbs-utils ibverbs-providers \
14
+ && rm -rf /var/lib/apt/lists/* \
15
+ && apt-get clean
16
+
17
+ WORKDIR /workspace
18
+
19
+ RUN git clone https://github.com/SWivid/F5-TTS.git \
20
+ && cd F5-TTS \
21
+ && git submodule update --init --recursive \
22
+ && pip install -e . --no-cache-dir
23
+
24
+ ENV SHELL=/bin/bash
25
+
26
+ VOLUME /root/.cache/huggingface/hub/
27
+
28
+ EXPOSE 7860
29
+
30
+ WORKDIR /workspace/F5-TTS
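A usage sketch for this Dockerfile, adapted from the commands in the README added by this commit (the image tag and volume name are arbitrary choices):

```bash
# Build the image from this Dockerfile
docker build -t f5tts:v1 .

# Run with GPU access and a persistent Hugging Face cache volume,
# serving the Gradio web interface on port 7860
docker run --rm -it --gpus=all \
  --mount 'type=volume,source=f5-tts,target=/root/.cache/huggingface/hub/' \
  -p 7860:7860 f5tts:v1 f5-tts_infer-gradio --host 0.0.0.0
```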
F5-TTS/.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,50 @@
1
+ name: "Bug Report"
2
+ description: |
3
+ Please provide as much details to help address the issue more efficiently, including input, output, logs and screenshots.
4
+ labels:
5
+ - bug
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To ensure timely help, please confirm the following:"
11
+ options:
12
+ - label: This template is only for bug reports, usage problems go with 'Help Wanted'.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Environment Details
23
+ description: "Provide details including OS, GPU info, Python version, any relevant software or dependencies, and trainer setting."
24
+ placeholder: e.g., CentOS Linux 7, 4 * RTX 3090, Python 3.10, torch==2.3.0+cu118, cuda 11.8, config yaml is ...
25
+ validations:
26
+ required: true
27
+ - type: textarea
28
+ attributes:
29
+ label: Steps to Reproduce
30
+ description: |
31
+ Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
32
+ placeholder: |
33
+ 1. Create a new conda environment.
34
+ 2. Clone the repository, install as local editable and properly set up.
35
+ 3. Run the command: `accelerate launch src/f5_tts/train/train.py`.
36
+ 4. Have following error message... (attach logs).
37
+ validations:
38
+ required: true
39
+ - type: textarea
40
+ attributes:
41
+ label: ✔️ Expected Behavior
42
+ placeholder: Describe in detail what you expected to happen.
43
+ validations:
44
+ required: false
45
+ - type: textarea
46
+ attributes:
47
+ label: ❌ Actual Behavior
48
+ placeholder: Describe in detail what actually happened.
49
+ validations:
50
+ required: false
F5-TTS/.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
1
+ blank_issues_enabled: false
F5-TTS/.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,62 @@
1
+ name: "Feature Request"
2
+ description: |
3
+ Some constructive suggestions and new ideas regarding current repo.
4
+ labels:
5
+ - enhancement
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To help us grasp quickly, please confirm the following:"
11
+ options:
12
+ - label: This template is only for feature request.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation but couldn't find any relevant information that meets my needs.
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, and found not discussion yet.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: 1. Is this request related to a challenge you're experiencing? Tell us your story.
23
+ description: |
24
+ Describe the specific problem or scenario you're facing in detail. For example:
25
+ *"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because...."*
26
+ placeholder: Please describe the situation in as much detail as possible.
27
+ validations:
28
+ required: true
29
+
30
+ - type: textarea
31
+ attributes:
32
+ label: 2. What is your suggested solution?
33
+ description: |
34
+ Provide a clear description of the feature or enhancement you'd like to propose.
35
+ How would this feature solve your issue or improve the project?
36
+ placeholder: Describe your idea or proposed solution here.
37
+ validations:
38
+ required: true
39
+
40
+ - type: textarea
41
+ attributes:
42
+ label: 3. Additional context or comments
43
+ description: |
44
+ Any other relevant information, links, documents, or screenshots that provide clarity.
45
+ Use this section for anything not covered above.
46
+ placeholder: Add any extra details here.
47
+ validations:
48
+ required: false
49
+
50
+ - type: checkboxes
51
+ attributes:
52
+ label: 4. Can you help us with this feature?
53
+ description: |
54
+ Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.
55
+ options:
56
+ - label: I am interested in contributing to this feature.
57
+ required: false
58
+
59
+ - type: markdown
60
+ attributes:
61
+ value: |
62
+ **Note:** Please submit only one request per issue to keep discussions focused and manageable.
F5-TTS/.github/ISSUE_TEMPLATE/help_wanted.yml ADDED
@@ -0,0 +1,54 @@
1
+ name: "Help Wanted"
2
+ description: |
3
+ Please provide as much details to help address the issue more efficiently, including input, output, logs and screenshots.
4
+ labels:
5
+ - help wanted
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To ensure timely help, please confirm the following:"
11
+ options:
12
+ - label: This template is only for usage issues encountered.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Environment Details
23
+ description: "Provide details such as OS, Python version, and any relevant software or dependencies."
24
+ placeholder: |
25
+ e.g., macOS 13.5, Python 3.10, torch==2.3.0, Gradio 4.44.1
26
+ If training or finetuning related, provide detailed configuration including GPU info and training setup.
27
+ validations:
28
+ required: true
29
+ - type: textarea
30
+ attributes:
31
+ label: Steps to Reproduce
32
+ description: |
33
+ Include detailed steps, screenshots, and logs. Provide used prompt wav and text. Use the correct markdown syntax for code blocks.
34
+ placeholder: |
35
+ 1. Create a new conda environment.
36
+ 2. Clone the repository and install as pip package.
37
+ 3. Run the command: `f5-tts_infer-gradio` with no ref_text provided.
38
+ 4. Stuck there with the following message... (attach logs and also error msg e.g. after ctrl-c).
39
+ 5. Prompt & generated wavs are [change suffix to .mp4 to enable direct upload or pack all to .zip].
40
+ 6. Reference audio's transcription or provided ref_text is `xxx`, and text to generate is `xxx`.
41
+ validations:
42
+ required: true
43
+ - type: textarea
44
+ attributes:
45
+ label: ✔️ Expected Behavior
46
+ placeholder: Describe what you expected to happen in detail, e.g. output a generated audio.
47
+ validations:
48
+ required: false
49
+ - type: textarea
50
+ attributes:
51
+ label: ❌ Actual Behavior
52
+ placeholder: Describe what actually happened in detail, failure messages, etc.
53
+ validations:
54
+ required: false
F5-TTS/.github/ISSUE_TEMPLATE/question.yml ADDED
@@ -0,0 +1,26 @@
1
+ name: "Question"
2
+ description: |
3
+ Research question or pure inquiry about the project, usage issue goes with "help wanted".
4
+ labels:
5
+ - question
6
+ body:
7
+ - type: checkboxes
8
+ attributes:
9
+ label: Checks
10
+ description: "To help us grasp quickly, please confirm the following:"
11
+ options:
12
+ - label: This template is only for research question, not usage problems, feature requests or bug reports.
13
+ required: true
14
+ - label: I have thoroughly reviewed the project documentation and read the related paper(s).
15
+ required: true
16
+ - label: I have searched for existing issues, including closed ones, no similar questions.
17
+ required: true
18
+ - label: I am using English to submit this issue to facilitate community communication.
19
+ required: true
20
+ - type: textarea
21
+ attributes:
22
+ label: Question details
23
+ description: |
24
+ Question details, clearly stated using proper markdown syntax.
25
+ validations:
26
+ required: true
F5-TTS/.github/workflows/pre-commit.yaml ADDED
@@ -0,0 +1,14 @@
1
+ name: pre-commit
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main]
7
+
8
+ jobs:
9
+ pre-commit:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v3
13
+ - uses: actions/setup-python@v3
14
+ - uses: pre-commit/action@v3.0.1
F5-TTS/.github/workflows/publish-docker-image.yaml ADDED
@@ -0,0 +1,60 @@
1
+ name: Create and publish a Docker image
2
+
3
+ # Configures this workflow to run every time a change is pushed to the branch called `release`.
4
+ on:
5
+ push:
6
+ branches: ['main']
7
+
8
+ # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
9
+ env:
10
+ REGISTRY: ghcr.io
11
+ IMAGE_NAME: ${{ github.repository }}
12
+
13
+ # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
14
+ jobs:
15
+ build-and-push-image:
16
+ runs-on: ubuntu-latest
17
+ # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
18
+ permissions:
19
+ contents: read
20
+ packages: write
21
+ #
22
+ steps:
23
+ - name: Checkout repository
24
+ uses: actions/checkout@v4
25
+ - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
26
+ uses: jlumbroso/free-disk-space@main
27
+ with:
28
+ # This might remove tools that are actually needed, if set to "true" but frees about 6 GB
29
+ tool-cache: false
30
+
31
+ # All of these default to true, but feel free to set to "false" if necessary for your workflow
32
+ android: true
33
+ dotnet: true
34
+ haskell: true
35
+ large-packages: false
36
+ swap-storage: false
37
+ docker-images: false
38
+ # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
39
+ - name: Log in to the Container registry
40
+ uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
41
+ with:
42
+ registry: ${{ env.REGISTRY }}
43
+ username: ${{ github.actor }}
44
+ password: ${{ secrets.GITHUB_TOKEN }}
45
+ # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
46
+ - name: Extract metadata (tags, labels) for Docker
47
+ id: meta
48
+ uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
49
+ with:
50
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
51
+ # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
52
+ # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
53
+ # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
54
+ - name: Build and push Docker image
55
+ uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
56
+ with:
57
+ context: .
58
+ push: true
59
+ tags: ${{ steps.meta.outputs.tags }}
60
+ labels: ${{ steps.meta.outputs.labels }}
F5-TTS/.github/workflows/publish-pypi.yaml ADDED
@@ -0,0 +1,66 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+
6
+ # GitHub recommends pinning actions to a commit SHA.
7
+ # To get a newer version, you will need to update the SHA.
8
+ # You can also reference a tag or branch, but the action may change without warning.
9
+
10
+ name: Upload Python Package
11
+
12
+ on:
13
+ release:
14
+ types: [published]
15
+
16
+ permissions:
17
+ contents: read
18
+
19
+ jobs:
20
+ release-build:
21
+ runs-on: ubuntu-latest
22
+
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.x"
29
+
30
+ - name: Build release distributions
31
+ run: |
32
+ # NOTE: put your own distribution build steps here.
33
+ python -m pip install build
34
+ python -m build
35
+
36
+ - name: Upload distributions
37
+ uses: actions/upload-artifact@v4
38
+ with:
39
+ name: release-dists
40
+ path: dist/
41
+
42
+ pypi-publish:
43
+ runs-on: ubuntu-latest
44
+
45
+ needs:
46
+ - release-build
47
+
48
+ permissions:
49
+ # IMPORTANT: this permission is mandatory for trusted publishing
50
+ id-token: write
51
+
52
+ # Dedicated environments with protections for publishing are strongly recommended.
53
+ environment:
54
+ name: pypi
55
+ # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status:
56
+ # url: https://pypi.org/p/YOURPROJECT
57
+
58
+ steps:
59
+ - name: Retrieve release distributions
60
+ uses: actions/download-artifact@v4
61
+ with:
62
+ name: release-dists
63
+ path: dist/
64
+
65
+ - name: Publish release distributions to PyPI
66
+ uses: pypa/gh-action-pypi-publish@release/v1
F5-TTS/.gitignore ADDED
@@ -0,0 +1,171 @@
1
+ # Customed
2
+ .vscode/
3
+ tests/
4
+ runs/
5
+ data/
6
+ ckpts/
7
+ wandb/
8
+ results/
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ share/python-wheels/
33
+ *.egg-info/
34
+ .installed.cfg
35
+ *.egg
36
+ MANIFEST
37
+
38
+ # PyInstaller
39
+ # Usually these files are written by a python script from a template
40
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
41
+ *.manifest
42
+ *.spec
43
+
44
+ # Installer logs
45
+ pip-log.txt
46
+ pip-delete-this-directory.txt
47
+
48
+ # Unit test / coverage reports
49
+ htmlcov/
50
+ .tox/
51
+ .nox/
52
+ .coverage
53
+ .coverage.*
54
+ .cache
55
+ nosetests.xml
56
+ coverage.xml
57
+ *.cover
58
+ *.py,cover
59
+ .hypothesis/
60
+ .pytest_cache/
61
+ cover/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ .pybuilder/
85
+ target/
86
+
87
+ # Jupyter Notebook
88
+ .ipynb_checkpoints
89
+
90
+ # IPython
91
+ profile_default/
92
+ ipython_config.py
93
+
94
+ # pyenv
95
+ # For a library or package, you might want to ignore these files since the code is
96
+ # intended to run in multiple environments; otherwise, check them in:
97
+ # .python-version
98
+
99
+ # pipenv
100
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
102
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
103
+ # install all needed dependencies.
104
+ #Pipfile.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ #pdm.lock
116
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117
+ # in version control.
118
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119
+ .pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
F5-TTS/.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "src/third_party/BigVGAN"]
2
+ path = src/third_party/BigVGAN
3
+ url = https://github.com/NVIDIA/BigVGAN.git
F5-TTS/.pre-commit-config.yaml ADDED
@@ -0,0 +1,17 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.11.2
5
+ hooks:
6
+ - id: ruff
7
+ name: ruff linter
8
+ args: [--fix]
9
+ - id: ruff-format
10
+ name: ruff formatter
11
+ - id: ruff
12
+ name: ruff sorter
13
+ args: [--select, I, --fix]
14
+ - repo: https://github.com/pre-commit/pre-commit-hooks
15
+ rev: v5.0.0
16
+ hooks:
17
+ - id: check-yaml
F5-TTS/Dockerfile ADDED
@@ -0,0 +1,30 @@
1
+ FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-devel
2
+
3
+ USER root
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ LABEL github_repo="https://github.com/SWivid/F5-TTS"
8
+
9
+ RUN set -x \
10
+ && apt-get update \
11
+ && apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim \
12
+ && apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
13
+ && apt-get install -y librdmacm1 libibumad3 librdmacm-dev libibverbs1 libibverbs-dev ibverbs-utils ibverbs-providers \
14
+ && rm -rf /var/lib/apt/lists/* \
15
+ && apt-get clean
16
+
17
+ WORKDIR /workspace
18
+
19
+ RUN git clone https://github.com/SWivid/F5-TTS.git \
20
+ && cd F5-TTS \
21
+ && git submodule update --init --recursive \
22
+ && pip install -e . --no-cache-dir
23
+
24
+ ENV SHELL=/bin/bash
25
+
26
+ VOLUME /root/.cache/huggingface/hub/
27
+
28
+ EXPOSE 7860
29
+
30
+ WORKDIR /workspace/F5-TTS
F5-TTS/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yushen CHEN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
F5-TTS/README.md ADDED
@@ -0,0 +1,262 @@
1
+ # F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
2
+
3
+ [![python](https://img.shields.io/badge/Python-3.10-brightgreen)](https://github.com/SWivid/F5-TTS)
4
+ [![arXiv](https://img.shields.io/badge/arXiv-2410.06885-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2410.06885)
5
+ [![demo](https://img.shields.io/badge/GitHub-Demo-orange.svg)](https://swivid.github.io/F5-TTS/)
6
+ [![hfspace](https://img.shields.io/badge/🤗-HF%20Space-yellow)](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
7
+ [![msspace](https://img.shields.io/badge/🤖-MS%20Space-blue)](https://modelscope.cn/studios/AI-ModelScope/E2-F5-TTS)
8
+ [![lab](https://img.shields.io/badge/🏫-X--LANCE-grey?labelColor=lightgrey)](https://x-lance.sjtu.edu.cn/)
9
+ [![lab](https://img.shields.io/badge/🏫-SII-grey?labelColor=lightgrey)](https://www.sii.edu.cn/)
10
+ [![lab](https://img.shields.io/badge/🏫-PCL-grey?labelColor=lightgrey)](https://www.pcl.ac.cn)
11
+ <!-- <img src="https://github.com/user-attachments/assets/12d7749c-071a-427c-81bf-b87b91def670" alt="Watermark" style="width: 40px; height: auto"> -->
12
+
13
+ **F5-TTS**: Diffusion Transformer with ConvNeXt V2, faster trained and inference.
14
+
15
+ **E2 TTS**: Flat-UNet Transformer, closest reproduction from [paper](https://arxiv.org/abs/2406.18009).
16
+
17
+ **Sway Sampling**: Inference-time flow step sampling strategy, greatly improves performance
18
+
19
+ ### Thanks to all the contributors !
20
+
21
+ ## News
22
+ - **2025/03/12**: 🔥 F5-TTS v1 base model with better training and inference performance. [Few demo](https://swivid.github.io/F5-TTS_updates).
23
+ - **2024/10/08**: F5-TTS & E2 TTS base models on [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS), [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), [🟣 Wisemodel](https://wisemodel.cn/models/SJTU_X-LANCE/F5-TTS_Emilia-ZH-EN).
24
+
25
+ ## Installation
26
+
27
+ ### Create a separate environment if needed
28
+
29
+ ```bash
30
+ # Create a conda env with python_version>=3.10 (you could also use virtualenv)
31
+ conda create -n f5-tts python=3.11
32
+ conda activate f5-tts
33
+ ```
34
+
35
+ ### Install PyTorch with matched device
36
+
37
+ <details>
38
+ <summary>NVIDIA GPU</summary>
39
+
40
+ > ```bash
41
+ > # Install pytorch with your CUDA version, e.g.
42
+ > pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124
43
+ > ```
44
+
45
+ </details>
46
+
47
+ <details>
48
+ <summary>AMD GPU</summary>
49
+
50
+ > ```bash
51
+ > # Install pytorch with your ROCm version (Linux only), e.g.
52
+ > pip install torch==2.5.1+rocm6.2 torchaudio==2.5.1+rocm6.2 --extra-index-url https://download.pytorch.org/whl/rocm6.2
53
+ > ```
54
+
55
+ </details>
56
+
57
+ <details>
58
+ <summary>Intel GPU</summary>
59
+
60
+ > ```bash
61
+ > # Install pytorch with your XPU version, e.g.
62
+ > # Intel® Deep Learning Essentials or Intel® oneAPI Base Toolkit must be installed
63
+ > pip install torch torchaudio --index-url https://download.pytorch.org/whl/test/xpu
64
+ >
65
+ > # Intel GPU support is also available through IPEX (Intel® Extension for PyTorch)
66
+ > # IPEX does not require the Intel® Deep Learning Essentials or Intel® oneAPI Base Toolkit
67
+ > # See: https://pytorch-extension.intel.com/installation?request=platform
68
+ > ```
69
+
70
+ </details>
71
+
72
+ <details>
73
+ <summary>Apple Silicon</summary>
74
+
75
+ > ```bash
76
+ > # Install the stable pytorch, e.g.
77
+ > pip install torch torchaudio
78
+ > ```
79
+
80
+ </details>
81
+
82
+ ### Then you can choose one from below:
83
+
84
+ > ### 1. As a pip package (if just for inference)
85
+ >
86
+ > ```bash
87
+ > pip install f5-tts
88
+ > ```
89
+ >
90
+ > ### 2. Local editable (if also do training, finetuning)
91
+ >
92
+ > ```bash
93
+ > git clone https://github.com/SWivid/F5-TTS.git
94
+ > cd F5-TTS
95
+ > # git submodule update --init --recursive # (optional, if use bigvgan as vocoder)
96
+ > pip install -e .
97
+ > ```
98
+
99
+ ### Docker usage also available
100
+ ```bash
101
+ # Build from Dockerfile
102
+ docker build -t f5tts:v1 .
103
+
104
+ # Run from GitHub Container Registry
105
+ docker container run --rm -it --gpus=all --mount 'type=volume,source=f5-tts,target=/root/.cache/huggingface/hub/' -p 7860:7860 ghcr.io/swivid/f5-tts:main
106
+
107
+ # Quickstart if you want to just run the web interface (not CLI)
108
+ docker container run --rm -it --gpus=all --mount 'type=volume,source=f5-tts,target=/root/.cache/huggingface/hub/' -p 7860:7860 ghcr.io/swivid/f5-tts:main f5-tts_infer-gradio --host 0.0.0.0
109
+ ```
110
+
111
+ ### Runtime
112
+
113
+ Deployment solution with Triton and TensorRT-LLM.
114
+
115
+ #### Benchmark Results
116
+ Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, 16 NFE.
117
+
118
+ | Model | Concurrency | Avg Latency | RTF | Mode |
119
+ |---------------------|----------------|-------------|--------|-----------------|
120
+ | F5-TTS Base (Vocos) | 2 | 253 ms | 0.0394 | Client-Server |
121
+ | F5-TTS Base (Vocos) | 1 (Batch_size) | - | 0.0402 | Offline TRT-LLM |
122
+ | F5-TTS Base (Vocos) | 1 (Batch_size) | - | 0.1467 | Offline Pytorch |
123
+
124
+ See [detailed instructions](src/f5_tts/runtime/triton_trtllm/README.md) for more information.
125
+
126
+
127
+ ## Inference
128
+
129
+ - In order to achieve desired performance, take a moment to read [detailed guidance](src/f5_tts/infer).
130
+ - By properly searching the keywords of problem encountered, [issues](https://github.com/SWivid/F5-TTS/issues?q=is%3Aissue) are very helpful.
131
+
132
+ ### 1. Gradio App
133
+
134
+ Currently supported features:
135
+
136
+ - Basic TTS with Chunk Inference
137
+ - Multi-Style / Multi-Speaker Generation
138
+ - Voice Chat powered by Qwen2.5-3B-Instruct
139
+ - [Custom inference with more language support](src/f5_tts/infer/SHARED.md)
140
+
141
+ ```bash
142
+ # Launch a Gradio app (web interface)
143
+ f5-tts_infer-gradio
144
+
145
+ # Specify the port/host
146
+ f5-tts_infer-gradio --port 7860 --host 0.0.0.0
147
+
148
+ # Launch a share link
149
+ f5-tts_infer-gradio --share
150
+ ```
151
+
152
+ <details>
153
+ <summary>NVIDIA device docker compose file example</summary>
154
+
155
+ ```yaml
156
+ services:
157
+ f5-tts:
158
+ image: ghcr.io/swivid/f5-tts:main
159
+ ports:
160
+ - "7860:7860"
161
+ environment:
162
+ GRADIO_SERVER_PORT: 7860
163
+ entrypoint: ["f5-tts_infer-gradio", "--port", "7860", "--host", "0.0.0.0"]
164
+ deploy:
165
+ resources:
166
+ reservations:
167
+ devices:
168
+ - driver: nvidia
169
+ count: 1
170
+ capabilities: [gpu]
171
+
172
+ volumes:
173
+ f5-tts:
174
+ driver: local
175
+ ```
176
+
177
+ </details>
178
+
179
+ ### 2. CLI Inference
180
+
181
+ ```bash
182
+ # Run with flags
183
+ # Leave --ref_text "" will have ASR model transcribe (extra GPU memory usage)
184
+ f5-tts_infer-cli --model F5TTS_v1_Base \
185
+ --ref_audio "provide_prompt_wav_path_here.wav" \
186
+ --ref_text "The content, subtitle or transcription of reference audio." \
187
+ --gen_text "Some text you want TTS model generate for you."
188
+
189
+ # Run with default setting. src/f5_tts/infer/examples/basic/basic.toml
190
+ f5-tts_infer-cli
191
+ # Or with your own .toml file
192
+ f5-tts_infer-cli -c custom.toml
193
+
194
+ # Multi voice. See src/f5_tts/infer/README.md
195
+ f5-tts_infer-cli -c src/f5_tts/infer/examples/multi/story.toml
196
+ ```
197
+
198
+
199
+ ## Training
200
+
201
+ ### 1. With Hugging Face Accelerate
202
+
203
+ Refer to [training & finetuning guidance](src/f5_tts/train) for best practice.
204
+
205
+ ### 2. With Gradio App
206
+
207
+ ```bash
208
+ # Quick start with Gradio web interface
209
+ f5-tts_finetune-gradio
210
+ ```
211
+
212
+ Read [training & finetuning guidance](src/f5_tts/train) for more instructions.
213
+
214
+
215
+ ## [Evaluation](src/f5_tts/eval)
216
+
217
+
218
+ ## Development
219
+
220
+ Use pre-commit to ensure code quality (will run linters and formatters automatically):
221
+
222
+ ```bash
223
+ pip install pre-commit
224
+ pre-commit install
225
+ ```
226
+
227
+ When making a pull request, before each commit, run:
228
+
229
+ ```bash
230
+ pre-commit run --all-files
231
+ ```
232
+
233
+ Note: Some model components have linting exceptions for E722 to accommodate tensor notation.
234
+
235
+
236
+ ## Acknowledgements
237
+
238
+ - [E2-TTS](https://arxiv.org/abs/2406.18009) brilliant work, simple and effective
239
+ - [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763), [LibriTTS](https://arxiv.org/abs/1904.02882), [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) valuable datasets
240
+ - [lucidrains](https://github.com/lucidrains) initial CFM structure with also [bfs18](https://github.com/bfs18) for discussion
241
+ - [SD3](https://arxiv.org/abs/2403.03206) & [Hugging Face diffusers](https://github.com/huggingface/diffusers) DiT and MMDiT code structure
242
+ - [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) and [BigVGAN](https://github.com/NVIDIA/BigVGAN) as vocoder
243
+ - [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech), [SpeechMOS](https://github.com/tarepan/SpeechMOS) for evaluation tools
244
+ - [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) for speech edit test
245
+ - [mrfakename](https://x.com/realmrfakename) huggingface space demo ~
246
+ - [f5-tts-mlx](https://github.com/lucasnewman/f5-tts-mlx/tree/main) Implementation with MLX framework by [Lucas Newman](https://github.com/lucasnewman)
247
+ - [F5-TTS-ONNX](https://github.com/DakeQQ/F5-TTS-ONNX) ONNX Runtime version by [DakeQQ](https://github.com/DakeQQ)
248
+ - [Yuekai Zhang](https://github.com/yuekaizhang) Triton and TensorRT-LLM support ~
249
+
250
+ ## Citation
251
+ If our work and codebase is useful for you, please cite as:
252
+ ```
253
+ @article{chen-etal-2024-f5tts,
254
+ title={F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching},
255
+ author={Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen},
256
+ journal={arXiv preprint arXiv:2410.06885},
257
+ year={2024},
258
+ }
259
+ ```
260
+ ## License
261
+
262
+ Our code is released under the MIT License. The pre-trained models are licensed under the CC-BY-NC license due to the training data Emilia, which is an in-the-wild dataset. Sorry for any inconvenience this may cause.
F5-TTS/pyproject.toml ADDED
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "f5-tts"
7
+ version = "1.1.9"
8
+ description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
9
+ readme = "README.md"
10
+ license = {text = "MIT License"}
11
+ classifiers = [
12
+ "License :: OSI Approved :: MIT License",
13
+ "Operating System :: OS Independent",
14
+ "Programming Language :: Python :: 3",
15
+ ]
16
+ dependencies = [
17
+ "accelerate>=0.33.0",
18
+ "bitsandbytes>0.37.0; platform_machine!='arm64' and platform_system!='Darwin'",
19
+ "cached_path",
20
+ "click",
21
+ "datasets",
22
+ "ema_pytorch>=0.5.2",
23
+ "gradio>=5.0.0",
24
+ "hydra-core>=1.3.0",
25
+ "jieba",
26
+ "librosa",
27
+ "matplotlib",
28
+ "numpy<=1.26.4; python_version<='3.10'",
29
+ "pydantic<=2.10.6",
30
+ "pydub",
31
+ "pypinyin",
32
+ "safetensors",
33
+ "soundfile",
34
+ "tomli",
35
+ "torch>=2.0.0",
36
+ "torchaudio>=2.0.0",
37
+ "torchdiffeq",
38
+ "tqdm>=4.65.0",
39
+ "transformers",
40
+ "transformers_stream_generator",
41
+ "unidecode",
42
+ "vocos",
43
+ "wandb",
44
+ "x_transformers>=1.31.14",
45
+ ]
46
+
47
+ [project.optional-dependencies]
48
+ eval = [
49
+ "faster_whisper==0.10.1",
50
+ "funasr",
51
+ "jiwer",
52
+ "modelscope",
53
+ "zhconv",
54
+ "zhon",
55
+ ]
56
+
57
+ [project.urls]
58
+ Homepage = "https://github.com/SWivid/F5-TTS"
59
+
60
+ [project.scripts]
61
+ "f5-tts_infer-cli" = "f5_tts.infer.infer_cli:main"
62
+ "f5-tts_infer-gradio" = "f5_tts.infer.infer_gradio:main"
63
+ "f5-tts_finetune-cli" = "f5_tts.train.finetune_cli:main"
64
+ "f5-tts_finetune-gradio" = "f5_tts.train.finetune_gradio:main"
F5-TTS/ruff.toml ADDED
@@ -0,0 +1,10 @@
1
+ line-length = 120
2
+ target-version = "py310"
3
+
4
+ [lint]
5
+ # Only ignore variables with names starting with "_".
6
+ dummy-variable-rgx = "^_.*$"
7
+
8
+ [lint.isort]
9
+ force-single-line = false
10
+ lines-after-imports = 2
F5-TTS/src/f5_tts/api.py ADDED
@@ -0,0 +1,164 @@
1
+ import random
2
+ import sys
3
+ from importlib.resources import files
4
+
5
+ import soundfile as sf
6
+ import tqdm
7
+ from cached_path import cached_path
8
+ from hydra.utils import get_class
9
+ from omegaconf import OmegaConf
10
+
11
+ from f5_tts.infer.utils_infer import (
12
+ infer_process,
13
+ load_model,
14
+ load_vocoder,
15
+ preprocess_ref_audio_text,
16
+ remove_silence_for_generated_wav,
17
+ save_spectrogram,
18
+ transcribe,
19
+ )
20
+ from f5_tts.model.utils import seed_everything
21
+
22
+
23
+ class F5TTS:
24
+ def __init__(
25
+ self,
26
+ model="F5TTS_v1_Base",
27
+ ckpt_file="",
28
+ vocab_file="",
29
+ ode_method="euler",
30
+ use_ema=True,
31
+ vocoder_local_path=None,
32
+ device=None,
33
+ hf_cache_dir=None,
34
+ ):
35
+ model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{model}.yaml")))
36
+ model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
37
+ model_arc = model_cfg.model.arch
38
+
39
+ self.mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
40
+ self.target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
41
+
42
+ self.ode_method = ode_method
43
+ self.use_ema = use_ema
44
+
45
+ if device is not None:
46
+ self.device = device
47
+ else:
48
+ import torch
49
+
50
+ self.device = (
51
+ "cuda"
52
+ if torch.cuda.is_available()
53
+ else "xpu"
54
+ if torch.xpu.is_available()
55
+ else "mps"
56
+ if torch.backends.mps.is_available()
57
+ else "cpu"
58
+ )
59
+
60
+ # Load models
61
+ self.vocoder = load_vocoder(
62
+ self.mel_spec_type, vocoder_local_path is not None, vocoder_local_path, self.device, hf_cache_dir
63
+ )
64
+
65
+ repo_name, ckpt_step, ckpt_type = "F5-TTS", 1250000, "safetensors"
66
+
67
+ # override for previous models
68
+ if model == "F5TTS_Base":
69
+ if self.mel_spec_type == "vocos":
70
+ ckpt_step = 1200000
71
+ elif self.mel_spec_type == "bigvgan":
72
+ model = "F5TTS_Base_bigvgan"
73
+ ckpt_type = "pt"
74
+ elif model == "E2TTS_Base":
75
+ repo_name = "E2-TTS"
76
+ ckpt_step = 1200000
77
+
78
+ if not ckpt_file:
79
+ ckpt_file = str(
80
+ cached_path(f"hf://SWivid/{repo_name}/{model}/model_{ckpt_step}.{ckpt_type}", cache_dir=hf_cache_dir)
81
+ )
82
+ self.ema_model = load_model(
83
+ model_cls, model_arc, ckpt_file, self.mel_spec_type, vocab_file, self.ode_method, self.use_ema, self.device
84
+ )
85
+
86
+ def transcribe(self, ref_audio, language=None):
87
+ return transcribe(ref_audio, language)
88
+
89
+ def export_wav(self, wav, file_wave, remove_silence=False):
90
+ sf.write(file_wave, wav, self.target_sample_rate)
91
+
92
+ if remove_silence:
93
+ remove_silence_for_generated_wav(file_wave)
94
+
95
+ def export_spectrogram(self, spec, file_spec):
96
+ save_spectrogram(spec, file_spec)
97
+
98
+ def infer(
99
+ self,
100
+ ref_file,
101
+ ref_text,
102
+ gen_text,
103
+ show_info=print,
104
+ progress=tqdm,
105
+ target_rms=0.1,
106
+ cross_fade_duration=0.15,
107
+ sway_sampling_coef=-1,
108
+ cfg_strength=2,
109
+ nfe_step=32,
110
+ speed=1.0,
111
+ fix_duration=None,
112
+ remove_silence=False,
113
+ file_wave=None,
114
+ file_spec=None,
115
+ seed=None,
116
+ ):
117
+ if seed is None:
118
+ seed = random.randint(0, sys.maxsize)
119
+ seed_everything(seed)
120
+ self.seed = seed
121
+
122
+ ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text)
123
+
124
+ wav, sr, spec = infer_process(
125
+ ref_file,
126
+ ref_text,
127
+ gen_text,
128
+ self.ema_model,
129
+ self.vocoder,
130
+ self.mel_spec_type,
131
+ show_info=show_info,
132
+ progress=progress,
133
+ target_rms=target_rms,
134
+ cross_fade_duration=cross_fade_duration,
135
+ nfe_step=nfe_step,
136
+ cfg_strength=cfg_strength,
137
+ sway_sampling_coef=sway_sampling_coef,
138
+ speed=speed,
139
+ fix_duration=fix_duration,
140
+ device=self.device,
141
+ )
142
+
143
+ if file_wave is not None:
144
+ self.export_wav(wav, file_wave, remove_silence)
145
+
146
+ if file_spec is not None:
147
+ self.export_spectrogram(spec, file_spec)
148
+
149
+ return wav, sr, spec
150
+
151
+
152
+ if __name__ == "__main__":
153
+ f5tts = F5TTS()
154
+
155
+ wav, sr, spec = f5tts.infer(
156
+ ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
157
+ ref_text="Some call me nature, others call me mother nature.",
158
+ gen_text="I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring.",
159
+ file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
160
+ file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
161
+ seed=None,
162
+ )
163
+
164
+ print("seed :", f5tts.seed)
F5-TTS/src/f5_tts/configs/E2TTS_Base.yaml ADDED
@@ -0,0 +1,49 @@
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch when using frame-wise batch_size; we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: E2TTS_Base
22
+ tokenizer: pinyin
23
+ tokenizer_path: null # if 'custom' tokenizer, define the path you want to use (should be vocab.txt)
24
+ backbone: UNetT
25
+ arch:
26
+ dim: 1024
27
+ depth: 24
28
+ heads: 16
29
+ ff_mult: 4
30
+ text_mask_padding: False
31
+ pe_attn_head: 1
32
+ mel_spec:
33
+ target_sample_rate: 24000
34
+ n_mel_channels: 100
35
+ hop_length: 256
36
+ win_length: 1024
37
+ n_fft: 1024
38
+ mel_spec_type: vocos # vocos | bigvgan
39
+ vocoder:
40
+ is_local: False # use local offline ckpt or not
41
+ local_path: null # local vocoder path
42
+
43
+ ckpts:
44
+ logger: wandb # wandb | tensorboard | null
45
+ log_samples: True # infer a random sample per saved checkpoint. wip; it is normal for this to fail with extra-long samples
46
+ save_per_updates: 50000 # save checkpoint per updates
47
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
48
+ last_per_updates: 5000 # save last checkpoint per updates
49
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
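For reference, the frame-based batch budget above can be translated into seconds of audio using the mel settings from the same file; a small sanity-check sketch (not part of the training code):

```python
# sanity check of the frame-based batch budget in the YAML above
target_sample_rate = 24000   # model.mel_spec.target_sample_rate
hop_length = 256             # model.mel_spec.hop_length
batch_size_per_gpu = 38400   # datasets.batch_size_per_gpu, counted in mel frames

frames_per_second = target_sample_rate / hop_length        # 93.75 mel frames per second
seconds_per_gpu = batch_size_per_gpu / frames_per_second   # ~409.6 s of audio per GPU per batch
print(f"{frames_per_second} frames/s, ~{seconds_per_gpu:.1f} s of audio per GPU batch")
```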
F5-TTS/src/f5_tts/configs/E2TTS_Small.yaml ADDED
@@ -0,0 +1,49 @@
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch when using frame-wise batch_size; we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0
18
+ bnb_optimizer: False
19
+
20
+ model:
21
+ name: E2TTS_Small
22
+ tokenizer: pinyin
23
+ tokenizer_path: null # if 'custom' tokenizer, define the path you want to use (should be vocab.txt)
24
+ backbone: UNetT
25
+ arch:
26
+ dim: 768
27
+ depth: 20
28
+ heads: 12
29
+ ff_mult: 4
30
+ text_mask_padding: False
31
+ pe_attn_head: 1
32
+ mel_spec:
33
+ target_sample_rate: 24000
34
+ n_mel_channels: 100
35
+ hop_length: 256
36
+ win_length: 1024
37
+ n_fft: 1024
38
+ mel_spec_type: vocos # vocos | bigvgan
39
+ vocoder:
40
+ is_local: False # use local offline ckpt or not
41
+ local_path: null # local vocoder path
42
+
43
+ ckpts:
44
+ logger: wandb # wandb | tensorboard | null
45
+ log_samples: True # infer a random sample per saved checkpoint. wip; it is normal for this to fail with extra-long samples
46
+ save_per_updates: 50000 # save checkpoint per updates
47
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
48
+ last_per_updates: 5000 # save last checkpoint per updates
49
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/F5TTS_Base.yaml ADDED
@@ -0,0 +1,54 @@
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_Base # model name
22
+ tokenizer: pinyin # tokenizer type
23
+ tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
24
+ backbone: DiT
25
+ arch:
26
+ dim: 1024
27
+ depth: 22
28
+ heads: 16
29
+ ff_mult: 2
30
+ text_dim: 512
31
+ text_mask_padding: False
32
+ conv_layers: 4
33
+ pe_attn_head: 1
34
+ attn_backend: torch # torch | flash_attn
35
+ attn_mask_enabled: False
36
+ checkpoint_activations: False # recompute activations and save memory for extra compute
37
+ mel_spec:
38
+ target_sample_rate: 24000
39
+ n_mel_channels: 100
40
+ hop_length: 256
41
+ win_length: 1024
42
+ n_fft: 1024
43
+ mel_spec_type: vocos # vocos | bigvgan
44
+ vocoder:
45
+ is_local: False # use local offline ckpt or not
46
+ local_path: null # local vocoder path
47
+
48
+ ckpts:
49
+ logger: wandb # wandb | tensorboard | null
50
+ log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
51
+ save_per_updates: 50000 # save checkpoint per updates
52
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
53
+ last_per_updates: 5000 # save last checkpoint per updates
54
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/F5TTS_Small.yaml ADDED
@@ -0,0 +1,54 @@
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch when using frame-wise batch_size; we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11 # only suitable for Emilia; to train on LibriTTS, set epochs to 686
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_Small
22
+ tokenizer: pinyin
23
+ tokenizer_path: null # if 'custom' tokenizer, define the path you want to use (should be vocab.txt)
24
+ backbone: DiT
25
+ arch:
26
+ dim: 768
27
+ depth: 18
28
+ heads: 12
29
+ ff_mult: 2
30
+ text_dim: 512
31
+ text_mask_padding: False
32
+ conv_layers: 4
33
+ pe_attn_head: 1
34
+ attn_backend: torch # torch | flash_attn
35
+ attn_mask_enabled: False
36
+ checkpoint_activations: False # recompute activations and save memory for extra compute
37
+ mel_spec:
38
+ target_sample_rate: 24000
39
+ n_mel_channels: 100
40
+ hop_length: 256
41
+ win_length: 1024
42
+ n_fft: 1024
43
+ mel_spec_type: vocos # vocos | bigvgan
44
+ vocoder:
45
+ is_local: False # use local offline ckpt or not
46
+ local_path: null # local vocoder path
47
+
48
+ ckpts:
49
+ logger: wandb # wandb | tensorboard | null
50
+ log_samples: True # infer a random sample per saved checkpoint. wip; it is normal for this to fail with extra-long samples
51
+ save_per_updates: 50000 # save checkpoint per updates
52
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
53
+ last_per_updates: 5000 # save last checkpoint per updates
54
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/configs/F5TTS_v1_Base.yaml ADDED
@@ -0,0 +1,55 @@
1
+ hydra:
2
+ run:
3
+ dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
4
+
5
+ datasets:
6
+ name: Emilia_ZH_EN # dataset name
7
+ batch_size_per_gpu: 38400 # 8 GPUs, 8 * 38400 = 307200
8
+ batch_size_type: frame # frame | sample
9
+ max_samples: 64 # max sequences per batch when using frame-wise batch_size; we set 32 for small models, 64 for base models
10
+ num_workers: 16
11
+
12
+ optim:
13
+ epochs: 11
14
+ learning_rate: 7.5e-5
15
+ num_warmup_updates: 20000 # warmup updates
16
+ grad_accumulation_steps: 1 # note: updates = steps / grad_accumulation_steps
17
+ max_grad_norm: 1.0 # gradient clipping
18
+ bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
19
+
20
+ model:
21
+ name: F5TTS_v1_Base # model name
22
+ tokenizer: pinyin # tokenizer type
23
+ tokenizer_path: null # if 'custom' tokenizer, define the path you want to use (should be vocab.txt)
24
+ backbone: DiT
25
+ arch:
26
+ dim: 1024
27
+ depth: 22
28
+ heads: 16
29
+ ff_mult: 2
30
+ text_dim: 512
31
+ text_mask_padding: True
32
+ qk_norm: null # null | rms_norm
33
+ conv_layers: 4
34
+ pe_attn_head: null
35
+ attn_backend: torch # torch | flash_attn
36
+ attn_mask_enabled: False
37
+ checkpoint_activations: False # recompute activations and save memory for extra compute
38
+ mel_spec:
39
+ target_sample_rate: 24000
40
+ n_mel_channels: 100
41
+ hop_length: 256
42
+ win_length: 1024
43
+ n_fft: 1024
44
+ mel_spec_type: vocos # vocos | bigvgan
45
+ vocoder:
46
+ is_local: False # use local offline ckpt or not
47
+ local_path: null # local vocoder path
48
+
49
+ ckpts:
50
+ logger: wandb # wandb | tensorboard | null
51
+ log_samples: True # infer a random sample per saved checkpoint. wip; it is normal for this to fail with extra-long samples
52
+ save_per_updates: 50000 # save checkpoint per updates
53
+ keep_last_n_checkpoints: -1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
54
+ last_per_updates: 5000 # save last checkpoint per updates
55
+ save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
F5-TTS/src/f5_tts/eval/README.md ADDED
@@ -0,0 +1,52 @@
1
+
2
+ # Evaluation
3
+
4
+ Install packages for evaluation:
5
+
6
+ ```bash
7
+ pip install -e .[eval]
8
+ ```
9
+
10
+ ## Generating Samples for Evaluation
11
+
12
+ ### Prepare Test Datasets
13
+
14
+ 1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
15
+ 2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
16
+ 3. Unzip the downloaded datasets and place them in the `data/` directory.
17
+ 4. Update the path for *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`
18
+ 5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`
19
+
20
+ ### Batch Inference for Test Set
21
+
22
+ To run batch inference for evaluations, execute the following commands:
23
+
24
+ ```bash
25
+ # batch inference for evaluations
26
+ accelerate config # if not set before
27
+ bash src/f5_tts/eval/eval_infer_batch.sh
28
+ ```
29
+
30
+ ## Objective Evaluation on Generated Results
31
+
32
+ ### Download Evaluation Model Checkpoints
33
+
34
+ 1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
35
+ 2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
36
+ 3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
37
+
38
+ Then update the paths in the following scripts to point to where you placed the evaluation model checkpoints, as sketched below.
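For reference, the evaluation scripts keep these paths in plain variables; the lines to edit look like the following (these are the defaults used when `--local` is passed):

```python
# in src/f5_tts/eval/eval_librispeech_test_clean.py and eval_seedtts_testset.py
asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"       # used when --local is set
wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"  # WavLM checkpoint for SIM
```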
39
+
40
+ ### Objective Evaluation
41
+
42
+ Update the path to your batch-inference results, then carry out the WER / SIM / UTMOS evaluations:
43
+ ```bash
44
+ # Evaluation [WER] for Seed-TTS test [ZH] set
45
+ python src/f5_tts/eval/eval_seedtts_testset.py --eval_task wer --lang zh --gen_wav_dir <GEN_WAV_DIR> --gpu_nums 8
46
+
47
+ # Evaluation [SIM] for LibriSpeech-PC test-clean (cross-sentence)
48
+ python src/f5_tts/eval/eval_librispeech_test_clean.py --eval_task sim --gen_wav_dir <GEN_WAV_DIR> --librispeech_test_clean_path <TEST_CLEAN_PATH>
49
+
50
+ # Evaluation [UTMOS]. --ext: Audio extension
51
+ python src/f5_tts/eval/eval_utmos.py --audio_dir <WAV_DIR> --ext wav
52
+ ```
F5-TTS/src/f5_tts/eval/ecapa_tdnn.py ADDED
@@ -0,0 +1,331 @@
1
+ # just for speaker similarity evaluation, third-party code
2
+
3
+ # From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
4
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
5
+
6
+ import os
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ """ Res2Conv1d + BatchNorm1d + ReLU
14
+ """
15
+
16
+
17
+ class Res2Conv1dReluBn(nn.Module):
18
+ """
19
+ in_channels == out_channels == channels
20
+ """
21
+
22
+ def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
23
+ super().__init__()
24
+ assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
25
+ self.scale = scale
26
+ self.width = channels // scale
27
+ self.nums = scale if scale == 1 else scale - 1
28
+
29
+ self.convs = []
30
+ self.bns = []
31
+ for i in range(self.nums):
32
+ self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
33
+ self.bns.append(nn.BatchNorm1d(self.width))
34
+ self.convs = nn.ModuleList(self.convs)
35
+ self.bns = nn.ModuleList(self.bns)
36
+
37
+ def forward(self, x):
38
+ out = []
39
+ spx = torch.split(x, self.width, 1)
40
+ for i in range(self.nums):
41
+ if i == 0:
42
+ sp = spx[i]
43
+ else:
44
+ sp = sp + spx[i]
45
+ # Order: conv -> relu -> bn
46
+ sp = self.convs[i](sp)
47
+ sp = self.bns[i](F.relu(sp))
48
+ out.append(sp)
49
+ if self.scale != 1:
50
+ out.append(spx[self.nums])
51
+ out = torch.cat(out, dim=1)
52
+
53
+ return out
54
+
55
+
56
+ """ Conv1d + BatchNorm1d + ReLU
57
+ """
58
+
59
+
60
+ class Conv1dReluBn(nn.Module):
61
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
62
+ super().__init__()
63
+ self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
64
+ self.bn = nn.BatchNorm1d(out_channels)
65
+
66
+ def forward(self, x):
67
+ return self.bn(F.relu(self.conv(x)))
68
+
69
+
70
+ """ The SE connection of 1D case.
71
+ """
72
+
73
+
74
+ class SE_Connect(nn.Module):
75
+ def __init__(self, channels, se_bottleneck_dim=128):
76
+ super().__init__()
77
+ self.linear1 = nn.Linear(channels, se_bottleneck_dim)
78
+ self.linear2 = nn.Linear(se_bottleneck_dim, channels)
79
+
80
+ def forward(self, x):
81
+ out = x.mean(dim=2)
82
+ out = F.relu(self.linear1(out))
83
+ out = torch.sigmoid(self.linear2(out))
84
+ out = x * out.unsqueeze(2)
85
+
86
+ return out
87
+
88
+
89
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
90
+ """
91
+
92
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
93
+ # return nn.Sequential(
94
+ # Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
95
+ # Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
96
+ # Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
97
+ # SE_Connect(channels)
98
+ # )
99
+
100
+
101
+ class SE_Res2Block(nn.Module):
102
+ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
103
+ super().__init__()
104
+ self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
105
+ self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
106
+ self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
107
+ self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
108
+
109
+ self.shortcut = None
110
+ if in_channels != out_channels:
111
+ self.shortcut = nn.Conv1d(
112
+ in_channels=in_channels,
113
+ out_channels=out_channels,
114
+ kernel_size=1,
115
+ )
116
+
117
+ def forward(self, x):
118
+ residual = x
119
+ if self.shortcut:
120
+ residual = self.shortcut(x)
121
+
122
+ x = self.Conv1dReluBn1(x)
123
+ x = self.Res2Conv1dReluBn(x)
124
+ x = self.Conv1dReluBn2(x)
125
+ x = self.SE_Connect(x)
126
+
127
+ return x + residual
128
+
129
+
130
+ """ Attentive weighted mean and standard deviation pooling.
131
+ """
132
+
133
+
134
+ class AttentiveStatsPool(nn.Module):
135
+ def __init__(self, in_dim, attention_channels=128, global_context_att=False):
136
+ super().__init__()
137
+ self.global_context_att = global_context_att
138
+
139
+ # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
140
+ if global_context_att:
141
+ self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1) # equals W and b in the paper
142
+ else:
143
+ self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1) # equals W and b in the paper
144
+ self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1) # equals V and k in the paper
145
+
146
+ def forward(self, x):
147
+ if self.global_context_att:
148
+ context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
149
+ context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
150
+ x_in = torch.cat((x, context_mean, context_std), dim=1)
151
+ else:
152
+ x_in = x
153
+
154
+ # DON'T use ReLU here! In experiments, I find ReLU hard to converge.
155
+ alpha = torch.tanh(self.linear1(x_in))
156
+ # alpha = F.relu(self.linear1(x_in))
157
+ alpha = torch.softmax(self.linear2(alpha), dim=2)
158
+ mean = torch.sum(alpha * x, dim=2)
159
+ residuals = torch.sum(alpha * (x**2), dim=2) - mean**2
160
+ std = torch.sqrt(residuals.clamp(min=1e-9))
161
+ return torch.cat([mean, std], dim=1)
162
+
163
+
164
+ class ECAPA_TDNN(nn.Module):
165
+ def __init__(
166
+ self,
167
+ feat_dim=80,
168
+ channels=512,
169
+ emb_dim=192,
170
+ global_context_att=False,
171
+ feat_type="wavlm_large",
172
+ sr=16000,
173
+ feature_selection="hidden_states",
174
+ update_extract=False,
175
+ config_path=None,
176
+ ):
177
+ super().__init__()
178
+
179
+ self.feat_type = feat_type
180
+ self.feature_selection = feature_selection
181
+ self.update_extract = update_extract
182
+ self.sr = sr
183
+
184
+ torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
185
+ try:
186
+ local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
187
+ self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
188
+ except: # noqa: E722
189
+ self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)
190
+
191
+ if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
192
+ self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"
193
+ ):
194
+ self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
195
+ if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
196
+ self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"
197
+ ):
198
+ self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
199
+
200
+ self.feat_num = self.get_feat_num()
201
+ self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
202
+
203
+ if feat_type != "fbank" and feat_type != "mfcc":
204
+ freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
205
+ for name, param in self.feature_extract.named_parameters():
206
+ for freeze_val in freeze_list:
207
+ if freeze_val in name:
208
+ param.requires_grad = False
209
+ break
210
+
211
+ if not self.update_extract:
212
+ for param in self.feature_extract.parameters():
213
+ param.requires_grad = False
214
+
215
+ self.instance_norm = nn.InstanceNorm1d(feat_dim)
216
+ # self.channels = [channels] * 4 + [channels * 3]
217
+ self.channels = [channels] * 4 + [1536]
218
+
219
+ self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
220
+ self.layer2 = SE_Res2Block(
221
+ self.channels[0],
222
+ self.channels[1],
223
+ kernel_size=3,
224
+ stride=1,
225
+ padding=2,
226
+ dilation=2,
227
+ scale=8,
228
+ se_bottleneck_dim=128,
229
+ )
230
+ self.layer3 = SE_Res2Block(
231
+ self.channels[1],
232
+ self.channels[2],
233
+ kernel_size=3,
234
+ stride=1,
235
+ padding=3,
236
+ dilation=3,
237
+ scale=8,
238
+ se_bottleneck_dim=128,
239
+ )
240
+ self.layer4 = SE_Res2Block(
241
+ self.channels[2],
242
+ self.channels[3],
243
+ kernel_size=3,
244
+ stride=1,
245
+ padding=4,
246
+ dilation=4,
247
+ scale=8,
248
+ se_bottleneck_dim=128,
249
+ )
250
+
251
+ # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
252
+ cat_channels = channels * 3
253
+ self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
254
+ self.pooling = AttentiveStatsPool(
255
+ self.channels[-1], attention_channels=128, global_context_att=global_context_att
256
+ )
257
+ self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
258
+ self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
259
+
260
+ def get_feat_num(self):
261
+ self.feature_extract.eval()
262
+ wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
263
+ with torch.no_grad():
264
+ features = self.feature_extract(wav)
265
+ select_feature = features[self.feature_selection]
266
+ if isinstance(select_feature, (list, tuple)):
267
+ return len(select_feature)
268
+ else:
269
+ return 1
270
+
271
+ def get_feat(self, x):
272
+ if self.update_extract:
273
+ x = self.feature_extract([sample for sample in x])
274
+ else:
275
+ with torch.no_grad():
276
+ if self.feat_type == "fbank" or self.feat_type == "mfcc":
277
+ x = self.feature_extract(x) + 1e-6 # B x feat_dim x time_len
278
+ else:
279
+ x = self.feature_extract([sample for sample in x])
280
+
281
+ if self.feat_type == "fbank":
282
+ x = x.log()
283
+
284
+ if self.feat_type != "fbank" and self.feat_type != "mfcc":
285
+ x = x[self.feature_selection]
286
+ if isinstance(x, (list, tuple)):
287
+ x = torch.stack(x, dim=0)
288
+ else:
289
+ x = x.unsqueeze(0)
290
+ norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
291
+ x = (norm_weights * x).sum(dim=0)
292
+ x = torch.transpose(x, 1, 2) + 1e-6
293
+
294
+ x = self.instance_norm(x)
295
+ return x
296
+
297
+ def forward(self, x):
298
+ x = self.get_feat(x)
299
+
300
+ out1 = self.layer1(x)
301
+ out2 = self.layer2(out1)
302
+ out3 = self.layer3(out2)
303
+ out4 = self.layer4(out3)
304
+
305
+ out = torch.cat([out2, out3, out4], dim=1)
306
+ out = F.relu(self.conv(out))
307
+ out = self.bn(self.pooling(out))
308
+ out = self.linear(out)
309
+
310
+ return out
311
+
312
+
313
+ def ECAPA_TDNN_SMALL(
314
+ feat_dim,
315
+ emb_dim=256,
316
+ feat_type="wavlm_large",
317
+ sr=16000,
318
+ feature_selection="hidden_states",
319
+ update_extract=False,
320
+ config_path=None,
321
+ ):
322
+ return ECAPA_TDNN(
323
+ feat_dim=feat_dim,
324
+ channels=512,
325
+ emb_dim=emb_dim,
326
+ feat_type=feat_type,
327
+ sr=sr,
328
+ feature_selection=feature_selection,
329
+ update_extract=update_extract,
330
+ config_path=config_path,
331
+ )
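A minimal usage sketch for the `ECAPA_TDNN_SMALL` helper above, as used for speaker-similarity scoring. It assumes `s3prl` (and network access for the WavLM weights) is available; `feat_dim=1024` is an assumption matching the WavLM-large hidden size, and the random waveforms are placeholders:

```python
# hedged sketch: embed two 16 kHz waveforms and compare them with cosine similarity
import torch
import torch.nn.functional as F

from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL

model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large")  # feat_dim assumed for WavLM-large
model.eval()

with torch.no_grad():
    emb_a = model(torch.randn(1, 16000))  # placeholder: 1 second of 16 kHz audio
    emb_b = model(torch.randn(1, 16000))

print(f"cosine similarity: {F.cosine_similarity(emb_a, emb_b).item():.3f}")
```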
F5-TTS/src/f5_tts/eval/eval_infer_batch.py ADDED
@@ -0,0 +1,210 @@
1
+ import os
2
+ import sys
3
+
4
+
5
+ sys.path.append(os.getcwd())
6
+
7
+ import argparse
8
+ import time
9
+ from importlib.resources import files
10
+
11
+ import torch
12
+ import torchaudio
13
+ from accelerate import Accelerator
14
+ from hydra.utils import get_class
15
+ from omegaconf import OmegaConf
16
+ from tqdm import tqdm
17
+
18
+ from f5_tts.eval.utils_eval import (
19
+ get_inference_prompt,
20
+ get_librispeech_test_clean_metainfo,
21
+ get_seedtts_testset_metainfo,
22
+ )
23
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
24
+ from f5_tts.model import CFM
25
+ from f5_tts.model.utils import get_tokenizer
26
+
27
+
28
+ accelerator = Accelerator()
29
+ device = f"cuda:{accelerator.process_index}"
30
+
31
+
32
+ use_ema = True
33
+ target_rms = 0.1
34
+
35
+
36
+ rel_path = str(files("f5_tts").joinpath("../../"))
37
+
38
+
39
+ def main():
40
+ parser = argparse.ArgumentParser(description="batch inference")
41
+
42
+ parser.add_argument("-s", "--seed", default=None, type=int)
43
+ parser.add_argument("-n", "--expname", required=True)
44
+ parser.add_argument("-c", "--ckptstep", default=1250000, type=int)
45
+
46
+ parser.add_argument("-nfe", "--nfestep", default=32, type=int)
47
+ parser.add_argument("-o", "--odemethod", default="euler")
48
+ parser.add_argument("-ss", "--swaysampling", default=-1, type=float)
49
+
50
+ parser.add_argument("-t", "--testset", required=True)
51
+
52
+ args = parser.parse_args()
53
+
54
+ seed = args.seed
55
+ exp_name = args.expname
56
+ ckpt_step = args.ckptstep
57
+
58
+ nfe_step = args.nfestep
59
+ ode_method = args.odemethod
60
+ sway_sampling_coef = args.swaysampling
61
+
62
+ testset = args.testset
63
+
64
+ infer_batch_size = 1  # max frames per batch; 1 for DDP single-sample inference (recommended)
65
+ cfg_strength = 2.0
66
+ speed = 1.0
67
+ use_truth_duration = False
68
+ no_ref_audio = False
69
+
70
+ model_cfg = OmegaConf.load(str(files("f5_tts").joinpath(f"configs/{exp_name}.yaml")))
71
+ model_cls = get_class(f"f5_tts.model.{model_cfg.model.backbone}")
72
+ model_arc = model_cfg.model.arch
73
+
74
+ dataset_name = model_cfg.datasets.name
75
+ tokenizer = model_cfg.model.tokenizer
76
+
77
+ mel_spec_type = model_cfg.model.mel_spec.mel_spec_type
78
+ target_sample_rate = model_cfg.model.mel_spec.target_sample_rate
79
+ n_mel_channels = model_cfg.model.mel_spec.n_mel_channels
80
+ hop_length = model_cfg.model.mel_spec.hop_length
81
+ win_length = model_cfg.model.mel_spec.win_length
82
+ n_fft = model_cfg.model.mel_spec.n_fft
83
+
84
+ if testset == "ls_pc_test_clean":
85
+ metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
86
+ librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean" # test-clean path
87
+ metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)
88
+
89
+ elif testset == "seedtts_test_zh":
90
+ metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
91
+ metainfo = get_seedtts_testset_metainfo(metalst)
92
+
93
+ elif testset == "seedtts_test_en":
94
+ metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
95
+ metainfo = get_seedtts_testset_metainfo(metalst)
96
+
97
+ # path to save generated wavs
98
+ output_dir = (
99
+ f"{rel_path}/"
100
+ f"results/{exp_name}_{ckpt_step}/{testset}/"
101
+ f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
102
+ f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
103
+ f"_cfg{cfg_strength}_speed{speed}"
104
+ f"{'_gt-dur' if use_truth_duration else ''}"
105
+ f"{'_no-ref-audio' if no_ref_audio else ''}"
106
+ )
107
+
108
+ # -------------------------------------------------#
109
+
110
+ prompts_all = get_inference_prompt(
111
+ metainfo,
112
+ speed=speed,
113
+ tokenizer=tokenizer,
114
+ target_sample_rate=target_sample_rate,
115
+ n_mel_channels=n_mel_channels,
116
+ hop_length=hop_length,
117
+ mel_spec_type=mel_spec_type,
118
+ target_rms=target_rms,
119
+ use_truth_duration=use_truth_duration,
120
+ infer_batch_size=infer_batch_size,
121
+ )
122
+
123
+ # Vocoder model
124
+ local = False
125
+ if mel_spec_type == "vocos":
126
+ vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
127
+ elif mel_spec_type == "bigvgan":
128
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
129
+ vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)
130
+
131
+ # Tokenizer
132
+ vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
133
+
134
+ # Model
135
+ model = CFM(
136
+ transformer=model_cls(**model_arc, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
137
+ mel_spec_kwargs=dict(
138
+ n_fft=n_fft,
139
+ hop_length=hop_length,
140
+ win_length=win_length,
141
+ n_mel_channels=n_mel_channels,
142
+ target_sample_rate=target_sample_rate,
143
+ mel_spec_type=mel_spec_type,
144
+ ),
145
+ odeint_kwargs=dict(
146
+ method=ode_method,
147
+ ),
148
+ vocab_char_map=vocab_char_map,
149
+ ).to(device)
150
+
151
+ ckpt_prefix = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}"
152
+ if os.path.exists(ckpt_prefix + ".pt"):
153
+ ckpt_path = ckpt_prefix + ".pt"
154
+ elif os.path.exists(ckpt_prefix + ".safetensors"):
155
+ ckpt_path = ckpt_prefix + ".safetensors"
156
+ else:
157
+ print("Loading from self-organized training checkpoints rather than released pretrained.")
158
+ ckpt_path = rel_path + f"/{model_cfg.ckpts.save_dir}/model_{ckpt_step}.pt"
159
+
160
+ dtype = torch.float32 if mel_spec_type == "bigvgan" else None
161
+ model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
162
+
163
+ if not os.path.exists(output_dir) and accelerator.is_main_process:
164
+ os.makedirs(output_dir)
165
+
166
+ # start batch inference
167
+ accelerator.wait_for_everyone()
168
+ start = time.time()
169
+
170
+ with accelerator.split_between_processes(prompts_all) as prompts:
171
+ for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
172
+ utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
173
+ ref_mels = ref_mels.to(device)
174
+ ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
175
+ total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)
176
+
177
+ # Inference
178
+ with torch.inference_mode():
179
+ generated, _ = model.sample(
180
+ cond=ref_mels,
181
+ text=final_text_list,
182
+ duration=total_mel_lens,
183
+ lens=ref_mel_lens,
184
+ steps=nfe_step,
185
+ cfg_strength=cfg_strength,
186
+ sway_sampling_coef=sway_sampling_coef,
187
+ no_ref_audio=no_ref_audio,
188
+ seed=seed,
189
+ )
190
+ # Final result
191
+ for i, gen in enumerate(generated):
192
+ gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
193
+ gen_mel_spec = gen.permute(0, 2, 1).to(torch.float32)
194
+ if mel_spec_type == "vocos":
195
+ generated_wave = vocoder.decode(gen_mel_spec).cpu()
196
+ elif mel_spec_type == "bigvgan":
197
+ generated_wave = vocoder(gen_mel_spec).squeeze(0).cpu()
198
+
199
+ if ref_rms_list[i] < target_rms:
200
+ generated_wave = generated_wave * ref_rms_list[i] / target_rms
201
+ torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave, target_sample_rate)
202
+
203
+ accelerator.wait_for_everyone()
204
+ if accelerator.is_main_process:
205
+ timediff = time.time() - start
206
+ print(f"Done batch inference in {timediff / 60:.2f} minutes.")
207
+
208
+
209
+ if __name__ == "__main__":
210
+ main()
F5-TTS/src/f5_tts/eval/eval_infer_batch.sh ADDED
@@ -0,0 +1,18 @@
1
+ #!/bin/bash
2
+
3
+ # e.g. F5-TTS, 16 NFE
4
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_zh" -nfe 16
5
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "seedtts_test_en" -nfe 16
6
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_v1_Base" -t "ls_pc_test_clean" -nfe 16
7
+
8
+ # e.g. Vanilla E2 TTS, 32 NFE
9
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_zh" -o "midpoint" -ss 0
10
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "seedtts_test_en" -o "midpoint" -ss 0
11
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -c 1200000 -t "ls_pc_test_clean" -o "midpoint" -ss 0
12
+
13
+ # e.g. evaluate F5-TTS 16 NFE result on Seed-TTS test-zh
14
+ python src/f5_tts/eval/eval_seedtts_testset.py -e wer -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
15
+ python src/f5_tts/eval/eval_seedtts_testset.py -e sim -l zh --gen_wav_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0 --gpu_nums 8
16
+ python src/f5_tts/eval/eval_utmos.py --audio_dir results/F5TTS_v1_Base_1250000/seedtts_test_zh/seed0_euler_nfe32_vocos_ss-1_cfg2.0_speed1.0
17
+
18
+ # etc.
F5-TTS/src/f5_tts/eval/eval_librispeech_test_clean.py ADDED
@@ -0,0 +1,89 @@
1
+ # Evaluate with LibriSpeech test-clean, using a ~3s prompt to generate 4-10s audio (following the VALL-E / Voicebox evaluation protocol)
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+
9
+ sys.path.append(os.getcwd())
10
+
11
+ import multiprocessing as mp
12
+ from importlib.resources import files
13
+
14
+ import numpy as np
15
+
16
+ from f5_tts.eval.utils_eval import get_librispeech_test, run_asr_wer, run_sim
17
+
18
+
19
+ rel_path = str(files("f5_tts").joinpath("../../"))
20
+
21
+
22
+ def get_args():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
25
+ parser.add_argument("-l", "--lang", type=str, default="en")
26
+ parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
27
+ parser.add_argument("-p", "--librispeech_test_clean_path", type=str, required=True)
28
+ parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
29
+ parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
30
+ return parser.parse_args()
31
+
32
+
33
+ def main():
34
+ args = get_args()
35
+ eval_task = args.eval_task
36
+ lang = args.lang
37
+ librispeech_test_clean_path = args.librispeech_test_clean_path # test-clean path
38
+ gen_wav_dir = args.gen_wav_dir
39
+ metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
40
+
41
+ gpus = list(range(args.gpu_nums))
42
+ test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
43
+
44
+ ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
45
+ ## leading to a low similarity for the ground truth in some cases.
46
+ # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth = True) # eval ground truth
47
+
48
+ local = args.local
49
+ if local: # use local custom checkpoint dir
50
+ asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
51
+ else:
52
+ asr_ckpt_dir = "" # auto download to cache dir
53
+ wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
54
+
55
+ # --------------------------------------------------------------------------
56
+
57
+ full_results = []
58
+ metrics = []
59
+
60
+ if eval_task == "wer":
61
+ with mp.Pool(processes=len(gpus)) as pool:
62
+ args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
63
+ results = pool.map(run_asr_wer, args)
64
+ for r in results:
65
+ full_results.extend(r)
66
+ elif eval_task == "sim":
67
+ with mp.Pool(processes=len(gpus)) as pool:
68
+ args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
69
+ results = pool.map(run_sim, args)
70
+ for r in results:
71
+ full_results.extend(r)
72
+ else:
73
+ raise ValueError(f"Unknown metric type: {eval_task}")
74
+
75
+ result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
76
+ with open(result_path, "w") as f:
77
+ for line in full_results:
78
+ metrics.append(line[eval_task])
79
+ f.write(json.dumps(line, ensure_ascii=False) + "\n")
80
+ metric = round(np.mean(metrics), 5)
81
+ f.write(f"\n{eval_task.upper()}: {metric}\n")
82
+
83
+ print(f"\nTotal {len(metrics)} samples")
84
+ print(f"{eval_task.upper()}: {metric}")
85
+ print(f"{eval_task.upper()} results saved to {result_path}")
86
+
87
+
88
+ if __name__ == "__main__":
89
+ main()
F5-TTS/src/f5_tts/eval/eval_seedtts_testset.py ADDED
@@ -0,0 +1,88 @@
1
+ # Evaluate with Seed-TTS testset
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+
9
+ sys.path.append(os.getcwd())
10
+
11
+ import multiprocessing as mp
12
+ from importlib.resources import files
13
+
14
+ import numpy as np
15
+
16
+ from f5_tts.eval.utils_eval import get_seed_tts_test, run_asr_wer, run_sim
17
+
18
+
19
+ rel_path = str(files("f5_tts").joinpath("../../"))
20
+
21
+
22
+ def get_args():
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument("-e", "--eval_task", type=str, default="wer", choices=["sim", "wer"])
25
+ parser.add_argument("-l", "--lang", type=str, default="en", choices=["zh", "en"])
26
+ parser.add_argument("-g", "--gen_wav_dir", type=str, required=True)
27
+ parser.add_argument("-n", "--gpu_nums", type=int, default=8, help="Number of GPUs to use")
28
+ parser.add_argument("--local", action="store_true", help="Use local custom checkpoint directory")
29
+ return parser.parse_args()
30
+
31
+
32
+ def main():
33
+ args = get_args()
34
+ eval_task = args.eval_task
35
+ lang = args.lang
36
+ gen_wav_dir = args.gen_wav_dir
37
+ metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst" # seed-tts testset
38
+
39
+ # NOTE: the paraformer-zh result will differ slightly depending on the number of GPUs, because the batch size differs
40
+ # e.g. a zh WER of 1.254 seems to be the result of a 4-worker wer_seed_tts run
41
+ gpus = list(range(args.gpu_nums))
42
+ test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
43
+
44
+ local = args.local
45
+ if local: # use local custom checkpoint dir
46
+ if lang == "zh":
47
+ asr_ckpt_dir = "../checkpoints/funasr" # paraformer-zh dir under funasr
48
+ elif lang == "en":
49
+ asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
50
+ else:
51
+ asr_ckpt_dir = "" # auto download to cache dir
52
+ wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
53
+
54
+ # --------------------------------------------------------------------------
55
+
56
+ full_results = []
57
+ metrics = []
58
+
59
+ if eval_task == "wer":
60
+ with mp.Pool(processes=len(gpus)) as pool:
61
+ args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
62
+ results = pool.map(run_asr_wer, args)
63
+ for r in results:
64
+ full_results.extend(r)
65
+ elif eval_task == "sim":
66
+ with mp.Pool(processes=len(gpus)) as pool:
67
+ args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
68
+ results = pool.map(run_sim, args)
69
+ for r in results:
70
+ full_results.extend(r)
71
+ else:
72
+ raise ValueError(f"Unknown metric type: {eval_task}")
73
+
74
+ result_path = f"{gen_wav_dir}/_{eval_task}_results.jsonl"
75
+ with open(result_path, "w") as f:
76
+ for line in full_results:
77
+ metrics.append(line[eval_task])
78
+ f.write(json.dumps(line, ensure_ascii=False) + "\n")
79
+ metric = round(np.mean(metrics), 5)
80
+ f.write(f"\n{eval_task.upper()}: {metric}\n")
81
+
82
+ print(f"\nTotal {len(metrics)} samples")
83
+ print(f"{eval_task.upper()}: {metric}")
84
+ print(f"{eval_task.upper()} results saved to {result_path}")
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()
F5-TTS/src/f5_tts/eval/eval_utmos.py ADDED
@@ -0,0 +1,42 @@
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import librosa
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+
10
+ def main():
11
+ parser = argparse.ArgumentParser(description="UTMOS Evaluation")
12
+ parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
13
+ parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
14
+ args = parser.parse_args()
15
+
16
+ device = "cuda" if torch.cuda.is_available() else "xpu" if torch.xpu.is_available() else "cpu"
17
+
18
+ predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
19
+ predictor = predictor.to(device)
20
+
21
+ audio_paths = list(Path(args.audio_dir).rglob(f"*.{args.ext}"))
22
+ utmos_score = 0
23
+
24
+ utmos_result_path = Path(args.audio_dir) / "_utmos_results.jsonl"
25
+ with open(utmos_result_path, "w", encoding="utf-8") as f:
26
+ for audio_path in tqdm(audio_paths, desc="Processing"):
27
+ wav, sr = librosa.load(audio_path, sr=None, mono=True)
28
+ wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
29
+ score = predictor(wav_tensor, sr)
30
+ line = {}
31
+ line["wav"], line["utmos"] = str(audio_path.stem), score.item()
32
+ utmos_score += score.item()
33
+ f.write(json.dumps(line, ensure_ascii=False) + "\n")
34
+ avg_score = utmos_score / len(audio_paths) if len(audio_paths) > 0 else 0
35
+ f.write(f"\nUTMOS: {avg_score:.4f}\n")
36
+
37
+ print(f"UTMOS: {avg_score:.4f}")
38
+ print(f"UTMOS results saved to {utmos_result_path}")
39
+
40
+
41
+ if __name__ == "__main__":
42
+ main()
F5-TTS/src/f5_tts/eval/utils_eval.py ADDED
@@ -0,0 +1,419 @@
1
+ import math
2
+ import os
3
+ import random
4
+ import string
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torchaudio
10
+ from tqdm import tqdm
11
+
12
+ from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
13
+ from f5_tts.model.modules import MelSpec
14
+ from f5_tts.model.utils import convert_char_to_pinyin
15
+
16
+
17
+ # seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
18
+ def get_seedtts_testset_metainfo(metalst):
19
+ f = open(metalst)
20
+ lines = f.readlines()
21
+ f.close()
22
+ metainfo = []
23
+ for line in lines:
24
+ if len(line.strip().split("|")) == 5:
25
+ utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
26
+ elif len(line.strip().split("|")) == 4:
27
+ utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
28
+ gt_wav = os.path.join(os.path.dirname(metalst), "wavs", utt + ".wav")
29
+ if not os.path.isabs(prompt_wav):
30
+ prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
31
+ metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
32
+ return metainfo
33
+
34
+
35
+ # librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
36
+ def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
37
+ f = open(metalst)
38
+ lines = f.readlines()
39
+ f.close()
40
+ metainfo = []
41
+ for line in lines:
42
+ ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
43
+
44
+ # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.' # if use librispeech test-clean (no-pc)
45
+ ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
46
+ ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
47
+
48
+ # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.' # if use librispeech test-clean (no-pc)
49
+ gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
50
+ gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
51
+
52
+ metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))
53
+
54
+ return metainfo
55
+
56
+
57
+ # padded to max length mel batch
58
+ def padded_mel_batch(ref_mels):
59
+ max_mel_length = torch.LongTensor([mel.shape[-1] for mel in ref_mels]).amax()
60
+ padded_ref_mels = []
61
+ for mel in ref_mels:
62
+ padded_ref_mel = F.pad(mel, (0, max_mel_length - mel.shape[-1]), value=0)
63
+ padded_ref_mels.append(padded_ref_mel)
64
+ padded_ref_mels = torch.stack(padded_ref_mels)
65
+ padded_ref_mels = padded_ref_mels.permute(0, 2, 1)
66
+ return padded_ref_mels
67
+
68
+
69
+ # get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
70
+
71
+
72
+ def get_inference_prompt(
73
+ metainfo,
74
+ speed=1.0,
75
+ tokenizer="pinyin",
76
+ polyphone=True,
77
+ target_sample_rate=24000,
78
+ n_fft=1024,
79
+ win_length=1024,
80
+ n_mel_channels=100,
81
+ hop_length=256,
82
+ mel_spec_type="vocos",
83
+ target_rms=0.1,
84
+ use_truth_duration=False,
85
+ infer_batch_size=1,
86
+ num_buckets=200,
87
+ min_secs=3,
88
+ max_secs=40,
89
+ ):
90
+ prompts_all = []
91
+
92
+ min_tokens = min_secs * target_sample_rate // hop_length
93
+ max_tokens = max_secs * target_sample_rate // hop_length
94
+
95
+ batch_accum = [0] * num_buckets
96
+ utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = (
97
+ [[] for _ in range(num_buckets)] for _ in range(6)
98
+ )
99
+
100
+ mel_spectrogram = MelSpec(
101
+ n_fft=n_fft,
102
+ hop_length=hop_length,
103
+ win_length=win_length,
104
+ n_mel_channels=n_mel_channels,
105
+ target_sample_rate=target_sample_rate,
106
+ mel_spec_type=mel_spec_type,
107
+ )
108
+
109
+ for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
110
+ # Audio
111
+ ref_audio, ref_sr = torchaudio.load(prompt_wav)
112
+ ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
113
+ if ref_rms < target_rms:
114
+ ref_audio = ref_audio * target_rms / ref_rms
115
+ assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
116
+ if ref_sr != target_sample_rate:
117
+ resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
118
+ ref_audio = resampler(ref_audio)
119
+
120
+ # Text
121
+ if len(prompt_text[-1].encode("utf-8")) == 1:
122
+ prompt_text = prompt_text + " "
123
+ text = [prompt_text + gt_text]
124
+ if tokenizer == "pinyin":
125
+ text_list = convert_char_to_pinyin(text, polyphone=polyphone)
126
+ else:
127
+ text_list = text
128
+
129
+ # to mel spectrogram
130
+ ref_mel = mel_spectrogram(ref_audio)
131
+ ref_mel = ref_mel.squeeze(0)
132
+
133
+ # Duration, mel frame length
134
+ ref_mel_len = ref_mel.shape[-1]
135
+
136
+ if use_truth_duration:
137
+ gt_audio, gt_sr = torchaudio.load(gt_wav)
138
+ if gt_sr != target_sample_rate:
139
+ resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
140
+ gt_audio = resampler(gt_audio)
141
+ total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)
142
+
143
+ # # test vocoder resynthesis
144
+ # ref_audio = gt_audio
145
+ else:
146
+ ref_text_len = len(prompt_text.encode("utf-8"))
147
+ gen_text_len = len(gt_text.encode("utf-8"))
148
+ total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
149
+
150
+ # deal with batch
151
+ assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
152
+ assert min_tokens <= total_mel_len <= max_tokens, (
153
+ f"Audio {utt} has duration {total_mel_len * hop_length // target_sample_rate}s out of range [{min_secs}, {max_secs}]."
154
+ )
155
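+ # assign the utterance to a duration bucket: [min_tokens, max_tokens] is split into num_buckets
+ # equal bins so that samples of similar total length get batched together (less padding waste)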
+ bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)
156
+
157
+ utts[bucket_i].append(utt)
158
+ ref_rms_list[bucket_i].append(ref_rms)
159
+ ref_mels[bucket_i].append(ref_mel)
160
+ ref_mel_lens[bucket_i].append(ref_mel_len)
161
+ total_mel_lens[bucket_i].append(total_mel_len)
162
+ final_text_list[bucket_i].extend(text_list)
163
+
164
+ batch_accum[bucket_i] += total_mel_len
165
+
166
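+ # batch_accum tracks accumulated target mel frames per bucket; once the frame budget
+ # (infer_batch_size) is reached, emit the bucket as one padded batch and reset it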
+ if batch_accum[bucket_i] >= infer_batch_size:
167
+ # print(f"\n{len(ref_mels[bucket_i][0][0])}\n{ref_mel_lens[bucket_i]}\n{total_mel_lens[bucket_i]}")
168
+ prompts_all.append(
169
+ (
170
+ utts[bucket_i],
171
+ ref_rms_list[bucket_i],
172
+ padded_mel_batch(ref_mels[bucket_i]),
173
+ ref_mel_lens[bucket_i],
174
+ total_mel_lens[bucket_i],
175
+ final_text_list[bucket_i],
176
+ )
177
+ )
178
+ batch_accum[bucket_i] = 0
179
+ (
180
+ utts[bucket_i],
181
+ ref_rms_list[bucket_i],
182
+ ref_mels[bucket_i],
183
+ ref_mel_lens[bucket_i],
184
+ total_mel_lens[bucket_i],
185
+ final_text_list[bucket_i],
186
+ ) = [], [], [], [], [], []
187
+
188
+ # add residual
189
+ for bucket_i, bucket_frames in enumerate(batch_accum):
190
+ if bucket_frames > 0:
191
+ prompts_all.append(
192
+ (
193
+ utts[bucket_i],
194
+ ref_rms_list[bucket_i],
195
+ padded_mel_batch(ref_mels[bucket_i]),
196
+ ref_mel_lens[bucket_i],
197
+ total_mel_lens[bucket_i],
198
+ final_text_list[bucket_i],
199
+ )
200
+ )
201
+ # shuffle so the last workers are not left with only the short, easy batches
202
+ random.seed(666)
203
+ random.shuffle(prompts_all)
204
+
205
+ return prompts_all
206
+
207
+
208
+ # get wav_res_ref_text of seed-tts test metalst
209
+ # https://github.com/BytedanceSpeech/seed-tts-eval
210
+
211
+
212
+ def get_seed_tts_test(metalst, gen_wav_dir, gpus):
213
+ f = open(metalst)
214
+ lines = f.readlines()
215
+ f.close()
216
+
217
+ test_set_ = []
218
+ for line in tqdm(lines):
219
+ if len(line.strip().split("|")) == 5:
220
+ utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
221
+ elif len(line.strip().split("|")) == 4:
222
+ utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
223
+
224
+ if not os.path.exists(os.path.join(gen_wav_dir, utt + ".wav")):
225
+ continue
226
+ gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
227
+ if not os.path.isabs(prompt_wav):
228
+ prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
229
+
230
+ test_set_.append((gen_wav, prompt_wav, gt_text))
231
+
232
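+ # split the test set evenly across the available GPUs, one (gpu, subset) job per GPU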
+ num_jobs = len(gpus)
233
+ if num_jobs == 1:
234
+ return [(gpus[0], test_set_)]
235
+
236
+ wav_per_job = len(test_set_) // num_jobs + 1
237
+ test_set = []
238
+ for i in range(num_jobs):
239
+ test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
240
+
241
+ return test_set
242
+
243
+
244
+ # get librispeech test-clean cross sentence test
245
+
246
+
247
+ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
248
+ f = open(metalst)
249
+ lines = f.readlines()
250
+ f.close()
251
+
252
+ test_set_ = []
253
+ for line in tqdm(lines):
254
+ ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
255
+
256
+ if eval_ground_truth:
257
+ gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
258
+ gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
259
+ else:
260
+ if not os.path.exists(os.path.join(gen_wav_dir, gen_utt + ".wav")):
261
+ raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
262
+ gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")
263
+
264
+ ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
265
+ ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
266
+
267
+ test_set_.append((gen_wav, ref_wav, gen_txt))
268
+
269
+ num_jobs = len(gpus)
270
+ if num_jobs == 1:
271
+ return [(gpus[0], test_set_)]
272
+
273
+ wav_per_job = len(test_set_) // num_jobs + 1
274
+ test_set = []
275
+ for i in range(num_jobs):
276
+ test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
277
+
278
+ return test_set
279
+
280
+
281
+ # load asr model
282
+
283
+
284
+ def load_asr_model(lang, ckpt_dir=""):
285
+ if lang == "zh":
286
+ from funasr import AutoModel
287
+
288
+ model = AutoModel(
289
+ model=os.path.join(ckpt_dir, "paraformer-zh"),
290
+ # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
291
+ # punc_model = os.path.join(ckpt_dir, "ct-punc"),
292
+ # spk_model = os.path.join(ckpt_dir, "cam++"),
293
+ disable_update=True,
294
+ ) # following seed-tts setting
295
+ elif lang == "en":
296
+ from faster_whisper import WhisperModel
297
+
298
+ model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
299
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
300
+ return model
301
+
302
+
303
+ # WER Evaluation, the way Seed-TTS does
304
+
305
+
306
+ def run_asr_wer(args):
307
+ rank, lang, test_set, ckpt_dir = args
308
+
309
+ if lang == "zh":
310
+ import zhconv
311
+
312
+ torch.cuda.set_device(rank)
313
+ elif lang == "en":
314
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
315
+ else:
316
+ raise NotImplementedError(
317
+ "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
318
+ )
319
+
320
+ asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)
321
+
322
+ from zhon.hanzi import punctuation
323
+
324
+ punctuation_all = punctuation + string.punctuation
325
+ wer_results = []
326
+
327
+ from jiwer import compute_measures
328
+
329
+ for gen_wav, prompt_wav, truth in tqdm(test_set):
330
+ if lang == "zh":
331
+ res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
332
+ hypo = res[0]["text"]
333
+ hypo = zhconv.convert(hypo, "zh-cn")
334
+ elif lang == "en":
335
+ segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
336
+ hypo = ""
337
+ for segment in segments:
338
+ hypo = hypo + " " + segment.text
339
+
340
+ raw_truth = truth
341
+ raw_hypo = hypo
342
+
343
+ for x in punctuation_all:
344
+ truth = truth.replace(x, "")
345
+ hypo = hypo.replace(x, "")
346
+
347
+ truth = truth.replace("  ", " ")  # collapse double spaces left by punctuation removal
348
+ hypo = hypo.replace("  ", " ")
349
+
350
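+ # for zh, insert spaces between characters so the WER below is effectively a character error rate (CER)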
+ if lang == "zh":
351
+ truth = " ".join([x for x in truth])
352
+ hypo = " ".join([x for x in hypo])
353
+ elif lang == "en":
354
+ truth = truth.lower()
355
+ hypo = hypo.lower()
356
+
357
+ measures = compute_measures(truth, hypo)
358
+ wer = measures["wer"]
359
+
360
+ # ref_list = truth.split(" ")
361
+ # subs = measures["substitutions"] / len(ref_list)
362
+ # dele = measures["deletions"] / len(ref_list)
363
+ # inse = measures["insertions"] / len(ref_list)
364
+
365
+ wer_results.append(
366
+ {
367
+ "wav": Path(gen_wav).stem,
368
+ "truth": raw_truth,
369
+ "hypo": raw_hypo,
370
+ "wer": wer,
371
+ }
372
+ )
373
+
374
+ return wer_results
375
+
376
+
377
+ # SIM Evaluation
378
+
379
+
380
+ def run_sim(args):
381
+ rank, test_set, ckpt_dir = args
382
+ device = f"cuda:{rank}"
383
+
384
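+ # speaker similarity (SIM): embed generated and prompt audio with a WavLM-large ECAPA-TDNN
+ # speaker encoder and score them with cosine similarity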
+ model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
385
+ state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
386
+ model.load_state_dict(state_dict["model"], strict=False)
387
+
388
+ use_gpu = True if torch.cuda.is_available() else False
389
+ if use_gpu:
390
+ model = model.cuda(device)
391
+ model.eval()
392
+
393
+ sim_results = []
394
+ for gen_wav, prompt_wav, truth in tqdm(test_set):
395
+ wav1, sr1 = torchaudio.load(gen_wav)
396
+ wav2, sr2 = torchaudio.load(prompt_wav)
397
+
398
+ resample1 = torchaudio.transforms.Resample(orig_freq=sr1, new_freq=16000)
399
+ resample2 = torchaudio.transforms.Resample(orig_freq=sr2, new_freq=16000)
400
+ wav1 = resample1(wav1)
401
+ wav2 = resample2(wav2)
402
+
403
+ if use_gpu:
404
+ wav1 = wav1.cuda(device)
405
+ wav2 = wav2.cuda(device)
406
+ with torch.no_grad():
407
+ emb1 = model(wav1)
408
+ emb2 = model(wav2)
409
+
410
+ sim = F.cosine_similarity(emb1, emb2)[0].item()
411
+ # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
412
+ sim_results.append(
413
+ {
414
+ "wav": Path(gen_wav).stem,
415
+ "sim": sim,
416
+ }
417
+ )
418
+
419
+ return sim_results
F5-TTS/src/f5_tts/infer/README.md ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Inference
2
+
3
+ The pretrained model checkpoints are available at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be downloaded automatically when running the inference scripts.
4
+
5
+ **More checkpoints, contributed by the community and supporting more languages, can be found in [SHARED.md](SHARED.md).**
6
+
7
+ Currently supports **30s for a single** generation, which is the **total length** (the same logic applies with `fix_duration`) of prompt plus output audio. However, `infer_cli` and `infer_gradio` automatically chunk longer text into multiple generations. Long reference audio will be **clipped to ~12s**.
8
+
9
+ To avoid possible inference failures, make sure you have read through the following instructions.
10
+
11
+ - Use reference audio shorter than 12s and leave some silence (e.g. 1s) at the end. Otherwise the audio risks being truncated mid-word, leading to suboptimal generation.
12
+ - <ins>Uppercase letters</ins> (best written like K.F.C.) will be uttered letter by letter; use lowercase letters for common words.
13
+ - Add spaces (blank: " ") or punctuation (e.g. "," ".") <ins>to explicitly introduce pauses</ins>.
14
+ - If an English punctuation mark ends a sentence, make sure it is followed by a space " ". Otherwise it is not treated as a sentence boundary when chunking.
15
+ - <ins>Preprocess numbers</ins> into Chinese characters if you want them read in Chinese; otherwise they are read in English.
16
+ - If the generation output is blank (pure silence), <ins>check for FFmpeg installation</ins>.
17
+ - Try <ins>turning off `use_ema` if using an early-stage</ins> finetuned checkpoint (one trained for only a few updates).
18
+
19
+
20
+ ## Gradio App
21
+
22
+ Currently supported features:
23
+
24
+ - Basic TTS with Chunk Inference
25
+ - Multi-Style / Multi-Speaker Generation
26
+ - Voice Chat powered by Qwen2.5-3B-Instruct
27
+ - [Custom inference with more language support](SHARED.md)
28
+
29
+ The CLI command `f5-tts_infer-gradio` is equivalent to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio app (web interface) for inference.
30
+
31
+ The script loads model checkpoints from Hugging Face. You can also manually download the files and update the path passed to `load_model()` in `infer_gradio.py`. Only the TTS model is loaded at first; the ASR model is loaded to transcribe if `ref_text` is not provided, and the LLM model is loaded if Voice Chat is used.
32
+
33
+ More flag options:
34
+
35
+ ```bash
36
+ # Automatically launch the interface in the default web browser
37
+ f5-tts_infer-gradio --inbrowser
38
+
39
+ # Set the root path of the application, if it's not served from the root ("/") of the domain
40
+ # For example, if the application is served at "https://example.com/myapp"
41
+ f5-tts_infer-gradio --root_path "/myapp"
42
+ ```
43
+
44
+ It can also be used as a component of a larger application:
45
+ ```python
46
+ import gradio as gr
47
+ from f5_tts.infer.infer_gradio import app
48
+
49
+ with gr.Blocks() as main_app:
50
+ gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")
51
+
52
+ # ... other Gradio components
53
+
54
+ app.render()
55
+
56
+ main_app.launch()
57
+ ```
58
+
59
+
60
+ ## CLI Inference
61
+
62
+ The CLI command `f5-tts_infer-cli` is equivalent to `python src/f5_tts/infer/infer_cli.py`, a command-line tool for inference.
63
+
64
+ The script loads model checkpoints from Hugging Face. You can also manually download the files and use `--ckpt_file` to specify the model to load, or update the path directly in `infer_cli.py`.
65
+
66
+ To change the vocabulary, use `--vocab_file` to provide your own `vocab.txt` file.
67
+
68
+ Basic inference with flags:
69
+ ```bash
70
+ # Leaving --ref_text "" lets the ASR model transcribe the reference audio (extra GPU memory usage)
71
+ f5-tts_infer-cli \
72
+ --model F5TTS_v1_Base \
73
+ --ref_audio "ref_audio.wav" \
74
+ --ref_text "The content, subtitle or transcription of reference audio." \
75
+ --gen_text "Some text you want TTS model generate for you."
76
+
77
+ # Use BigVGAN as the vocoder. Currently only F5TTS_Base is supported.
78
+ f5-tts_infer-cli --model F5TTS_Base --vocoder_name bigvgan --load_vocoder_from_local
79
+
80
+ # Use a custom checkpoint path, e.g.
81
+ f5-tts_infer-cli --ckpt_file ckpts/F5TTS_v1_Base/model_1250000.safetensors
82
+
83
+ # More instructions
84
+ f5-tts_infer-cli --help
85
+ ```
86
+
87
+ A `.toml` file allows more flexible usage.
88
+
89
+ ```bash
90
+ f5-tts_infer-cli -c custom.toml
91
+ ```
92
+
93
+ For example, you can use a `.toml` file to pass in variables; refer to `src/f5_tts/infer/examples/basic/basic.toml`:
94
+
95
+ ```toml
96
+ # F5TTS_v1_Base | E2TTS_Base
97
+ model = "F5TTS_v1_Base"
98
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
99
+ # If empty "", the reference audio will be transcribed automatically.
100
+ ref_text = "Some call me nature, others call me mother nature."
101
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
102
+ # File with text to generate. Ignores the text above.
103
+ gen_file = ""
104
+ remove_silence = false
105
+ output_dir = "tests"
106
+ ```
107
+
108
+ You can also leverage a `.toml` file for multi-style generation; refer to `src/f5_tts/infer/examples/multi/story.toml`.
109
+
110
+ ```toml
111
+ # F5TTS_v1_Base | E2TTS_Base
112
+ model = "F5TTS_v1_Base"
113
+ ref_audio = "infer/examples/multi/main.flac"
114
+ # If empty "", the reference audio will be transcribed automatically.
115
+ ref_text = ""
116
+ gen_text = ""
117
+ # File with text to generate. Ignores the text above.
118
+ gen_file = "infer/examples/multi/story.txt"
119
+ remove_silence = true
120
+ output_dir = "tests"
121
+
122
+ [voices.town]
123
+ ref_audio = "infer/examples/multi/town.flac"
124
+ ref_text = ""
125
+
126
+ [voices.country]
127
+ ref_audio = "infer/examples/multi/country.flac"
128
+ ref_text = ""
129
+ ```
130
+ Mark the voice with `[main]` `[town]` `[country]` whenever you want to switch voices; refer to `src/f5_tts/infer/examples/multi/story.txt`.
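+ 
+ For illustration, a marked-up generation text could look like the sketch below (the sentences are placeholders, not the actual content of `story.txt`):
+ 
+ ```text
+ A Town Mouse once visited his cousin in the country.
+ [town] My poor friend, how can you bear such a quiet life?
+ [country] I like it well enough, thank you.
+ [main] And so the two set off together for the town.
+ ```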
131
+
132
+ ## API Usage
133
+
134
+ ```python
135
+ from importlib.resources import files
136
+ from f5_tts.api import F5TTS
137
+
138
+ f5tts = F5TTS()
139
+ wav, sr, spec = f5tts.infer(
140
+ ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
141
+ ref_text="some call me nature, others call me mother nature.",
142
+ gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
143
+ file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
144
+ file_spec=str(files("f5_tts").joinpath("../../tests/api_out.png")),
145
+ seed=None,
146
+ )
147
+ ```
148
+ Check [api.py](../api.py) for more details.
149
+
150
+ ## TensorRT-LLM Deployment
151
+
152
+ See [detailed instructions](../runtime/triton_trtllm/README.md) for more information.
153
+
154
+ ## Socket Real-time Service
155
+
156
+ Real-time voice output with chunked streaming:
157
+
158
+ ```bash
159
+ # Start socket server
160
+ python src/f5_tts/socket_server.py
161
+
162
+ # If PyAudio not installed
163
+ sudo apt-get install portaudio19-dev
164
+ pip install pyaudio
165
+
166
+ # Communicate with socket client
167
+ python src/f5_tts/socket_client.py
168
+ ```
169
+
170
+ ## Speech Editing
171
+
172
+ To test speech editing capabilities, use the following command:
173
+
174
+ ```bash
175
+ python src/f5_tts/infer/speech_edit.py
176
+ ```
177
+
F5-TTS/src/f5_tts/infer/SHARED.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- omit in toc -->
2
+ # Shared Model Cards
3
+
4
+ <!-- omit in toc -->
5
+ ### **Prerequisites of use**
6
+ - This document serves as a quick lookup table for community training/finetuning results, covering various languages.
7
+ - The models in this repository are open source and based on voluntary contributions.
8
+ - Use of these models must respect the respective creators; the convenience they bring comes from their efforts.
9
+
10
+ <!-- omit in toc -->
11
+ ### **Welcome to share here**
12
+ - Have a pretrained/finetuned result: a model checkpoint (ideally pruned to facilitate inference, i.e. keeping only `ema_model_state_dict`; see the sketch after this list) and the corresponding vocab file (for tokenization).
13
+ - Host a public [Hugging Face model repository](https://huggingface.co/new) and upload the model-related files.
14
+ - Make a pull request adding a model card to the current page, i.e. `src\f5_tts\infer\SHARED.md`.
15
+
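+ A minimal sketch of the pruning step mentioned above (file names are placeholders; it assumes the training checkpoint stores its EMA weights under `ema_model_state_dict`, as noted in the first item):
+ 
+ ```python
+ import torch
+ 
+ ckpt = torch.load("model_last.pt", map_location="cpu")  # full training checkpoint
+ pruned = {"ema_model_state_dict": ckpt["ema_model_state_dict"]}  # keep only the EMA weights
+ torch.save(pruned, "model_last_pruned.pt")
+ ```
+ 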
16
+ <!-- omit in toc -->
17
+ ### Supported Languages
18
+ - [Multilingual](#multilingual)
19
+ - [F5-TTS v1 v0 Base @ zh \& en @ F5-TTS](#f5-tts-v1-v0-base--zh--en--f5-tts)
20
+ - [English](#english)
21
+ - [Finnish](#finnish)
22
+ - [F5-TTS Base @ fi @ AsmoKoskinen](#f5-tts-base--fi--asmokoskinen)
23
+ - [French](#french)
24
+ - [F5-TTS Base @ fr @ RASPIAUDIO](#f5-tts-base--fr--raspiaudio)
25
+ - [German](#german)
26
+ - [F5-TTS Base @ de @ hvoss-techfak](#f5-tts-base--de--hvoss-techfak)
27
+ - [Hindi](#hindi)
28
+ - [F5-TTS Small @ hi @ SPRINGLab](#f5-tts-small--hi--springlab)
29
+ - [Italian](#italian)
30
+ - [F5-TTS Base @ it @ alien79](#f5-tts-base--it--alien79)
31
+ - [Japanese](#japanese)
32
+ - [F5-TTS Base @ ja @ Jmica](#f5-tts-base--ja--jmica)
33
+ - [Mandarin](#mandarin)
34
+ - [Russian](#russian)
35
+ - [F5-TTS Base @ ru @ HotDro4illa](#f5-tts-base--ru--hotdro4illa)
36
+ - [Spanish](#spanish)
37
+ - [F5-TTS Base @ es @ jpgallegoar](#f5-tts-base--es--jpgallegoar)
38
+
39
+
40
+ ## Multilingual
41
+
42
+ #### F5-TTS v1 v0 Base @ zh & en @ F5-TTS
43
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
44
+ |:---:|:------------:|:-----------:|:-------------:|
45
+ |F5-TTS v1 Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_v1_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
46
+
47
+ ```bash
48
+ Model: hf://SWivid/F5-TTS/F5TTS_v1_Base/model_1250000.safetensors
49
+ # A Variant Model: hf://SWivid/F5-TTS/F5TTS_v1_Base_no_zero_init/model_1250000.safetensors
50
+ Vocab: hf://SWivid/F5-TTS/F5TTS_v1_Base/vocab.txt
51
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
52
+ ```
53
+
54
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
55
+ |:---:|:------------:|:-----------:|:-------------:|
56
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/SWivid/F5-TTS/tree/main/F5TTS_Base)|[Emilia 95K zh&en](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07)|cc-by-nc-4.0|
57
+
58
+ ```bash
59
+ Model: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
60
+ Vocab: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
61
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
62
+ ```
63
+
64
+ *Other info, e.g. author info, GitHub repo, links to sampled results, usage instructions, tutorials (blog, video, etc.) ...*
65
+
66
+
67
+ ## English
68
+
69
+
70
+ ## Finnish
71
+
72
+ #### F5-TTS Base @ fi @ AsmoKoskinen
73
+ |Model|🤗Hugging Face|Data|Model License|
74
+ |:---:|:------------:|:-----------:|:-------------:|
75
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/AsmoKoskinen/F5-TTS_Finnish_Model)|[Common Voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0), [Vox Populi](https://huggingface.co/datasets/facebook/voxpopuli)|cc-by-nc-4.0|
76
+
77
+ ```bash
78
+ Model: hf://AsmoKoskinen/F5-TTS_Finnish_Model/model_common_voice_fi_vox_populi_fi_20241206.safetensors
79
+ Vocab: hf://AsmoKoskinen/F5-TTS_Finnish_Model/vocab.txt
80
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
81
+ ```
82
+
83
+
84
+ ## French
85
+
86
+ #### F5-TTS Base @ fr @ RASPIAUDIO
87
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
88
+ |:---:|:------------:|:-----------:|:-------------:|
89
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/RASPIAUDIO/F5-French-MixedSpeakers-reduced)|[LibriVox](https://librivox.org/)|cc-by-nc-4.0|
90
+
91
+ ```bash
92
+ Model: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/model_last_reduced.pt
93
+ Vocab: hf://RASPIAUDIO/F5-French-MixedSpeakers-reduced/vocab.txt
94
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
95
+ ```
96
+
97
+ - [Online Inference with Hugging Face Space](https://huggingface.co/spaces/RASPIAUDIO/f5-tts_french).
98
+ - [Tutorial video to train a new language model](https://www.youtube.com/watch?v=UO4usaOojys).
99
+ - [Discussion about this training can be found here](https://github.com/SWivid/F5-TTS/issues/434).
100
+
101
+
102
+ ## German
103
+
104
+ #### F5-TTS Base @ de @ hvoss-techfak
105
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
106
+ |:---:|:------------:|:-----------:|:-------------:|
107
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/hvoss-techfak/F5-TTS-German)|[Mozilla Common Voice 19.0](https://commonvoice.mozilla.org/en/datasets) & 800 hours Crowdsourced |cc-by-nc-4.0|
108
+
109
+ ```bash
110
+ Model: hf://hvoss-techfak/F5-TTS-German/model_f5tts_german.pt
111
+ Vocab: hf://hvoss-techfak/F5-TTS-German/vocab.txt
112
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
113
+ ```
114
+
115
+ - Finetuned by [@hvoss-techfak](https://github.com/hvoss-techfak)
116
+
117
+
118
+ ## Hindi
119
+
120
+ #### F5-TTS Small @ hi @ SPRINGLab
121
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
122
+ |:---:|:------------:|:-----------:|:-------------:|
123
+ |F5-TTS Small|[ckpt & vocab](https://huggingface.co/SPRINGLab/F5-Hindi-24KHz)|[IndicTTS Hi](https://huggingface.co/datasets/SPRINGLab/IndicTTS-Hindi) & [IndicVoices-R Hi](https://huggingface.co/datasets/SPRINGLab/IndicVoices-R_Hindi) |cc-by-4.0|
124
+
125
+ ```bash
126
+ Model: hf://SPRINGLab/F5-Hindi-24KHz/model_2500000.safetensors
127
+ Vocab: hf://SPRINGLab/F5-Hindi-24KHz/vocab.txt
128
+ Config: {"dim": 768, "depth": 18, "heads": 12, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
129
+ ```
130
+
131
+ - Authors: SPRING Lab, Indian Institute of Technology, Madras
132
+ - Website: https://asr.iitm.ac.in/
133
+
134
+
135
+ ## Italian
136
+
137
+ #### F5-TTS Base @ it @ alien79
138
+ |Model|🤗Hugging Face|Data|Model License|
139
+ |:---:|:------------:|:-----------:|:-------------:|
140
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/alien79/F5-TTS-italian)|[ylacombe/cml-tts](https://huggingface.co/datasets/ylacombe/cml-tts) |cc-by-nc-4.0|
141
+
142
+ ```bash
143
+ Model: hf://alien79/F5-TTS-italian/model_159600.safetensors
144
+ Vocab: hf://alien79/F5-TTS-italian/vocab.txt
145
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
146
+ ```
147
+
148
+ - Trained by [Mithril Man](https://github.com/MithrilMan)
149
+ - Model details on [hf project home](https://huggingface.co/alien79/F5-TTS-italian)
150
+ - Open to collaborations to further improve the model
151
+
152
+
153
+ ## Japanese
154
+
155
+ #### F5-TTS Base @ ja @ Jmica
156
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
157
+ |:---:|:------------:|:-----------:|:-------------:|
158
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/Jmica/F5TTS/tree/main/JA_21999120)|[Emilia 1.7k JA](https://huggingface.co/datasets/amphion/Emilia-Dataset/tree/fc71e07) & [Galgame Dataset 5.4k](https://huggingface.co/datasets/OOPPEENN/Galgame_Dataset)|cc-by-nc-4.0|
159
+
160
+ ```bash
161
+ Model: hf://Jmica/F5TTS/JA_21999120/model_21999120.pt
162
+ Vocab: hf://Jmica/F5TTS/JA_21999120/vocab_japanese.txt
163
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
164
+ ```
165
+
166
+
167
+ ## Mandarin
168
+
169
+
170
+ ## Russian
171
+
172
+ #### F5-TTS Base @ ru @ HotDro4illa
173
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
174
+ |:---:|:------------:|:-----------:|:-------------:|
175
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/hotstone228/F5-TTS-Russian)|[Common voice](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0)|cc-by-nc-4.0|
176
+
177
+ ```bash
178
+ Model: hf://hotstone228/F5-TTS-Russian/model_last.safetensors
179
+ Vocab: hf://hotstone228/F5-TTS-Russian/vocab.txt
180
+ Config: {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "text_mask_padding": False, "conv_layers": 4, "pe_attn_head": 1}
181
+ ```
182
+ - Finetuned by [HotDro4illa](https://github.com/HotDro4illa)
183
+ - Any improvements are welcome
184
+
185
+
186
+ ## Spanish
187
+
188
+ #### F5-TTS Base @ es @ jpgallegoar
189
+ |Model|🤗Hugging Face|Data (Hours)|Model License|
190
+ |:---:|:------------:|:-----------:|:-------------:|
191
+ |F5-TTS Base|[ckpt & vocab](https://huggingface.co/jpgallegoar/F5-Spanish)|[Voxpopuli](https://huggingface.co/datasets/facebook/voxpopuli) & Crowdsourced & TEDx, 218 hours|cc0-1.0|
192
+
193
+ - @jpgallegoar [GitHub repo](https://github.com/jpgallegoar/Spanish-F5), Jupyter Notebook and Gradio usage for Spanish model.
F5-TTS/src/f5_tts/infer/examples/basic/basic.toml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # F5TTS_v1_Base | E2TTS_Base
2
+ model = "F5TTS_v1_Base"
3
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
4
+ # If empty "", the reference audio will be transcribed automatically.
5
+ ref_text = "Some call me nature, others call me mother nature."
6
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
7
+ # File with text to generate. Ignores the text above.
8
+ gen_file = ""
9
+ remove_silence = false
10
+ output_dir = "tests"
11
+ output_file = "infer_cli_basic.wav"
F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_en.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0e22048e72414fcc1e6b6342e47a774d748a195ed34e4a5b3fcf416707f2b71
3
+ size 256018
F5-TTS/src/f5_tts/infer/examples/basic/basic_ref_zh.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96724a113240d1f82c6ded1334122f0176b96c9226ccd3c919e625bcfd2a3ede
3
+ size 324558
F5-TTS/src/f5_tts/infer/examples/multi/country.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb15708b4b3875e37beec46591a5d89e1a9a63fdad3b8fe4a5c8738f4f554400
3
+ size 180321
F5-TTS/src/f5_tts/infer/examples/multi/main.flac ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4abb1107771ce7e14926fde879b959dde6db6e572476b98684f04e45e978ab19
3
+ size 279219