kevinwang676 committed on
Commit
95c3696
1 Parent(s): 438ddbe

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .dockerignore +5 -0
  2. .github/ISSUE_TEMPLATE/bug_report.md +25 -0
  3. .github/ISSUE_TEMPLATE/feature_request.md +22 -0
  4. .github/pull_request_template.md +7 -0
  5. .github/workflows/build-windows-package.yml +51 -0
  6. .github/workflows/docs.yml +30 -0
  7. .gitignore +29 -0
  8. .pre-commit-config.yaml +32 -0
  9. .project-root +0 -0
  10. .readthedocs.yaml +19 -0
  11. API_FLAGS.txt +6 -0
  12. LICENSE +437 -0
  13. README.zh.md +74 -0
  14. docker-compose.dev.yml +16 -0
  15. dockerfile +24 -0
  16. docs/CNAME +1 -0
  17. docs/assets/figs/VS_1.jpg +0 -0
  18. docs/assets/figs/diagram.png +0 -0
  19. docs/en/finetune.md +125 -0
  20. docs/en/index.md +128 -0
  21. docs/en/inference.md +153 -0
  22. docs/en/samples.md +223 -0
  23. docs/ja/finetune.md +125 -0
  24. docs/ja/index.md +128 -0
  25. docs/ja/inference.md +157 -0
  26. docs/ja/samples.md +223 -0
  27. docs/requirements.txt +3 -0
  28. docs/stylesheets/extra.css +3 -0
  29. docs/zh/finetune.md +136 -0
  30. docs/zh/index.md +118 -0
  31. docs/zh/inference.md +164 -0
  32. docs/zh/samples.md +223 -0
  33. fish_speech/callbacks/__init__.py +3 -0
  34. fish_speech/callbacks/grad_norm.py +113 -0
  35. fish_speech/configs/base.yaml +87 -0
  36. fish_speech/configs/firefly_gan_vq.yaml +34 -0
  37. fish_speech/configs/lora/r_8_alpha_16.yaml +4 -0
  38. fish_speech/configs/text2semantic_finetune.yaml +83 -0
  39. fish_speech/conversation.py +2 -0
  40. fish_speech/datasets/concat_repeat.py +53 -0
  41. fish_speech/datasets/protos/text-data.proto +24 -0
  42. fish_speech/datasets/protos/text_data_pb2.py +33 -0
  43. fish_speech/datasets/protos/text_data_stream.py +36 -0
  44. fish_speech/datasets/semantic.py +496 -0
  45. fish_speech/datasets/vqgan.py +147 -0
  46. fish_speech/i18n/README.md +27 -0
  47. fish_speech/i18n/__init__.py +3 -0
  48. fish_speech/i18n/core.py +40 -0
  49. fish_speech/i18n/locale/en_US.json +122 -0
  50. fish_speech/i18n/locale/es_ES.json +122 -0
.dockerignore ADDED
@@ -0,0 +1,5 @@
1
+ results
2
+ data
3
+ *.filelist
4
+ /data_server/target
5
+ checkpoints
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: "[BUG]"
5
+ labels: bug
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ Feel free to ask any kind of question on the issues page, but please use English, since other users may find your questions valuable.
11
+
12
+ **Describe the bug**
13
+ A clear and concise description of what the bug is.
14
+
15
+ **To Reproduce**
16
+ Steps to reproduce the behavior:
17
+
18
+ **Expected behavior**
19
+ A clear and concise description of what you expected to happen.
20
+
21
+ **Screenshots / log**
22
+ If applicable, add screenshots / logs to help explain your problem.
23
+
24
+ **Additional context**
25
+ Add any other context about the problem here.
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,22 @@
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: "[Feature]"
5
+ labels: enhancement
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ Feel free to ask any kind of question on the issues page, but please use English, since other users may find your questions valuable.
11
+
12
+ **Is your feature request related to a problem? Please describe.**
13
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
14
+
15
+ **Describe the solution you'd like**
16
+ A clear and concise description of what you want to happen.
17
+
18
+ **Describe alternatives you've considered**
19
+ A clear and concise description of any alternative solutions or features you've considered.
20
+
21
+ **Additional context**
22
+ Add any other context or screenshots about the feature request here.
.github/pull_request_template.md ADDED
@@ -0,0 +1,7 @@
1
+ **Is this PR adding a new feature or fixing a bug?**
2
+
3
+ Add feature / Fix BUG.
4
+
5
+ **Is this pull request related to any issue? If yes, please link the issue.**
6
+
7
+ #xxx
.github/workflows/build-windows-package.yml ADDED
@@ -0,0 +1,51 @@
1
+ name: build-windows-package
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ deploy:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Remove unnecessary files
13
+ run: |
14
+ sudo rm -rf /usr/share/dotnet
15
+ sudo rm -rf /opt/ghc
16
+ sudo rm -rf /usr/local/lib/android
17
+ sudo rm -rf "/usr/local/share/boost"
18
+ sudo rm -rf "$AGENT_TOOLSDIRECTORY"
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: 3.12
22
+ - uses: actions/checkout@v4
23
+ with:
24
+ path: ./fish-speech
25
+ - name: Setup Hugging Face CLI
26
+ run: pip3 install huggingface-hub
27
+ - name: Download Windows Binaries
28
+ env:
29
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
30
+ run: |
31
+ if [[ "${{ github.actor }}" = "Leng Yue" ]] || [[ "${{ github.actor }}" = "AnyaCoder" ]] || [[ "${{ github.actor }}" = "pre-commit-ci[bot]" ]]; then
32
+ ls -la
33
+ else
34
+ echo "Author is not Leng Yue nor AnyaCoder. No upload performed."
35
+ fi
36
+ - uses: actions/upload-artifact@v4
37
+ with:
38
+ name: fish-speech-main-${{ github.run_id }}
39
+ path: ./fish-speech
40
+
41
+ - name: Upload to Hugging Face
42
+ env:
43
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
44
+ run: |
45
+ if [ "${{ github.actor }}" = "AnyaCoder" ]; then
46
+ echo "Author is AnyaCoder. Performing the zipping && upload."
47
+ zip -qr fish-speech-main-${{ github.run_id }}.zip ./fish-speech
48
+ huggingface-cli upload SpicyqSama007/fish-speech-packed ./fish-speech-main-${{ github.run_id }}.zip fish-speech-main-${{ github.run_id }}.zip
49
+ else
50
+ echo "Author is not AnyaCoder. No upload performed."
51
+ fi
.github/workflows/docs.yml ADDED
@@ -0,0 +1,30 @@
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+
7
+ permissions:
8
+ contents: write
9
+
10
+ jobs:
11
+ deploy:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+ - name: Configure Git Credentials
16
+ run: |
17
+ git config user.name github-actions[bot]
18
+ git config user.email 41898282+github-actions[bot]@users.noreply.github.com
19
+ - uses: actions/setup-python@v5
20
+ with:
21
+ python-version: 3.x
22
+ - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
23
+ - uses: actions/cache@v4
24
+ with:
25
+ key: mkdocs-material-${{ env.cache_id }}
26
+ path: .cache
27
+ restore-keys: |
28
+ mkdocs-material-
29
+ - run: pip install -r docs/requirements.txt
30
+ - run: mkdocs gh-deploy --force
.gitignore ADDED
1
+ .pgx.*
2
+ .pdm-python
3
+ /fish_speech.egg-info
4
+ __pycache__
5
+ /results
6
+ /data
7
+ /*.test.sh
8
+ *.filelist
9
+ filelists
10
+ /fish_speech/text/cmudict_cache.pickle
11
+ /checkpoints
12
+ /.vscode
13
+ /data_server/target
14
+ /*.npy
15
+ /*.wav
16
+ /*.mp3
17
+ /results
18
+ /data
19
+ /.idea
20
+ ffmpeg.exe
21
+ ffprobe.exe
22
+ asr-label*
23
+ /.cache
24
+ /fishenv
25
+ /.locale
26
+ /demo-audios
27
+ ref_data*
28
+ /example
29
+ /faster_whisper
.pre-commit-config.yaml ADDED
@@ -0,0 +1,32 @@
1
+ ci:
2
+ autoupdate_schedule: monthly
3
+
4
+ repos:
5
+ - repo: https://github.com/pycqa/isort
6
+ rev: 5.13.2
7
+ hooks:
8
+ - id: isort
9
+ args: [--profile=black]
10
+
11
+ - repo: https://github.com/psf/black
12
+ rev: 24.4.2
13
+ hooks:
14
+ - id: black
15
+
16
+ - repo: https://github.com/codespell-project/codespell
17
+ rev: v2.3.0
18
+ hooks:
19
+ - id: codespell
20
+ files: ^.*\.(py|md|rst|yml)$
21
+ args: [-L=fro]
22
+
23
+ - repo: https://github.com/pre-commit/pre-commit-hooks
24
+ rev: v4.6.0
25
+ hooks:
26
+ - id: end-of-file-fixer
27
+ - id: check-yaml
28
+ - id: check-json
29
+ - id: mixed-line-ending
30
+ args: ['--fix=lf']
31
+ - id: check-added-large-files
32
+ args: ['--maxkb=5000']
.project-root ADDED
File without changes
.readthedocs.yaml ADDED
@@ -0,0 +1,19 @@
1
+ # Read the Docs configuration file for MkDocs projects
2
+ # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3
+
4
+ # Required
5
+ version: 2
6
+
7
+ # Set the version of Python and other tools you might need
8
+ build:
9
+ os: ubuntu-22.04
10
+ tools:
11
+ python: "3.12"
12
+
13
+ mkdocs:
14
+ configuration: mkdocs.yml
15
+
16
+ # Optionally declare the Python requirements required to build your docs
17
+ python:
18
+ install:
19
+ - requirements: docs/requirements.txt
API_FLAGS.txt ADDED
@@ -0,0 +1,6 @@
1
+ # --infer
2
+ # --api
3
+ --listen 0.0.0.0:8080 \
4
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
5
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
6
+ --decoder-config-name firefly_gan_vq
LICENSE ADDED
@@ -0,0 +1,437 @@
1
+ Attribution-NonCommercial-ShareAlike 4.0 International
2
+
3
+ =======================================================================
4
+
5
+ Creative Commons Corporation ("Creative Commons") is not a law firm and
6
+ does not provide legal services or legal advice. Distribution of
7
+ Creative Commons public licenses does not create a lawyer-client or
8
+ other relationship. Creative Commons makes its licenses and related
9
+ information available on an "as-is" basis. Creative Commons gives no
10
+ warranties regarding its licenses, any material licensed under their
11
+ terms and conditions, or any related information. Creative Commons
12
+ disclaims all liability for damages resulting from their use to the
13
+ fullest extent possible.
14
+
15
+ Using Creative Commons Public Licenses
16
+
17
+ Creative Commons public licenses provide a standard set of terms and
18
+ conditions that creators and other rights holders may use to share
19
+ original works of authorship and other material subject to copyright
20
+ and certain other rights specified in the public license below. The
21
+ following considerations are for informational purposes only, are not
22
+ exhaustive, and do not form part of our licenses.
23
+
24
+ Considerations for licensors: Our public licenses are
25
+ intended for use by those authorized to give the public
26
+ permission to use material in ways otherwise restricted by
27
+ copyright and certain other rights. Our licenses are
28
+ irrevocable. Licensors should read and understand the terms
29
+ and conditions of the license they choose before applying it.
30
+ Licensors should also secure all rights necessary before
31
+ applying our licenses so that the public can reuse the
32
+ material as expected. Licensors should clearly mark any
33
+ material not subject to the license. This includes other CC-
34
+ licensed material, or material used under an exception or
35
+ limitation to copyright. More considerations for licensors:
36
+ wiki.creativecommons.org/Considerations_for_licensors
37
+
38
+ Considerations for the public: By using one of our public
39
+ licenses, a licensor grants the public permission to use the
40
+ licensed material under specified terms and conditions. If
41
+ the licensor's permission is not necessary for any reason--for
42
+ example, because of any applicable exception or limitation to
43
+ copyright--then that use is not regulated by the license. Our
44
+ licenses grant only permissions under copyright and certain
45
+ other rights that a licensor has authority to grant. Use of
46
+ the licensed material may still be restricted for other
47
+ reasons, including because others have copyright or other
48
+ rights in the material. A licensor may make special requests,
49
+ such as asking that all changes be marked or described.
50
+ Although not required by our licenses, you are encouraged to
51
+ respect those requests where reasonable. More considerations
52
+ for the public:
53
+ wiki.creativecommons.org/Considerations_for_licensees
54
+
55
+ =======================================================================
56
+
57
+ Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International
58
+ Public License
59
+
60
+ By exercising the Licensed Rights (defined below), You accept and agree
61
+ to be bound by the terms and conditions of this Creative Commons
62
+ Attribution-NonCommercial-ShareAlike 4.0 International Public License
63
+ ("Public License"). To the extent this Public License may be
64
+ interpreted as a contract, You are granted the Licensed Rights in
65
+ consideration of Your acceptance of these terms and conditions, and the
66
+ Licensor grants You such rights in consideration of benefits the
67
+ Licensor receives from making the Licensed Material available under
68
+ these terms and conditions.
69
+
70
+
71
+ Section 1 -- Definitions.
72
+
73
+ a. Adapted Material means material subject to Copyright and Similar
74
+ Rights that is derived from or based upon the Licensed Material
75
+ and in which the Licensed Material is translated, altered,
76
+ arranged, transformed, or otherwise modified in a manner requiring
77
+ permission under the Copyright and Similar Rights held by the
78
+ Licensor. For purposes of this Public License, where the Licensed
79
+ Material is a musical work, performance, or sound recording,
80
+ Adapted Material is always produced where the Licensed Material is
81
+ synched in timed relation with a moving image.
82
+
83
+ b. Adapter's License means the license You apply to Your Copyright
84
+ and Similar Rights in Your contributions to Adapted Material in
85
+ accordance with the terms and conditions of this Public License.
86
+
87
+ c. BY-NC-SA Compatible License means a license listed at
88
+ creativecommons.org/compatiblelicenses, approved by Creative
89
+ Commons as essentially the equivalent of this Public License.
90
+
91
+ d. Copyright and Similar Rights means copyright and/or similar rights
92
+ closely related to copyright including, without limitation,
93
+ performance, broadcast, sound recording, and Sui Generis Database
94
+ Rights, without regard to how the rights are labeled or
95
+ categorized. For purposes of this Public License, the rights
96
+ specified in Section 2(b)(1)-(2) are not Copyright and Similar
97
+ Rights.
98
+
99
+ e. Effective Technological Measures means those measures that, in the
100
+ absence of proper authority, may not be circumvented under laws
101
+ fulfilling obligations under Article 11 of the WIPO Copyright
102
+ Treaty adopted on December 20, 1996, and/or similar international
103
+ agreements.
104
+
105
+ f. Exceptions and Limitations means fair use, fair dealing, and/or
106
+ any other exception or limitation to Copyright and Similar Rights
107
+ that applies to Your use of the Licensed Material.
108
+
109
+ g. License Elements means the license attributes listed in the name
110
+ of a Creative Commons Public License. The License Elements of this
111
+ Public License are Attribution, NonCommercial, and ShareAlike.
112
+
113
+ h. Licensed Material means the artistic or literary work, database,
114
+ or other material to which the Licensor applied this Public
115
+ License.
116
+
117
+ i. Licensed Rights means the rights granted to You subject to the
118
+ terms and conditions of this Public License, which are limited to
119
+ all Copyright and Similar Rights that apply to Your use of the
120
+ Licensed Material and that the Licensor has authority to license.
121
+
122
+ j. Licensor means the individual(s) or entity(ies) granting rights
123
+ under this Public License.
124
+
125
+ k. NonCommercial means not primarily intended for or directed towards
126
+ commercial advantage or monetary compensation. For purposes of
127
+ this Public License, the exchange of the Licensed Material for
128
+ other material subject to Copyright and Similar Rights by digital
129
+ file-sharing or similar means is NonCommercial provided there is
130
+ no payment of monetary compensation in connection with the
131
+ exchange.
132
+
133
+ l. Share means to provide material to the public by any means or
134
+ process that requires permission under the Licensed Rights, such
135
+ as reproduction, public display, public performance, distribution,
136
+ dissemination, communication, or importation, and to make material
137
+ available to the public including in ways that members of the
138
+ public may access the material from a place and at a time
139
+ individually chosen by them.
140
+
141
+ m. Sui Generis Database Rights means rights other than copyright
142
+ resulting from Directive 96/9/EC of the European Parliament and of
143
+ the Council of 11 March 1996 on the legal protection of databases,
144
+ as amended and/or succeeded, as well as other essentially
145
+ equivalent rights anywhere in the world.
146
+
147
+ n. You means the individual or entity exercising the Licensed Rights
148
+ under this Public License. Your has a corresponding meaning.
149
+
150
+
151
+ Section 2 -- Scope.
152
+
153
+ a. License grant.
154
+
155
+ 1. Subject to the terms and conditions of this Public License,
156
+ the Licensor hereby grants You a worldwide, royalty-free,
157
+ non-sublicensable, non-exclusive, irrevocable license to
158
+ exercise the Licensed Rights in the Licensed Material to:
159
+
160
+ a. reproduce and Share the Licensed Material, in whole or
161
+ in part, for NonCommercial purposes only; and
162
+
163
+ b. produce, reproduce, and Share Adapted Material for
164
+ NonCommercial purposes only.
165
+
166
+ 2. Exceptions and Limitations. For the avoidance of doubt, where
167
+ Exceptions and Limitations apply to Your use, this Public
168
+ License does not apply, and You do not need to comply with
169
+ its terms and conditions.
170
+
171
+ 3. Term. The term of this Public License is specified in Section
172
+ 6(a).
173
+
174
+ 4. Media and formats; technical modifications allowed. The
175
+ Licensor authorizes You to exercise the Licensed Rights in
176
+ all media and formats whether now known or hereafter created,
177
+ and to make technical modifications necessary to do so. The
178
+ Licensor waives and/or agrees not to assert any right or
179
+ authority to forbid You from making technical modifications
180
+ necessary to exercise the Licensed Rights, including
181
+ technical modifications necessary to circumvent Effective
182
+ Technological Measures. For purposes of this Public License,
183
+ simply making modifications authorized by this Section 2(a)
184
+ (4) never produces Adapted Material.
185
+
186
+ 5. Downstream recipients.
187
+
188
+ a. Offer from the Licensor -- Licensed Material. Every
189
+ recipient of the Licensed Material automatically
190
+ receives an offer from the Licensor to exercise the
191
+ Licensed Rights under the terms and conditions of this
192
+ Public License.
193
+
194
+ b. Additional offer from the Licensor -- Adapted Material.
195
+ Every recipient of Adapted Material from You
196
+ automatically receives an offer from the Licensor to
197
+ exercise the Licensed Rights in the Adapted Material
198
+ under the conditions of the Adapter's License You apply.
199
+
200
+ c. No downstream restrictions. You may not offer or impose
201
+ any additional or different terms or conditions on, or
202
+ apply any Effective Technological Measures to, the
203
+ Licensed Material if doing so restricts exercise of the
204
+ Licensed Rights by any recipient of the Licensed
205
+ Material.
206
+
207
+ 6. No endorsement. Nothing in this Public License constitutes or
208
+ may be construed as permission to assert or imply that You
209
+ are, or that Your use of the Licensed Material is, connected
210
+ with, or sponsored, endorsed, or granted official status by,
211
+ the Licensor or others designated to receive attribution as
212
+ provided in Section 3(a)(1)(A)(i).
213
+
214
+ b. Other rights.
215
+
216
+ 1. Moral rights, such as the right of integrity, are not
217
+ licensed under this Public License, nor are publicity,
218
+ privacy, and/or other similar personality rights; however, to
219
+ the extent possible, the Licensor waives and/or agrees not to
220
+ assert any such rights held by the Licensor to the limited
221
+ extent necessary to allow You to exercise the Licensed
222
+ Rights, but not otherwise.
223
+
224
+ 2. Patent and trademark rights are not licensed under this
225
+ Public License.
226
+
227
+ 3. To the extent possible, the Licensor waives any right to
228
+ collect royalties from You for the exercise of the Licensed
229
+ Rights, whether directly or through a collecting society
230
+ under any voluntary or waivable statutory or compulsory
231
+ licensing scheme. In all other cases the Licensor expressly
232
+ reserves any right to collect such royalties, including when
233
+ the Licensed Material is used other than for NonCommercial
234
+ purposes.
235
+
236
+
237
+ Section 3 -- License Conditions.
238
+
239
+ Your exercise of the Licensed Rights is expressly made subject to the
240
+ following conditions.
241
+
242
+ a. Attribution.
243
+
244
+ 1. If You Share the Licensed Material (including in modified
245
+ form), You must:
246
+
247
+ a. retain the following if it is supplied by the Licensor
248
+ with the Licensed Material:
249
+
250
+ i. identification of the creator(s) of the Licensed
251
+ Material and any others designated to receive
252
+ attribution, in any reasonable manner requested by
253
+ the Licensor (including by pseudonym if
254
+ designated);
255
+
256
+ ii. a copyright notice;
257
+
258
+ iii. a notice that refers to this Public License;
259
+
260
+ iv. a notice that refers to the disclaimer of
261
+ warranties;
262
+
263
+ v. a URI or hyperlink to the Licensed Material to the
264
+ extent reasonably practicable;
265
+
266
+ b. indicate if You modified the Licensed Material and
267
+ retain an indication of any previous modifications; and
268
+
269
+ c. indicate the Licensed Material is licensed under this
270
+ Public License, and include the text of, or the URI or
271
+ hyperlink to, this Public License.
272
+
273
+ 2. You may satisfy the conditions in Section 3(a)(1) in any
274
+ reasonable manner based on the medium, means, and context in
275
+ which You Share the Licensed Material. For example, it may be
276
+ reasonable to satisfy the conditions by providing a URI or
277
+ hyperlink to a resource that includes the required
278
+ information.
279
+ 3. If requested by the Licensor, You must remove any of the
280
+ information required by Section 3(a)(1)(A) to the extent
281
+ reasonably practicable.
282
+
283
+ b. ShareAlike.
284
+
285
+ In addition to the conditions in Section 3(a), if You Share
286
+ Adapted Material You produce, the following conditions also apply.
287
+
288
+ 1. The Adapter's License You apply must be a Creative Commons
289
+ license with the same License Elements, this version or
290
+ later, or a BY-NC-SA Compatible License.
291
+
292
+ 2. You must include the text of, or the URI or hyperlink to, the
293
+ Adapter's License You apply. You may satisfy this condition
294
+ in any reasonable manner based on the medium, means, and
295
+ context in which You Share Adapted Material.
296
+
297
+ 3. You may not offer or impose any additional or different terms
298
+ or conditions on, or apply any Effective Technological
299
+ Measures to, Adapted Material that restrict exercise of the
300
+ rights granted under the Adapter's License You apply.
301
+
302
+
303
+ Section 4 -- Sui Generis Database Rights.
304
+
305
+ Where the Licensed Rights include Sui Generis Database Rights that
306
+ apply to Your use of the Licensed Material:
307
+
308
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right
309
+ to extract, reuse, reproduce, and Share all or a substantial
310
+ portion of the contents of the database for NonCommercial purposes
311
+ only;
312
+
313
+ b. if You include all or a substantial portion of the database
314
+ contents in a database in which You have Sui Generis Database
315
+ Rights, then the database in which You have Sui Generis Database
316
+ Rights (but not its individual contents) is Adapted Material,
317
+ including for purposes of Section 3(b); and
318
+
319
+ c. You must comply with the conditions in Section 3(a) if You Share
320
+ all or a substantial portion of the contents of the database.
321
+
322
+ For the avoidance of doubt, this Section 4 supplements and does not
323
+ replace Your obligations under this Public License where the Licensed
324
+ Rights include other Copyright and Similar Rights.
325
+
326
+
327
+ Section 5 -- Disclaimer of Warranties and Limitation of Liability.
328
+
329
+ a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
330
+ EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
331
+ AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
332
+ ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
333
+ IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
334
+ WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
335
+ PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
336
+ ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
337
+ KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
338
+ ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
339
+
340
+ b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
341
+ TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
342
+ NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
343
+ INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
344
+ COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
345
+ USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
346
+ ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
347
+ DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
348
+ IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
349
+
350
+ c. The disclaimer of warranties and limitation of liability provided
351
+ above shall be interpreted in a manner that, to the extent
352
+ possible, most closely approximates an absolute disclaimer and
353
+ waiver of all liability.
354
+
355
+
356
+ Section 6 -- Term and Termination.
357
+
358
+ a. This Public License applies for the term of the Copyright and
359
+ Similar Rights licensed here. However, if You fail to comply with
360
+ this Public License, then Your rights under this Public License
361
+ terminate automatically.
362
+
363
+ b. Where Your right to use the Licensed Material has terminated under
364
+ Section 6(a), it reinstates:
365
+
366
+ 1. automatically as of the date the violation is cured, provided
367
+ it is cured within 30 days of Your discovery of the
368
+ violation; or
369
+
370
+ 2. upon express reinstatement by the Licensor.
371
+
372
+ For the avoidance of doubt, this Section 6(b) does not affect any
373
+ right the Licensor may have to seek remedies for Your violations
374
+ of this Public License.
375
+
376
+ c. For the avoidance of doubt, the Licensor may also offer the
377
+ Licensed Material under separate terms or conditions or stop
378
+ distributing the Licensed Material at any time; however, doing so
379
+ will not terminate this Public License.
380
+
381
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
382
+ License.
383
+
384
+
385
+ Section 7 -- Other Terms and Conditions.
386
+
387
+ a. The Licensor shall not be bound by any additional or different
388
+ terms or conditions communicated by You unless expressly agreed.
389
+
390
+ b. Any arrangements, understandings, or agreements regarding the
391
+ Licensed Material not stated herein are separate from and
392
+ independent of the terms and conditions of this Public License.
393
+
394
+
395
+ Section 8 -- Interpretation.
396
+
397
+ a. For the avoidance of doubt, this Public License does not, and
398
+ shall not be interpreted to, reduce, limit, restrict, or impose
399
+ conditions on any use of the Licensed Material that could lawfully
400
+ be made without permission under this Public License.
401
+
402
+ b. To the extent possible, if any provision of this Public License is
403
+ deemed unenforceable, it shall be automatically reformed to the
404
+ minimum extent necessary to make it enforceable. If the provision
405
+ cannot be reformed, it shall be severed from this Public License
406
+ without affecting the enforceability of the remaining terms and
407
+ conditions.
408
+
409
+ c. No term or condition of this Public License will be waived and no
410
+ failure to comply consented to unless expressly agreed to by the
411
+ Licensor.
412
+
413
+ d. Nothing in this Public License constitutes or may be interpreted
414
+ as a limitation upon, or waiver of, any privileges and immunities
415
+ that apply to the Licensor or You, including from the legal
416
+ processes of any jurisdiction or authority.
417
+
418
+ =======================================================================
419
+
420
+ Creative Commons is not a party to its public
421
+ licenses. Notwithstanding, Creative Commons may elect to apply one of
422
+ its public licenses to material it publishes and in those instances
423
+ will be considered the “Licensor.” The text of the Creative Commons
424
+ public licenses is dedicated to the public domain under the CC0 Public
425
+ Domain Dedication. Except for the limited purpose of indicating that
426
+ material is shared under a Creative Commons public license or as
427
+ otherwise permitted by the Creative Commons policies published at
428
+ creativecommons.org/policies, Creative Commons does not authorize the
429
+ use of the trademark "Creative Commons" or any other trademark or logo
430
+ of Creative Commons without its prior written consent including,
431
+ without limitation, in connection with any unauthorized modifications
432
+ to any of its public licenses or any other arrangements,
433
+ understandings, or agreements concerning use of licensed material. For
434
+ the avoidance of doubt, this paragraph does not form part of the
435
+ public licenses.
436
+
437
+ Creative Commons may be contacted at creativecommons.org.
README.zh.md ADDED
@@ -0,0 +1,74 @@
1
+ # Fish Speech
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/lengyue233/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/lengyue233/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ <a target="_blank" href="https://github.com/fishaudio/fish-speech/actions/workflows/build-windows-package.yml">
14
+ <img alt="Action" src="https://img.shields.io/github/actions/workflow/status/fishaudio/fish-speech/build-windows-package.yml?style=flat-square&label=Build%20Windows%20Package&logo=github"/>
15
+ </a>
16
+ </div>
17
+
18
+ This codebase and its models are released under the CC-BY-NC-SA-4.0 license. Please refer to [LICENSE](LICENSE) for more details.
19
+
20
+ ## Disclaimer
21
+
22
+ We assume no responsibility for any illegal use of this codebase. Please refer to your local laws regarding the DMCA (Digital Millennium Copyright Act) and other relevant regulations.
23
+
24
+ ## Online Demo
25
+
26
+ [Fish Audio](https://fish.audio)
27
+
28
+ ## Quick Start for Local Inference
29
+
30
+ [inference.ipynb](/inference.ipynb)
31
+
32
+ ## Videos
33
+
34
+ #### 1.2 Introduction: https://www.bilibili.com/video/BV1wz421B71D
35
+
36
+ #### 1.1 Technical introduction: https://www.bilibili.com/video/BV1zJ4m1K7cj
37
+
38
+ ## Documentation
39
+
40
+ - [English](https://speech.fish.audio/en/)
41
+ - [中文](https://speech.fish.audio/)
42
+ - [日本語](https://speech.fish.audio/ja/)
43
+
44
+ ## Samples
45
+
46
+ - [English](https://speech.fish.audio/en/samples/)
47
+ - [中文](https://speech.fish.audio/samples/)
48
+ - [日本語](https://speech.fish.audio/ja/samples/)
49
+
50
+ ## Credits
51
+
52
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
53
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
54
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
55
+ - [MQTTS](https://github.com/b04901014/MQTTS)
56
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
57
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
58
+
59
+ ## Sponsors
60
+
61
+ <div>
62
+ <a href="https://6block.com/">
63
+ <img src="https://avatars.githubusercontent.com/u/60573493" width="100" height="100" alt="6Block Avatar"/>
64
+ </a>
65
+ <br>
66
+ <a href="https://6block.com/">数据处理服务器由 6Block 提供</a>
67
+ </div>
68
+ <div>
69
+ <a href="https://www.lepton.ai/">
70
+ <img src="https://www.lepton.ai/favicons/apple-touch-icon.png" width="100" height="100" alt="Lepton Avatar"/>
71
+ </a>
72
+ <br>
73
+ <a href="https://www.lepton.ai/">Fish Audio 在线推理与 Lepton 合作</a>
74
+ </div>
docker-compose.dev.yml ADDED
@@ -0,0 +1,16 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ fish-speech:
5
+ build: .
6
+ container_name: fish-speech
7
+ volumes:
8
+ - ./:/exp
9
+ deploy:
10
+ resources:
11
+ reservations:
12
+ devices:
13
+ - driver: nvidia
14
+ count: all
15
+ capabilities: [gpu]
16
+ command: tail -f /dev/null
dockerfile ADDED
@@ -0,0 +1,24 @@
1
+ FROM python:3.10.14-bookworm
2
+
3
+ # Install system dependencies
4
+ ENV DEBIAN_FRONTEND=noninteractive
5
+ RUN apt-get update && apt-get install -y git curl build-essential ffmpeg libsm6 libxext6 libjpeg-dev \
6
+ zlib1g-dev aria2 zsh openssh-server sudo protobuf-compiler cmake libsox-dev && \
7
+ apt-get clean && rm -rf /var/lib/apt/lists/*
8
+
9
+ # Install oh-my-zsh so your terminal looks nice
10
+ RUN sh -c "$(curl https://raw.githubusercontent.com/robbyrussell/oh-my-zsh/master/tools/install.sh)" "" --unattended
11
+
12
+ # Set zsh as default shell
13
+ RUN chsh -s /usr/bin/zsh
14
+ ENV SHELL=/usr/bin/zsh
15
+
16
+ # Setup torchaudio
17
+ RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
18
+
19
+ # Project Env
20
+ WORKDIR /exp
21
+ COPY . .
22
+ RUN pip3 install -e .
23
+
24
+ CMD /bin/zsh
docs/CNAME ADDED
@@ -0,0 +1 @@
1
+ speech.fish.audio
docs/assets/figs/VS_1.jpg ADDED
docs/assets/figs/diagram.png ADDED
docs/en/finetune.md ADDED
@@ -0,0 +1,125 @@
1
+ # Fine-tuning
2
+
3
+ If you have opened this page, the performance of the few-shot pre-trained model probably did not meet your needs, and you want to fine-tune a model to improve its performance on your dataset.
4
+
5
+ In the current version, you only need to fine-tune the 'LLAMA' part.
6
+
7
+ ## Fine-tuning LLAMA
8
+ ### 1. Prepare the dataset
9
+
10
+ ```
11
+ .
12
+ ├── SPK1
13
+ │ ├── 21.15-26.44.lab
14
+ │ ├── 21.15-26.44.mp3
15
+ │ ├── 27.51-29.98.lab
16
+ │ ├── 27.51-29.98.mp3
17
+ │ ├── 30.1-32.71.lab
18
+ │ └── 30.1-32.71.mp3
19
+ └── SPK2
20
+ ├── 38.79-40.85.lab
21
+ └── 38.79-40.85.mp3
22
+ ```
23
+
24
+ You need to convert your dataset into the above format and place it under `data`. Audio files may have the extension `.mp3`, `.wav`, or `.flac`, and each annotation file should have the extension `.lab`.
25
+
26
+ !!! warning
27
+ It's recommended to apply loudness normalization to the dataset. You can use [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) to do this.
28
+
29
+ ```bash
30
+ fap loudness-norm data-raw data --clean
31
+ ```
32
+
33
+
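+ Before extracting semantic tokens, it can help to sanity-check the layout. The following is a minimal sketch (not part of the repository) that walks `data/` and reports any audio file without a matching `.lab` transcript; the directory name and extensions follow the structure described above:
+
+ ```python
+ from pathlib import Path
+
+ AUDIO_EXTS = {".mp3", ".wav", ".flac"}
+
+ def check_dataset(root: str = "data") -> None:
+     """Report audio files that are missing a .lab transcript with the same stem."""
+     missing = []
+     for audio in Path(root).rglob("*"):
+         if audio.suffix.lower() not in AUDIO_EXTS:
+             continue
+         if not audio.with_suffix(".lab").exists():
+             missing.append(audio)
+     if missing:
+         print(f"{len(missing)} audio file(s) have no matching .lab transcript:")
+         for path in missing:
+             print(f"  {path}")
+     else:
+         print("All audio files have matching .lab transcripts.")
+
+ if __name__ == "__main__":
+     check_dataset()
+ ```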
34
+ ### 2. Batch extraction of semantic tokens
35
+
36
+ Make sure you have downloaded the VQGAN weights. If not, run the following command:
37
+
38
+ ```bash
39
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
40
+ ```
41
+
42
+ You can then run the following command to extract semantic tokens:
43
+
44
+ ```bash
45
+ python tools/vqgan/extract_vq.py data \
46
+ --num-workers 1 --batch-size 16 \
47
+ --config-name "firefly_gan_vq" \
48
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
49
+ ```
50
+
51
+ !!! note
52
+ You can adjust `--num-workers` and `--batch-size` to increase extraction speed, but please make sure not to exceed your GPU memory limit.
53
+ For the VITS format, you can specify a file list using `--filelist xxx.list`.
54
+
55
+ This command will create `.npy` files in the `data` directory, as shown below:
56
+
57
+ ```
58
+ .
59
+ ├── SPK1
60
+ │ ├── 21.15-26.44.lab
61
+ │ ├── 21.15-26.44.mp3
62
+ │ ├── 21.15-26.44.npy
63
+ │ ├── 27.51-29.98.lab
64
+ │ ├── 27.51-29.98.mp3
65
+ │ ├── 27.51-29.98.npy
66
+ │ ├── 30.1-32.71.lab
67
+ │ ├── 30.1-32.71.mp3
68
+ │ └── 30.1-32.71.npy
69
+ └── SPK2
70
+ ├── 38.79-40.85.lab
71
+ ├── 38.79-40.85.mp3
72
+ └── 38.79-40.85.npy
73
+ ```
74
+
75
+ ### 3. Pack the dataset into protobuf
76
+
77
+ ```bash
78
+ python tools/llama/build_dataset.py \
79
+ --input "data" \
80
+ --output "data/protos" \
81
+ --text-extension .lab \
82
+ --num-workers 16
83
+ ```
84
+
85
+ After the command finishes executing, you should see the `quantized-dataset-ft.protos` file in the `data` directory.
86
+
87
+ ### 4. Finally, fine-tuning with LoRA
88
+
89
+ Similarly, make sure you have downloaded the `LLAMA` weights. If not, run the following command:
90
+
91
+ ```bash
92
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
93
+ ```
94
+
95
+ Finally, you can start the fine-tuning by running the following command:
96
+
97
+ ```bash
98
+ python fish_speech/train.py --config-name text2semantic_finetune \
99
+ project=$project \
100
+ +lora@model.model.lora_config=r_8_alpha_16
101
+ ```
102
+
103
+ !!! note
104
+ You can modify the training parameters such as `batch_size`, `gradient_accumulation_steps`, etc. to fit your GPU memory by modifying `fish_speech/configs/text2semantic_finetune.yaml`.
105
+
106
+ !!! note
107
+ For Windows users, you can use `trainer.strategy.process_group_backend=gloo` to avoid `nccl` issues.
108
+
109
+ After training is complete, you can refer to the [inference](inference.md) section, and use `--speaker SPK1` to generate speech.
110
+
111
+ !!! info
112
+ By default, the model will only learn the speaker's speech patterns and not the timbre. You still need to use prompts to ensure timbre stability.
113
+ If you want to learn the timbre, you can increase the number of training steps, but this may lead to overfitting.
114
+
115
+ After training, you need to convert the LoRA weights to regular weights before performing inference.
116
+
117
+ ```bash
118
+ python tools/llama/merge_lora.py \
119
+ --lora-config r_8_alpha_16 \
120
+ --base-weight checkpoints/fish-speech-1.2-sft \
121
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
122
+ --output checkpoints/fish-speech-1.2-sft-yth-lora/
123
+ ```
124
+ !!! note
125
+ You may also try other checkpoints. We suggest using the earliest checkpoint that meets your requirements, as they often perform better on out-of-distribution (OOD) data.
docs/en/index.md ADDED
@@ -0,0 +1,128 @@
1
+ # Introduction
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/lengyue233/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/lengyue233/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ </div>
14
+
15
+ !!! warning
16
+ We assume no responsibility for any illegal use of the codebase. Please refer to the local laws regarding DMCA (Digital Millennium Copyright Act) and other relevant laws in your area. <br/>
17
+ This codebase is released under the `BSD-3-Clause` license, and all models are released under the CC-BY-NC-SA-4.0 license.
18
+
19
+ <p align="center">
20
+ <img src="../assets/figs/diagram.png" width="75%">
21
+ </p>
22
+
23
+ ## Requirements
24
+
25
+ - GPU Memory: 4GB (for inference), 8GB (for fine-tuning)
26
+ - System: Linux, Windows
27
+
28
+ ## Windows Setup
29
+
30
+ Professional Windows users may consider using WSL2 or Docker to run the codebase.
31
+
32
+ Non-professional Windows users can consider the following methods to run the codebase without a Linux environment (with model compilation capabilities aka `torch.compile`):
33
+
34
+ <ol>
35
+ <li>Unzip the project package.</li>
36
+ <li>Click <code>install_env.bat</code> to install the environment.
37
+ <ul>
38
+ <li>You can decide whether to use a mirror site for downloads by editing the <code>USE_MIRROR</code> item in <code>install_env.bat</code>.</li>
39
+ <li><code>USE_MIRROR=false</code> downloads the latest stable version of <code>torch</code> from the original site. <code>USE_MIRROR=true</code> downloads the latest version of <code>torch</code> from a mirror site. The default is <code>true</code>.</li>
40
+ <li>You can decide whether to enable the compiled environment download by editing the <code>INSTALL_TYPE</code> item in <code>install_env.bat</code>.</li>
41
+ <li><code>INSTALL_TYPE=preview</code> downloads the preview version with the compiled environment. <code>INSTALL_TYPE=stable</code> downloads the stable version without the compiled environment.</li>
42
+ </ul>
43
+ </li>
44
+ <li>If step 2 used <code>INSTALL_TYPE=preview</code>, execute this step (optional, for activating the compiled model environment):
45
+ <ol>
46
+ <li>Download the LLVM compiler using the following links:
47
+ <ul>
48
+ <li><a href="https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6 (original site download)</a></li>
49
+ <li><a href="https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6 (mirror site download)</a></li>
50
+ <li>After downloading <code>LLVM-17.0.6-win64.exe</code>, double-click to install it, choose an appropriate installation location, and most importantly, check <code>Add Path to Current User</code> to add to the environment variables.</li>
51
+ <li>Confirm the installation is complete.</li>
52
+ </ul>
53
+ </li>
54
+ <li>Download and install the Microsoft Visual C++ Redistributable package to resolve potential .dll missing issues.
55
+ <ul>
56
+ <li><a href="https://aka.ms/vs/17/release/vc_redist.x64.exe">MSVC++ 14.40.33810.0 Download</a></li>
57
+ </ul>
58
+ </li>
59
+ <li>Download and install Visual Studio Community Edition to obtain MSVC++ build tools, resolving LLVM header file dependencies.
60
+ <ul>
61
+ <li><a href="https://visualstudio.microsoft.com/zh-hans/downloads/">Visual Studio Download</a></li>
62
+ <li>After installing Visual Studio Installer, download Visual Studio Community 2022.</li>
63
+ <li>Click the <code>Modify</code> button as shown below, find the <code>Desktop development with C++</code> option, and check it for download.</li>
64
+ <p align="center">
65
+ <img src="../assets/figs/VS_1.jpg" width="75%">
66
+ </p>
67
+ </ul>
68
+ </li>
69
+ <li>Install <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
70
+ </ol>
71
+ </li>
72
+ <li>Double-click <code>start.bat</code> to open the Fish-Speech training/inference configuration WebUI page.
73
+ <ul>
74
+ <li>(Optional) Want to go directly to the inference page? Edit the <code>API_FLAGS.txt</code> in the project root directory and modify the first three lines as follows:
75
+ <pre><code>--infer
76
+ # --api
77
+ # --listen ...
78
+ ...</code></pre>
79
+ </li>
80
+ <li>(Optional) Want to start the API server? Edit the <code>API_FLAGS.txt</code> in the project root directory and modify the first three lines as follows:
81
+ <pre><code># --infer
82
+ --api
83
+ --listen ...
84
+ ...</code></pre>
85
+ </li>
86
+ </ul>
87
+ </li>
88
+ <li>(Optional) Double-click <code>run_cmd.bat</code> to enter the conda/python command line environment of this project.</li>
89
+ </ol>
90
+
91
+ ## Linux Setup
92
+
93
+ ```bash
94
+ # Create a python 3.10 virtual environment, you can also use virtualenv
95
+ conda create -n fish-speech python=3.10
96
+ conda activate fish-speech
97
+
98
+ # Install pytorch
99
+ pip3 install torch torchvision torchaudio
100
+
101
+ # Install fish-speech
102
+ pip3 install -e .
103
+
104
+ # (Ubuntu / Debian User) Install sox
105
+ apt install libsox-dev
106
+ ```
107
+
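+ (Optional) You can quickly verify that PyTorch was installed with GPU support before moving on. This is just a generic PyTorch check, not a project script:
+
+ ```python
+ import torch
+
+ # Print the installed PyTorch version and whether a CUDA device is visible.
+ print(torch.__version__)
+ print("CUDA available:", torch.cuda.is_available())
+ ```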
108
+ ## Changelog
109
+
110
+ - 2024/07/02: Updated Fish-Speech to version 1.2, removed the VITS decoder, and greatly enhanced zero-shot ability.
111
+ - 2024/05/10: Updated Fish-Speech to version 1.1, implemented a VITS decoder to reduce WER and improve timbre similarity.
112
+ - 2024/04/22: Released Fish-Speech version 1.0, with significant modifications to the VQGAN and LLAMA models.
113
+ - 2023/12/28: Added `lora` fine-tuning support.
114
+ - 2023/12/27: Added `gradient checkpointing`, `causal sampling`, and `flash-attn` support.
115
+ - 2023/12/19: Updated webui and HTTP API.
116
+ - 2023/12/18: Updated fine-tuning documentation and related examples.
117
+ - 2023/12/17: Updated `text2semantic` model, supporting phoneme-free mode.
118
+ - 2023/12/13: Beta version released, includes VQGAN model and a language model based on LLAMA (phoneme support only).
119
+
120
+ ## Acknowledgements
121
+
122
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
123
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
124
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
125
+ - [MQTTS](https://github.com/b04901014/MQTTS)
126
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
127
+ - [Transformers](https://github.com/huggingface/transformers)
128
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/en/inference.md ADDED
@@ -0,0 +1,153 @@
1
+ # Inference
2
+
3
+ Inference supports the command line, the HTTP API, and the WebUI.
4
+
5
+ !!! note
6
+ Overall, inference consists of several steps:
7
+
8
+ 1. Encode a given ~10 seconds of voice using VQGAN.
9
+ 2. Input the encoded semantic tokens and the corresponding text into the language model as an example.
10
+ 3. Given a new piece of text, let the model generate the corresponding semantic tokens.
11
+ 4. Input the generated semantic tokens into VITS / VQGAN to decode and generate the corresponding voice.
12
+
13
+ ## Command Line Inference
14
+
15
+ Download the required `vqgan` and `llama` models from our Hugging Face repository.
16
+
17
+ ```bash
18
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
19
+ ```
20
+
21
+ ### 1. Generate prompt from voice:
22
+
23
+ !!! note
24
+ If you plan to let the model randomly choose a voice timbre, you can skip this step.
25
+
26
+ ```bash
27
+ python tools/vqgan/inference.py \
28
+ -i "paimon.wav" \
29
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
30
+ ```
31
+
32
+ You should get a `fake.npy` file.
33
+
34
+ ### 2. Generate semantic tokens from text:
35
+
36
+ ```bash
37
+ python tools/llama/generate.py \
38
+ --text "The text you want to convert" \
39
+ --prompt-text "Your reference text" \
40
+ --prompt-tokens "fake.npy" \
41
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft" \
42
+ --num-samples 2 \
43
+ --compile
44
+ ```
45
+
46
+ This command will create a `codes_N` file in the working directory, where N is an integer starting from 0.
47
+
48
+ !!! note
49
+ You may want to use `--compile` to fuse CUDA kernels for faster inference (~30 tokens/second -> ~500 tokens/second).
50
+ If you do not plan to use acceleration, you can omit the `--compile` parameter.
51
+
52
+ !!! info
53
+ For GPUs that do not support bf16, you may need to use the `--half` parameter.
54
+
55
+ ### 3. Generate speech from semantic tokens:
56
+
57
+ #### VQGAN Decoder
58
+
59
+ ```bash
60
+ python tools/vqgan/inference.py \
61
+ -i "codes_0.npy" \
62
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
63
+ ```
64
+
65
+ ## HTTP API Inference
66
+
67
+ We provide an HTTP API for inference. You can use the following command to start the server:
68
+
69
+ ```bash
70
+ python -m tools.api \
71
+ --listen 0.0.0.0:8080 \
72
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
73
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
74
+ --decoder-config-name firefly_gan_vq
75
+ ```
76
+
77
+ If you want to speed up inference, you can add the `--compile` parameter.
78
+
79
+ After that, you can view and test the API at http://127.0.0.1:8080/.
80
+
81
+ Below is an example of sending a request using `tools/post_api.py`.
82
+
83
+ ```bash
84
+ python -m tools.post_api \
85
+ --text "Text to be input" \
86
+ --reference_audio "Path to reference audio" \
87
+ --reference_text "Text content of the reference audio" \
88
+ --streaming True
89
+ ```
90
+
91
+ The above command synthesizes the desired audio based on the reference audio information and returns it in a streaming manner.
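+ If you prefer to call the API from your own code instead of `tools/post_api.py`, the sketch below shows the general pattern of posting a request and writing a streamed response to disk. The endpoint path and JSON field names used here are assumptions for illustration only; check the API page at http://127.0.0.1:8080/ or the source of `tools/post_api.py` for the actual schema:
+
+ ```python
+ import requests
+
+ # NOTE: the endpoint path and payload fields below are assumptions -- verify them
+ # against the API documentation served at http://127.0.0.1:8080/.
+ url = "http://127.0.0.1:8080/v1/invoke"
+ payload = {"text": "Text to be input", "streaming": True}
+
+ # Stream the response and write the audio to disk chunk by chunk.
+ with requests.post(url, json=payload, stream=True) as resp:
+     resp.raise_for_status()
+     with open("generated.wav", "wb") as out:
+         for chunk in resp.iter_content(chunk_size=4096):
+             if chunk:
+                 out.write(chunk)
+ ```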
92
+
93
+ If you need to randomly select reference audio based on `{SPEAKER}` and `{EMOTION}`, configure it according to the following steps:
94
+
95
+ ### 1. Create a `ref_data` folder in the root directory of the project.
96
+
97
+ ### 2. Create a directory structure similar to the following within the `ref_data` folder.
98
+
99
+ ```
100
+ .
101
+ ├── SPEAKER1
102
+ │ ├──EMOTION1
103
+ │ │ ├── 21.15-26.44.lab
104
+ │ │ ├── 21.15-26.44.wav
105
+ │ │ ├── 27.51-29.98.lab
106
+ │ │ ├── 27.51-29.98.wav
107
+ │ │ ├── 30.1-32.71.lab
108
+ │ │ └── 30.1-32.71.flac
109
+ │ └──EMOTION2
110
+ │ ├── 30.1-32.71.lab
111
+ │ └── 30.1-32.71.mp3
112
+ └── SPEAKER2
113
+ └─── EMOTION3
114
+ ├── 30.1-32.71.lab
115
+ └── 30.1-32.71.mp3
116
+ ```
117
+
118
+ That is, first place `{SPEAKER}` folders in `ref_data`, then place `{EMOTION}` folders under each speaker, and place any number of `audio-text pairs` under each emotion folder.
119
+
120
+ ### 3. Enter the following command in the virtual environment
121
+
122
+ ```bash
123
+ python tools/gen_ref.py
124
+
125
+ ```
126
+
127
+ ### 4. Call the API.
128
+
129
+ ```bash
130
+ python -m tools.post_api \
131
+ --text "Text to be input" \
132
+ --speaker "${SPEAKER1}" \
133
+ --emotion "${EMOTION1}" \
134
+ --streaming True
135
+ ```
136
+
137
+ The above example is for testing purposes only.
138
+
139
+ ## WebUI Inference
140
+
141
+ You can start the WebUI using the following command:
142
+
143
+ ```bash
144
+ python -m tools.webui \
145
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
146
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
147
+ --decoder-config-name firefly_gan_vq
148
+ ```
149
+
150
+ !!! note
151
+ You can use Gradio environment variables such as `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, and `GRADIO_SERVER_NAME` to configure the WebUI.
152
+
153
+ Enjoy!
docs/en/samples.md ADDED
@@ -0,0 +1,223 @@
1
+ # Samples
2
+
3
+ v1.2 samples are available on [Bilibili](https://www.bilibili.com/video/BV1wz421B71D/).
4
+
5
+ The following samples are from the v1.1 model.
6
+
7
+ ## Chinese Sentence 1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>Speaker</th>
17
+ <th>Input Audio</th>
18
+ <th>Synthesized Audio</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>Nahida (Genshin Impact)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>Zhongli (Genshin Impact)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>Furina (Genshin Impact)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>Random Speaker 1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>Random Speaker 2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## Chinese Sentence 2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>Speaker</th>
64
+ <th>Input Audio</th>
65
+ <th>Synthesized Audio</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>Nahida (Genshin Impact)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>Random Speaker</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## Chinese Sentence 3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>Speaker</th>
95
+ <th>Input Audio</th>
96
+ <th>Synthesized Audio</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>Random Speaker</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## English Sentence 1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>Speaker</th>
121
+ <th>Input Audio</th>
122
+ <th>Synthesized Audio</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>Random Speaker 1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>Random Speaker 2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## English Sentence 2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>Speaker</th>
154
+ <th>Input Audio</th>
155
+ <th>Synthesized Audio</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>Random Speaker</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## Japanese Sentence 1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>Speaker</th>
180
+ <th>Input Audio</th>
181
+ <th>Synthesized Audio</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>Random Speaker 1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>Random Speaker 2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## Japanese Sentence 2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>Speaker</th>
212
+ <th>Input Audio</th>
213
+ <th>Synthesized Audio</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>Random Speaker</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
docs/ja/finetune.md ADDED
@@ -0,0 +1,125 @@
1
+ # 微調整
2
+
3
+ 明らかに、このページを開いたとき、few-shot 事前トレーニングモデルのパフォーマンスに満足していなかったことでしょう。データセット上でのパフォーマンスを向上させるためにモデルを微調整したいと考えています。
4
+
5
+ 現在のバージョンでは、「LLAMA」部分のみを微調整する必要があります。
6
+
7
+ ## LLAMAの微調整
8
+ ### 1. データセットの準備
9
+
10
+ ```
11
+ .
12
+ ├── SPK1
13
+ │ ├── 21.15-26.44.lab
14
+ │ ├── 21.15-26.44.mp3
15
+ │ ├── 27.51-29.98.lab
16
+ │ ├── 27.51-29.98.mp3
17
+ │ ├── 30.1-32.71.lab
18
+ │ └── 30.1-32.71.mp3
19
+ └── SPK2
20
+ ├── 38.79-40.85.lab
21
+ └── 38.79-40.85.mp3
22
+ ```
23
+
24
+ データセットを上記の形式に変換し、「data」ディレクトリに配置する必要があります。音声ファイルの拡張子は「.mp3」、「.wav」、または「.flac」にすることができ、注釈ファイルの拡張子は「.lab」にする必要があります。
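+ 
+ 参考までに、書き起こしテキストから `.lab` ファイルを作成する最小限のスケッチを示します(`transcripts` の中身とファイル名は説明用の例であり、実際のデータに置き換えてください)。
+ 
+ ```python
+ from pathlib import Path
+ 
+ # 音声ファイルと同名 (拡張子のみ .lab) の注釈ファイルを作成する例
+ transcripts = {
+     "data/SPK1/21.15-26.44.mp3": "ここに一つ目のセリフの書き起こしを入れます。",
+     "data/SPK1/27.51-29.98.mp3": "ここに二つ目のセリフの書き起こしを入れます。",
+ }
+ 
+ for audio_path, text in transcripts.items():
+     lab_path = Path(audio_path).with_suffix(".lab")
+     lab_path.write_text(text, encoding="utf-8")
+ ```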
25
+
26
+ !!! warning
27
+ データセットにラウドネス正規化を適用することをお勧めします。これを行うには、[fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) を使用できます。
28
+
29
+ ```bash
30
+ fap loudness-norm data-raw data --clean
31
+ ```
32
+
33
+
34
+ ### 2. セマンティックトークンのバッチ抽出
35
+
36
+ VQGANの重みをダウンロードしたことを確認してください。まだダウンロードしていない場合は、次のコマンドを実行してください。
37
+
38
+ ```bash
39
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
40
+ ```
41
+
42
+ 次に、次のコマンドを実行してセマンティックトークンを抽出できます。
43
+
44
+ ```bash
45
+ python tools/vqgan/extract_vq.py data \
46
+ --num-workers 1 --batch-size 16 \
47
+ --config-name "firefly_gan_vq" \
48
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
49
+ ```
50
+
51
+ !!! note
52
+ `--num-workers` と `--batch-size` を調整して抽出速度を上げることができますが、GPUメモリの制限を超えないようにしてください。
53
+ VITS形式の場合、`--filelist xxx.list` を使用してファイルリストを指定できます。
54
+
55
+ このコマンドは、`data`ディレクトリに`.npy`ファイルを作成します。以下のように表示されます。
56
+
57
+ ```
58
+ .
59
+ ├── SPK1
60
+ │ ├── 21.15-26.44.lab
61
+ │ ├── 21.15-26.44.mp3
62
+ │ ├── 21.15-26.44.npy
63
+ │ ├── 27.51-29.98.lab
64
+ │ ├── 27.51-29.98.mp3
65
+ │ ├── 27.51-29.98.npy
66
+ │ ├── 30.1-32.71.lab
67
+ │ ├── 30.1-32.71.mp3
68
+ │ └── 30.1-32.71.npy
69
+ └── SPK2
70
+ ├── 38.79-40.85.lab
71
+ ├── 38.79-40.85.mp3
72
+ └── 38.79-40.85.npy
73
+ ```
74
+
75
+ ### 3. データセットをprotobufにパックする
76
+
77
+ ```bash
78
+ python tools/llama/build_dataset.py \
79
+ --input "data" \
80
+ --output "data/protos" \
81
+ --text-extension .lab \
82
+ --num-workers 16
83
+ ```
84
+
85
+ コマンドの実行が完了すると、`data`ディレクトリに`quantized-dataset-ft.protos`ファイルが表示されます。
86
+
87
+ ### 4. 最後に、LoRAを使用して微調整する
88
+
89
+ 同様に、`LLAMA`の重みをダウンロードしたことを確認してください。まだダウンロードしていない場合は、次のコマンドを実行してください。
90
+
91
+ ```bash
92
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
93
+ ```
94
+
95
+ 最後に、次のコマンドを実行して微調整を開始できます。
96
+
97
+ ```bash
98
+ python fish_speech/train.py --config-name text2semantic_finetune \
99
+ project=$project \
100
+ +lora@model.model.lora_config=r_8_alpha_16
101
+ ```
102
+
103
+ !!! note
104
+ `fish_speech/configs/text2semantic_finetune.yaml` を変更して、`batch_size`、`gradient_accumulation_steps` などのトレーニングパラメータを変更し、GPUメモリに適合させることができます。
105
+
106
+ !!! note
107
+ Windowsユーザーの場合、`trainer.strategy.process_group_backend=gloo` を使用して `nccl` の問題を回避できます。
108
+
109
+ トレーニングが完了したら、[推論](inference.md)セクションを参照し、`--speaker SPK1` を使用して音声を生成します。
110
+
111
+ !!! info
112
+ デフォルトでは、モデルは話者の発話パターンのみを学習し、音色は学習しません。音色の安定性を確保するためにプロンプトを使用する必要があります。
113
+ 音色を学習したい場合は、トレーニングステップ数を増やすことができますが、これにより過学習が発生する可能性があります。
114
+
115
+ トレーニングが完了したら、推論を行う前にLoRAの重みを通常の重みに変換する必要があります。
116
+
117
+ ```bash
118
+ python tools/llama/merge_lora.py \
119
+ --lora-config r_8_alpha_16 \
120
+ --base-weight checkpoints/fish-speech-1.2-sft \
121
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
122
+ --output checkpoints/fish-speech-1.2-sft-yth-lora/
123
+ ```
124
+ !!! note
125
+ 他のチェックポイントを試すこともできます。要件を満たす最も早いチェックポイントを使用することをお勧めします。これらは通常、分布外(OOD)データでより良いパフォーマンスを発揮します。
docs/ja/index.md ADDED
@@ -0,0 +1,128 @@
1
+ # イントロダクション
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/lengyue233/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/lengyue233/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ </div>
14
+
15
+ !!! warning
16
+ 私たちは、コードベースの違法な使用について一切の責任を負いません。お住まいの地域の DMCA(デジタルミレニアム著作権法)およびその他の関連法については、現地の法律を参照してください。 <br/>
17
+ このコードベースは `BSD-3-Clause` ライセンスの下でリリースされており、すべてのモデルは CC-BY-NC-SA-4.0 ライセンスの下でリリースされています。
18
+
19
+ <p align="center">
20
+ <img src="../assets/figs/diagram.png" width="75%">
21
+ </p>
22
+
23
+ ## 要件
24
+
25
+ - GPU メモリ: 4GB(推論用)、8GB(微調整用)
26
+ - システム: Linux、Windows
27
+
28
+ ## Windows セットアップ
29
+
30
+ Windows のプロユーザーは、コードベースを実行するために WSL2 または Docker を検討することができます。
31
+
32
+ 非プロの Windows ユーザーは、Linux 環境なしでコードベースを実行するために以下の方法を検討することができます(モデルコンパイル機能付き、つまり `torch.compile`):
33
+
34
+ <ol>
35
+ <li>プロジェクトパッケージを解凍します。</li>
36
+ <li><code>install_env.bat</code>をクリックして環境をインストールします。
37
+ <ul>
38
+ <li><code>install_env.bat</code>の<code>USE_MIRROR</code>項目を編集して、ミラーサイトを使用するかどうかを決定できます。</li>
39
+ <li><code>USE_MIRROR=false</code>は、最新の安定版<code>torch</code>をオリジナルサイトからダウンロードします。<code>USE_MIRROR=true</code>は、最新の<code>torch</code>をミラーサイトからダウンロードします。デフォルトは<code>true</code>です。</li>
40
+ <li><code>install_env.bat</code>の<code>INSTALL_TYPE</code>項目を編集して、コンパイル環境のダウンロードを有効にするかどうかを決定できます。</li>
41
+ <li><code>INSTALL_TYPE=preview</code>は、コンパイル環境付きのプレビュー版をダウンロードします。<code>INSTALL_TYPE=stable</code>は、コンパイル環境なしの安定版をダウンロードします。</li>
42
+ </ul>
43
+ </li>
44
+ <li>ステップ2で<code>INSTALL_TYPE=preview</code>の場合、このステップを実行します(オプション、コンパイルモデル環境を有効にするため):
45
+ <ol>
46
+ <li>以下のリンクを使用してLLVMコンパイラをダウンロードします:
47
+ <ul>
48
+ <li><a href="https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6(オリジナルサイトダウンロード)</a></li>
49
+ <li><a href="https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true">LLVM-17.0.6(ミラーサイトダウンロード)</a></li>
50
+ <li><code>LLVM-17.0.6-win64.exe</code>をダウンロードした後、ダブルクリックしてインストールし、適切なインストール場所を選択し、最も重要なのは<code>Add Path to Current User</code>をチェックして環境変数に追加することです。</li>
51
+ <li>インストールが完了したことを確認します。</li>
52
+ </ul>
53
+ </li>
54
+ <li>Microsoft Visual C++ 再頒布可能パッケージをダウンロードしてインストールし、潜在的な.dllの欠落問題を解決します。
55
+ <ul>
56
+ <li><a href="https://aka.ms/vs/17/release/vc_redist.x64.exe">MSVC++ 14.40.33810.0 ダウンロード</a></li>
57
+ </ul>
58
+ </li>
59
+ <li>Visual Studio Community Editionをダウンロードしてインストールし、MSVC++ビルドツールを取得し、LLVMのヘッダーファイル依存関係を解決します。
60
+ <ul>
61
+ <li><a href="https://visualstudio.microsoft.com/zh-hans/downloads/">Visual Studio ダウンロード</a></li>
62
+ <li>Visual Studio Installerをインストールした後、Visual Studio Community 2022をダウンロードします。</li>
63
+ <li>以下の図のように<code>Modify</code>ボタンをクリックし、<code>Desktop development with C++</code>オプションを見つけてチェックしてダウンロードします。</li>
64
+ <p align="center">
65
+ <img src="../assets/figs/VS_1.jpg" width="75%">
66
+ </p>
67
+ </ul>
68
+ </li>
69
+ <li>インストール <a href="https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64">CUDA Toolkit 12</a></li>
70
+ </ol>
71
+ </li>
72
+ <li><code>start.bat</code>をダブルクリックして、Fish-Speechトレーニング推論設定WebUIページに入ります。
73
+ <ul>
74
+ <li>(オプション)直接推論ページに行きたい場合は、プロジェクトルートディレクトリの<code>API_FLAGS.txt</code>を編集し、最初の3行を次のように変更します:
75
+ <pre><code>--infer
76
+ # --api
77
+ # --listen ...
78
+ ...</code></pre>
79
+ </li>
80
+ <li>(オプション)APIサーバーを起動したい場合は、プロジェクトルートディレクトリの<code>API_FLAGS.txt</code>を編集し、最初の3行を次のように変更します:
81
+ <pre><code># --infer
82
+ --api
83
+ --listen ...
84
+ ...</code></pre>
85
+ </li>
86
+ </ul>
87
+ </li>
88
+ <li>(オプション)<code>run_cmd.bat</code>をダブルクリックして、このプロジェクトのconda/pythonコマンドライン環境に入ります。</li>
89
+ </ol>
90
+
91
+ ## Linux セットアップ
92
+
93
+ ```bash
94
+ # python 3.10仮想環境を作成します。virtualenvも使用できます。
95
+ conda create -n fish-speech python=3.10
96
+ conda activate fish-speech
97
+
98
+ # pytorchをインストールします。
99
+ pip3 install torch torchvision torchaudio
100
+
101
+ # fish-speechをインストールします。
102
+ pip3 install -e .
103
+
104
+ # (Ubuntu / Debianユーザー) soxをインストールします。
105
+ apt install libsox-dev
106
+ ```
107
+
108
+ ## 変更履歴
109
+
110
+ - 2024/07/02: Fish-Speech を 1.2 バージョンに更新し、VITS デコーダーを削除し、ゼロショット能力を大幅に強化しました。
111
+ - 2024/05/10: Fish-Speech を 1.1 バージョンに更新し、VITS デコーダーを実装して WER を減少させ、音色の類似性を向上させました。
112
+ - 2024/04/22: Fish-Speech 1.0 バージョンを完成させ、VQGAN および LLAMA モデルを大幅に修正しました。
113
+ - 2023/12/28: `lora`微調整サポートを追加しました。
114
+ - 2023/12/27: `gradient checkpointing`、`causal sampling`、および`flash-attn`サポートを追加しました。
115
+ - 2023/12/19: webui および HTTP API を更新しました。
116
+ - 2023/12/18: 微調整ドキュメントおよび関連例を更新しました。
117
+ - 2023/12/17: `text2semantic`モデルを更新し、音素フリーモードをサポートしました。
118
+ - 2023/12/13: ベータ版をリリースし、VQGAN モデルおよび LLAMA に基づく言語モデル(音素のみサポート)を含みます。
119
+
120
+ ## 謝辞
121
+
122
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
123
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
124
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
125
+ - [MQTTS](https://github.com/b04901014/MQTTS)
126
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
127
+ - [Transformers](https://github.com/huggingface/transformers)
128
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/ja/inference.md ADDED
@@ -0,0 +1,157 @@
1
+ # 推論
2
+
3
+ 推論は、コマンドライン、HTTP API、および Web UI をサポートしています。
4
+
5
+ !!! note
6
+ 全体として、推論は次のいくつかの部分で構成されています:
7
+
8
+ 1. VQGANを使用して、与えられた約10秒の音声をエンコードします。
9
+ 2. エンコードされたセマンティックトークンと対応するテキストを例として言語モデルに入力します。
10
+ 3. 新しいテキストが与えられた場合、モデルに対応するセマンティックトークンを生成させます。
11
+ 4. 生成されたセマンティックトークンをVQGANに入力してデコードし、対応する音声を生成します。
12
+
13
+ ## コマンドライン推論
14
+
15
+ 必要な`vqgan`および`llama`モデルを Hugging Face リポジトリからダウンロードします。
16
+
17
+ ```bash
18
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
19
+ ```
20
+
21
+ ### 1. 音声からプロンプトを生成する:
22
+
23
+ !!! note
24
+ モデルにランダムに音声の音色を選ばせる場合、このステップをスキップできます。
25
+
26
+ ```bash
27
+ python tools/vqgan/inference.py \
28
+ -i "paimon.wav" \
29
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
30
+ ```
31
+
32
+ `fake.npy`ファイルが生成されるはずです。
33
+
34
+ ### 2. テキストからセマンティックトークンを生成する:
35
+
36
+ ```bash
37
+ python tools/llama/generate.py \
38
+ --text "変換したいテキスト" \
39
+ --prompt-text "参照テキスト" \
40
+ --prompt-tokens "fake.npy" \
41
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft" \
42
+ --num-samples 2 \
43
+ --compile
44
+ ```
45
+
46
+ このコマンドは、作業ディレクトリに`codes_N`ファイルを作成します。ここで、N は 0 から始まる整数です。
47
+
48
+ !!! note
49
+ `--compile`を使用して CUDA カーネルを融合し、より高速な推論を実現することができます(約 30 トークン/秒 -> 約 500 トークン/秒)。
50
+ それに対応して、加速を使用しない場合は、`--compile`パラメータをコメントアウトできます。
51
+
52
+ !!! info
53
+ bf16 をサポートしていない GPU の場合、`--half`パラメータを使用する必要があるかもしれません。
54
+
55
+ ### 3. セマンティックトークンから音声を生成する:
56
+
57
+ #### VQGAN デコーダー
58
+
59
+ ```bash
60
+ python tools/vqgan/inference.py \
61
+ -i "codes_0.npy" \
62
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
63
+ ```
64
+
65
+ ## HTTP API 推論
66
+
67
+ 推論のための HTTP API を提供しています。次のコマンドを使用してサーバーを起動できます:
68
+
69
+ ```bash
70
+ python -m tools.api \
71
+ --listen 0.0.0.0:8080 \
72
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
73
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
74
+ --decoder-config-name firefly_gan_vq
75
+ ```
76
+
77
+ 推論を高速化したい場合は、--compile パラメータを追加できます。
78
+
79
+ その後、`http://127.0.0.1:8080/`で API を表示およびテストできます。
80
+
81
+ 以下は、`tools/post_api.py` を使用してリクエストを送信する例です。
82
+
83
+ ```bash
84
+ python -m tools.post_api \
85
+ --text "入力するテキスト" \
86
+ --reference_audio "参照音声へのパス" \
87
+ --reference_text "参照音声テキスト" \
88
+ --streaming True
89
+ ```
90
+
91
+ 上記のコマンドは、参照音声の情報に基づいて必要な音声を合成し、ストリーミング方式で返すことを示しています。
92
+
93
+ `{SPEAKER}`と`{EMOTION}`に基づいて参照音声をランダムに選択する必要がある場合は、以下の手順に従って設定します:
94
+
95
+ ### 1. プロジェクトのルートディレクトリに`ref_data`フォルダを作成します。
96
+
97
+ ### 2. `ref_data`フォルダ内に次のような構造のディレクトリを作成します。
98
+
99
+ ```
100
+ .
101
+ ├── SPEAKER1
102
+ │ ├──EMOTION1
103
+ │ │ ├── 21.15-26.44.lab
104
+ │ │ ├── 21.15-26.44.wav
105
+ │ │ ├── 27.51-29.98.lab
106
+ │ │ ├── 27.51-29.98.wav
107
+ │ │ ├── 30.1-32.71.lab
108
+ │ │ └── 30.1-32.71.flac
109
+ │ └──EMOTION2
110
+ │ ├── 30.1-32.71.lab
111
+ │ └── 30.1-32.71.mp3
112
+ └── SPEAKER2
113
+ └─── EMOTION3
114
+ ├── 30.1-32.71.lab
115
+ └── 30.1-32.71.mp3
116
+
117
+ ```
118
+
119
+ つまり、まず`ref_data`に`{SPEAKER}`フォルダを配置し、各スピーカーの下に`{EMOTION}`フォルダを配置し、各感情フォルダの下に任意の数の音声-テキストペアを配置します。
120
+
121
+ ### 3. 仮想環境で以下のコマンドを入力します.
122
+
123
+ ```bash
124
+ python tools/gen_ref.py
125
+
126
+ ```
127
+
128
+ 参照ディレクトリを生成します。
129
+
130
+ ### 4. API を呼び出します。
131
+
132
+ ```bash
133
+ python -m tools.post_api \
134
+ --text "入力するテキスト" \
135
+ --speaker "${SPEAKER1}" \
136
+ --emotion "${EMOTION1}" \
137
+ --streaming True
138
+
139
+ ```
140
+
141
+ 上記の例はテスト目的のみです。
142
+
143
+ ## WebUI 推論
144
+
145
+ 次のコマンドを使用して WebUI を起動できます:
146
+
147
+ ```bash
148
+ python -m tools.webui \
149
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
150
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
151
+ --decoder-config-name firefly_gan_vq
152
+ ```
153
+
154
+ !!! note
155
+ Gradio 環境変数(`GRADIO_SHARE`、`GRADIO_SERVER_PORT`、`GRADIO_SERVER_NAME`など)を使用して WebUI を構成できます。
156
+
157
+ お楽しみください!
docs/ja/samples.md ADDED
@@ -0,0 +1,223 @@
1
+ # サンプル
2
+
3
+ v1.2のサンプルは[Bilibili](https://www.bilibili.com/video/BV1wz421B71D/)で利用可能です。
4
+
5
+ 以下のサンプルはv1.1モデルからのものです。
6
+
7
+ ## 中国語の文1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>話者</th>
17
+ <th>入力音声</th>
18
+ <th>合成音声</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>ナヒーダ (原神)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>鍾離 (原神)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>フリナ (原神)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>ランダム話者1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>ランダム話者2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## 中国語の文2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>話者</th>
64
+ <th>入力音声</th>
65
+ <th>合成音声</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>ナヒーダ (原神)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>ランダム話者</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## 中国語の文3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>話者</th>
95
+ <th>入力音声</th>
96
+ <th>合成音声</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>ランダム話者</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## 英語の文1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>話者</th>
121
+ <th>入力音声</th>
122
+ <th>合成音声</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>ランダム話者1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>ランダム話者2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## 英語の文2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>話者</th>
154
+ <th>入力音声</th>
155
+ <th>合成音声</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>ランダム話者</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## 日本語の文1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>話者</th>
180
+ <th>入力音声</th>
181
+ <th>合成音声</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>ランダム話者1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>ランダム話者2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## 日本語の文2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>話者</th>
212
+ <th>入力音声</th>
213
+ <th>合成音声</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>ランダム話者</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
docs/requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ mkdocs-material
2
+ mkdocs-static-i18n[material]
3
+ mkdocs[i18n]
docs/stylesheets/extra.css ADDED
@@ -0,0 +1,3 @@
1
+ .md-grid {
2
+ max-width: 1440px;
3
+ }
docs/zh/finetune.md ADDED
@@ -0,0 +1,136 @@
1
+ # 微调
2
+
3
+ 显然, 当你打开这个页面的时候, 你已经对预训练模型 zero-shot 的效果不算满意. 你想要微调一个模型, 使得它在你的数据集上表现更好.
4
+
5
+ 在目前版本,你只需要微调'LLAMA'部分即可.
6
+
7
+ ## LLAMA 微调
8
+ ### 1. 准备数据集
9
+
10
+ ```
11
+ .
12
+ ├── SPK1
13
+ │ ├── 21.15-26.44.lab
14
+ │ ├── 21.15-26.44.mp3
15
+ │ ├── 27.51-29.98.lab
16
+ │ ├── 27.51-29.98.mp3
17
+ │ ├── 30.1-32.71.lab
18
+ │ └── 30.1-32.71.mp3
19
+ └── SPK2
20
+ ├── 38.79-40.85.lab
21
+ └── 38.79-40.85.mp3
22
+ ```
23
+
24
+ 你需要将数据集转为以上格式, 并放到 `data` 下, 音频后缀可以为 `.mp3`, `.wav` 或 `.flac`, 标注文件后缀建议为 `.lab`.
25
+
26
+ !!! warning
27
+ 建议先对数据集进行响度匹配, 你可以使用 [fish-audio-preprocess](https://github.com/fishaudio/audio-preprocess) 来完成这一步骤.
28
+ ```bash
29
+ fap loudness-norm data-raw data --clean
30
+ ```
31
+
32
+ ### 2. 批量提取语义 token
33
+
34
+ 确保你已经下载了 vqgan 权重, 如果没有, 请运行以下命令:
35
+
36
+ ```bash
37
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
38
+ ```
39
+
40
+ 对于中国大陆用户, 可使用 mirror 下载.
41
+
42
+ ```bash
43
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
44
+ ```
45
+
46
+ 随后可运行以下命令来提取语义 token:
47
+
48
+ ```bash
49
+ python tools/vqgan/extract_vq.py data \
50
+ --num-workers 1 --batch-size 16 \
51
+ --config-name "firefly_gan_vq" \
52
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
53
+ ```
54
+
55
+ !!! note
56
+ 你可以调整 `--num-workers` 和 `--batch-size` 来提高提取速度, 但是请注意不要超过你的显存限制.
57
+
58
+ 该命令会在 `data` 目录下创建 `.npy` 文件, 如下所示:
59
+
60
+ ```
61
+ .
62
+ ├── SPK1
63
+ │ ├── 21.15-26.44.lab
64
+ │ ├── 21.15-26.44.mp3
65
+ │ ├── 21.15-26.44.npy
66
+ │ ├── 27.51-29.98.lab
67
+ │ ├── 27.51-29.98.mp3
68
+ │ ├── 27.51-29.98.npy
69
+ │ ├── 30.1-32.71.lab
70
+ │ ├── 30.1-32.71.mp3
71
+ │ └── 30.1-32.71.npy
72
+ └── SPK2
73
+ ├── 38.79-40.85.lab
74
+ ├── 38.79-40.85.mp3
75
+ └── 38.79-40.85.npy
76
+ ```
77
+
78
+ ### 3. 打包数据集为 protobuf
79
+
80
+ ```bash
81
+ python tools/llama/build_dataset.py \
82
+ --input "data" \
83
+ --output "data/protos" \
84
+ --text-extension .lab \
85
+ --num-workers 16
86
+ ```
87
+
88
+ 命令执行完毕后, 你应该能在 `data` 目录下看到 `protos` 文件.
89
+
90
+
91
+ ### 4. 最后, 使用 LoRA 进行微调
92
+
93
+ 同样的, 请确保你已经下载了 `LLAMA` 权重, 如果没有, 请运行以下命令:
94
+
95
+ ```bash
96
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
97
+ ```
98
+
99
+ 对于中国大陆用户, 可使用 mirror 下载.
100
+
101
+ ```bash
102
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
103
+ ```
104
+
105
+ 最后, 你可以运行以下命令来启动微调:
106
+
107
+ ```bash
108
+ python fish_speech/train.py --config-name text2semantic_finetune \
109
+ project=$project \
110
+ +lora@model.model.lora_config=r_8_alpha_16
111
+ ```
112
+
113
+ !!! note
114
+ 你可以通过修改 `fish_speech/configs/text2semantic_finetune.yaml` 来修改训练参数如 `batch_size`, `gradient_accumulation_steps` 等, 来适应你的显存.
115
+
116
+ !!! note
117
+ 对于 Windows 用户, 你可以使用 `trainer.strategy.process_group_backend=gloo` 来避免 `nccl` 的问题.
118
+
119
+ 训练结束后, 你可以参考 [推理](inference.md) 部分, 并携带 `--speaker SPK1` 参数来测试你的模型.
120
+
121
+ !!! info
122
+ 默认配置下, 基本只会学到说话人的发音方式, 而不包含音色, 你依然需要使用 prompt 来保证音色的稳定性.
123
+ 如果你想要学到音色, 请将训练步数调大, 但这有可能会导致过拟合.
124
+
125
+ 训练完成后, 你需要先将 LoRA 的权重转为普通权重, 然后再进行推理.
126
+
127
+ ```bash
128
+ python tools/llama/merge_lora.py \
129
+ --lora-config r_8_alpha_16 \
130
+ --base-weight checkpoints/fish-speech-1.2-sft \
131
+ --lora-weight results/$project/checkpoints/step_000000010.ckpt \
132
+ --output checkpoints/fish-speech-1.2-sft-yth-lora/
133
+ ```
134
+
135
+ !!! note
136
+ 你也可以尝试其他的 checkpoint, 我们建议你使用最早的满足你要求的 checkpoint, 他们通常在 OOD 上表现更好.
docs/zh/index.md ADDED
@@ -0,0 +1,118 @@
1
+ # 介绍
2
+
3
+ <div>
4
+ <a target="_blank" href="https://discord.gg/Es5qTB9BcN">
5
+ <img alt="Discord" src="https://img.shields.io/discord/1214047546020728892?color=%23738ADB&label=Discord&logo=discord&logoColor=white&style=flat-square"/>
6
+ </a>
7
+ <a target="_blank" href="http://qm.qq.com/cgi-bin/qm/qr?_wv=1027&k=jCKlUP7QgSm9kh95UlBoYv6s1I-Apl1M&authKey=xI5ttVAp3do68IpEYEalwXSYZFdfxZSkah%2BctF5FIMyN2NqAa003vFtLqJyAVRfF&noverify=0&group_code=593946093">
8
+ <img alt="QQ" src="https://img.shields.io/badge/QQ Group-%2312B7F5?logo=tencent-qq&logoColor=white&style=flat-square"/>
9
+ </a>
10
+ <a target="_blank" href="https://hub.docker.com/r/lengyue233/fish-speech">
11
+ <img alt="Docker" src="https://img.shields.io/docker/pulls/lengyue233/fish-speech?style=flat-square&logo=docker"/>
12
+ </a>
13
+ </div>
14
+
15
+ !!! warning
16
+ 我们不对代码库的任何非法使用承担任何责任. 请参阅您当地关于 DMCA (数字千年法案) 和其他相关法律法规. <br/>
17
+ 此代码库根据 `BSD-3-Clause` 许可证发布, 所有模型根据 CC-BY-NC-SA-4.0 许可证发布.
18
+
19
+ <p align="center">
20
+ <img src="../assets/figs/diagram.png" width="75%">
21
+ </p>
22
+
23
+ ## 要求
24
+
25
+ - GPU 内存: 4GB (用于推理), 8GB (用于微调)
26
+ - 系统: Linux, Windows
27
+
28
+ ## Windows 配置
29
+
30
+ Windows 专业用户可以考虑 WSL2 或 docker 来运行代码库。
31
+
32
+ Windows 非专业用户可考虑以下为免 Linux 环境的基础运行方法(附带模型编译功能,即 `torch.compile`):
33
+
34
+
35
+ 1. 解压项目压缩包。
36
+ 2. 点击 `install_env.bat` 安装环境。
37
+ - 可以通过编辑 `install_env.bat` 的 `USE_MIRROR` 项来决定是否使用镜像站下载。
38
+ - `USE_MIRROR=false` 使用原始站下载最新稳定版 `torch` 环境。`USE_MIRROR=true` 为从镜像站下载最新 `torch` 环境。默认为 `true`。
39
+ - 可以通过编辑 `install_env.bat` 的 `INSTALL_TYPE` 项来决定是否启用可编译环境下载。
40
+ - `INSTALL_TYPE=preview` 下载开发版编译环境。`INSTALL_TYPE=stable` 下载稳定版不带编译环境。
41
+ 3. 若第2步 `INSTALL_TYPE=preview` 则执行这一步(可跳过,此步为激活编译模型环境)
42
+ 1. 使用如下链接下载 LLVM 编译器。
43
+ - [LLVM-17.0.6(原站站点下载)](https://huggingface.co/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
44
+ - [LLVM-17.0.6(镜像站点下载)](https://hf-mirror.com/fishaudio/fish-speech-1/resolve/main/LLVM-17.0.6-win64.exe?download=true)
45
+ - 下载完 `LLVM-17.0.6-win64.exe` 后,双击进行安装,选择合适的安装位置,最重要的是勾选 `Add Path to Current User` 添加环境变量。
46
+ - 确认安装完成。
47
+ 2. 下载安装 Microsoft Visual C++ 可再发行程序包,解决潜在 .dll 丢失问题。
48
+ - [MSVC++ 14.40.33810.0 下载](https://aka.ms/vs/17/release/vc_redist.x64.exe)
49
+ 3. 下载安装 Visual Studio 社区版以获取 MSVC++ 编译工具, 解决 LLVM 的头文件依赖问题。
50
+ - [Visual Studio 下载](https://visualstudio.microsoft.com/zh-hans/downloads/)
51
+ - 安装好Visual Studio Installer之后,下载Visual Studio Community 2022
52
+ - 如下图点击`修改`按钮,找到`使用C++的桌面开发`项,勾选下载
53
+ 4. 下载安装 [CUDA Toolkit 12](https://developer.nvidia.com/cuda-12-1-0-download-archive?target_os=Windows&target_arch=x86_64)
54
+ 4. 双击 `start.bat` 打开训练推理WebUI管理界面. 如有需要,可照下列提示修改`API_FLAGS`.
55
+
56
+ !!! info "可选"
57
+
58
+ 想启动 推理 WebUI 界面?编辑项目根目录下的 `API_FLAGS.txt`, 前三行修改成如下格式:
59
+ ```
60
+ --infer
61
+ # --api
62
+ # --listen ...
63
+ ...
64
+ ```
65
+
66
+ !!! info "可选"
67
+
68
+ 想启动 API 服务器?编辑项目根目录下的 `API_FLAGS.txt`, 前三行修改成如下格式:
69
+ ```
70
+ # --infer
71
+ --api
72
+ --listen ...
73
+ ...
74
+ ```
75
+
76
+ !!! info "可选"
77
+
78
+ 双击 `run_cmd.bat` 进入本项目的 conda/python 命令行环境
79
+
80
+
81
+ ## Linux 配置
82
+
83
+ ```bash
84
+ # 创建一个 python 3.10 虚拟环境, 你也可以用 virtualenv
85
+ conda create -n fish-speech python=3.10
86
+ conda activate fish-speech
87
+
88
+ # 安装 pytorch
89
+ pip3 install torch torchvision torchaudio
90
+
91
+ # 安装 fish-speech
92
+ pip3 install -e .
93
+
94
+ # (Ubuntu / Debian 用户) 安装 sox
95
+ apt install libsox-dev
96
+ ```
97
+
98
+ ## 更新日志
99
+
100
+ - 2024/07/02: 更新了 Fish-Speech 到 1.2 版本,移除 VITS Decoder,同时极大幅度提升 zero-shot 能力.
101
+ - 2024/05/10: 更新了 Fish-Speech 到 1.1 版本,引入了 VITS Decoder 来降低口胡和提高音色相似度.
102
+ - 2024/04/22: 完成了 Fish-Speech 1.0 版本, 大幅修改了 VQGAN 和 LLAMA 模型.
103
+ - 2023/12/28: 添加了 `lora` 微调支持.
104
+ - 2023/12/27: 添加了 `gradient checkpointing`, `causal sampling` 和 `flash-attn` 支持.
105
+ - 2023/12/19: 更新了 Webui 和 HTTP API.
106
+ - 2023/12/18: 更新了微调文档和相关例子.
107
+ - 2023/12/17: 更新了 `text2semantic` 模型, 支持无音素模式.
108
+ - 2023/12/13: 测试版发布, 包含 VQGAN 模型和一个基于 LLAMA 的语言模型 (只支持音素).
109
+
110
+ ## 致谢
111
+
112
+ - [VITS2 (daniilrobnikov)](https://github.com/daniilrobnikov/vits2)
113
+ - [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2)
114
+ - [GPT VITS](https://github.com/innnky/gpt-vits)
115
+ - [MQTTS](https://github.com/b04901014/MQTTS)
116
+ - [GPT Fast](https://github.com/pytorch-labs/gpt-fast)
117
+ - [Transformers](https://github.com/huggingface/transformers)
118
+ - [GPT-SoVITS](https://github.com/RVC-Boss/GPT-SoVITS)
docs/zh/inference.md ADDED
@@ -0,0 +1,164 @@
1
+ # 推理
2
+
3
+ 推理支持命令行, http api, 以及 webui 三种方式.
4
+
5
+ !!! note
6
+ 总的来说, 推理分为几个部分:
7
+
8
+ 1. 给定一段 ~10 秒的语音, 将它用 VQGAN 编码.
9
+ 2. 将编码后的语义 token 和对应文本输入语言模型作为例子.
10
+ 3. 给定一段新文本, 让模型生成对应的语义 token.
11
+ 4. 将生成的语义 token 输入 VQGAN 解码, 生成对应的语音.
12
+
13
+ ## 命令行推理
14
+
15
+ 从我们的 huggingface 仓库下载所需的 `vqgan` 和 `llama` 模型。
16
+
17
+ ```bash
18
+ huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
19
+ ```
20
+
21
+ 对于中国大陆用户,可使用 mirror 下载。
22
+
23
+ ```bash
24
+ HF_ENDPOINT=https://hf-mirror.com huggingface-cli download fishaudio/fish-speech-1.2-sft --local-dir checkpoints/fish-speech-1.2-sft
25
+ ```
26
+
27
+ ### 1. 从语音生成 prompt:
28
+
29
+ !!! note
30
+ 如果你打算让模型随机选择音色, 你可以跳过这一步.
31
+
32
+ ```bash
33
+ python tools/vqgan/inference.py \
34
+ -i "paimon.wav" \
35
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
36
+ ```
37
+
38
+ 你应该能得到一个 `fake.npy` 文件.
39
+
40
+ ### 2. 从文本生成语义 token:
41
+
42
+ ```bash
43
+ python tools/llama/generate.py \
44
+ --text "要转换的文本" \
45
+ --prompt-text "你的参考文本" \
46
+ --prompt-tokens "fake.npy" \
47
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft" \
48
+ --num-samples 2 \
49
+ --compile
50
+ ```
51
+
52
+ 该命令会在工作目录下创建 `codes_N` 文件, 其中 N 是从 0 开始的整数.
53
+
54
+ !!! note
55
+ 您可能希望使用 `--compile` 来融合 cuda 内核以实现更快的推理 (~30 个 token/秒 -> ~500 个 token/秒).
56
+ 对应的, 如果你不打算使用加速, 你可以注释掉 `--compile` 参数.
57
+
58
+ !!! info
59
+ 对于不支持 bf16 的 GPU, 你可能需要使用 `--half` 参数.
60
+
61
+ ### 3. 从语义 token 生成人声:
62
+
63
+ #### VQGAN 解码
64
+
65
+ ```bash
66
+ python tools/vqgan/inference.py \
67
+ -i "codes_0.npy" \
68
+ --checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth"
69
+ ```
70
+
71
+ ## HTTP API 推理
72
+
73
+ 运行以下命令来启动 HTTP 服务:
74
+
75
+ ```bash
76
+ python -m tools.api \
77
+ --listen 0.0.0.0:8080 \
78
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
79
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
80
+ --decoder-config-name firefly_gan_vq
81
+ ```
82
+ 如果你想要加速推理,可以加上`--compile`参数。
83
+
84
+ 推荐中国大陆用户运行以下命令来启动 HTTP 服务:
85
+ ```bash
86
+ HF_ENDPOINT=https://hf-mirror.com python -m ...(同上)
87
+ ```
88
+
89
+ 随后, 你可以在 `http://127.0.0.1:8080/` 中查看并测试 API.
90
+
91
+ 下面是使用`tools/post_api.py`发送请求的示例。
92
+
93
+ ```bash
94
+ python -m tools.post_api \
95
+ --text "要输入的文本" \
96
+ --reference_audio "参考音频路径" \
97
+ --reference_text "参考音频的文本内容" \
98
+ --streaming True
99
+ ```
100
+
101
+ 上面的命令表示按照参考音频的信息,合成所需的音频并流式返回.
102
+
103
+ 如果需要通过`{说话人}`和`{情绪}`随机选择参考音频,那么就根据下列步骤配置:
104
+
105
+ ### 1. 在项目根目录创建`ref_data`文件夹.
106
+
107
+ ### 2. 在`ref_data`文件夹内创建类似如下结构的目录.
108
+
109
+ ```
110
+ .
111
+ ├── SPEAKER1
112
+ │ ├──EMOTION1
113
+ │ │ ├── 21.15-26.44.lab
114
+ │ │ ├── 21.15-26.44.wav
115
+ │ │ ├── 27.51-29.98.lab
116
+ │ │ ├── 27.51-29.98.wav
117
+ │ │ ├── 30.1-32.71.lab
118
+ │ │ └── 30.1-32.71.flac
119
+ │ └──EMOTION2
120
+ │ ├── 30.1-32.71.lab
121
+ │ └── 30.1-32.71.mp3
122
+ └── SPEAKER2
123
+ └─── EMOTION3
124
+ ├── 30.1-32.71.lab
125
+ └── 30.1-32.71.mp3
126
+ ```
127
+
128
+ 也就是`ref_data`里先放`{说话人}`文件夹, 每个说话人下再放`{情绪}`文件夹, 每个情绪文件夹下放任意个`音频-文本对`。
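+ 
+ 下面是一个简单的检查脚本示例 (假设目录结构与上文一致, 具体文件名仅作示意), 用来列出 `ref_data` 下每个说话人/情绪的音频-文本对:
+ 
+ ```python
+ from pathlib import Path
+ 
+ ref_root = Path("ref_data")
+ for lab_file in sorted(ref_root.glob("*/*/*.lab")):
+     speaker, emotion = lab_file.parts[-3], lab_file.parts[-2]
+     # 每个 .lab 标注文件应有同名的音频文件 (.wav / .mp3 / .flac)
+     has_audio = any(lab_file.with_suffix(s).exists() for s in (".wav", ".mp3", ".flac"))
+     print(speaker, emotion, lab_file.stem, "ok" if has_audio else "缺少音频")
+ ```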
129
+
130
+ ### 3. 在虚拟环境里输入
131
+
132
+ ```bash
133
+ python tools/gen_ref.py
134
+ ```
135
+
136
+ 生成参考目录.
137
+
138
+ ### 4. 调用 api.
139
+
140
+ ```bash
141
+ python -m tools.post_api \
142
+ --text "要输入的文本" \
143
+ --speaker "说话人1" \
144
+ --emotion "情绪1" \
145
+ --streaming True
146
+ ```
147
+
148
+ 以上示例仅供测试.
149
+
150
+ ## WebUI 推理
151
+
152
+ 你可以使用以下命令来启动 WebUI:
153
+
154
+ ```bash
155
+ python -m tools.webui \
156
+ --llama-checkpoint-path "checkpoints/fish-speech-1.2-sft" \
157
+ --decoder-checkpoint-path "checkpoints/fish-speech-1.2-sft/firefly-gan-vq-fsq-4x1024-42hz-generator.pth" \
158
+ --decoder-config-name firefly_gan_vq
159
+ ```
160
+
161
+ !!! note
162
+ 你可以使用 Gradio 环境变量, 如 `GRADIO_SHARE`, `GRADIO_SERVER_PORT`, `GRADIO_SERVER_NAME` 来配置 WebUI.
163
+
164
+ 祝大家玩得开心!
docs/zh/samples.md ADDED
@@ -0,0 +1,223 @@
1
+ # 例子
2
+
3
+ v1.2 的样本可以在 [Bilibili](https://www.bilibili.com/video/BV1wz421B71D/) 观看。
4
+
5
+ 以下样本来自 v1.1 版本的模型。
6
+
7
+ ## 中文句子 1
8
+ ```
9
+ 人间灯火倒映湖中,她的渴望让静水泛起涟漪。若代价只是孤独,那就让这份愿望肆意流淌。
10
+ 流入她所注视的世间,也流入她如湖水般澄澈的目光。
11
+ ```
12
+
13
+ <table>
14
+ <thead>
15
+ <tr>
16
+ <th>说话人</th>
17
+ <th>输入音频</th>
18
+ <th>合成音频</th>
19
+ </tr>
20
+ </thead>
21
+ <tbody>
22
+ <tr>
23
+ <td>纳西妲 (原神)</td>
24
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
25
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_output.wav" /></td>
26
+ </tr>
27
+ <tr>
28
+ <td>钟离 (原神)</td>
29
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_input.wav" /></td>
30
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/1_output.wav" /></td>
31
+ </tr>
32
+ <tr>
33
+ <td>芙宁娜 (原神)</td>
34
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_input.wav" /></td>
35
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/2_output.wav" /></td>
36
+ </tr>
37
+ <tr>
38
+ <td>随机说话人 1</td>
39
+ <td> - </td>
40
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/4_output.wav" /></td>
41
+ </tr>
42
+ <tr>
43
+ <td>随机说话人 2</td>
44
+ <td> - </td>
45
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/5_output.wav" /></td>
46
+ </tr>
47
+ </tbody>
48
+ </table>
49
+
50
+
51
+ ## 中文句子 2
52
+ ```
53
+ 你们这个是什么群啊,你们这是害人不浅啊你们这个群!谁是群主,出来!真的太过分了。你们搞这个群干什么?
54
+ 我儿子每一科的成绩都不过那个平均分呐,他现在初二,你叫我儿子怎么办啊?他现在还不到高中啊?
55
+ 你们害死我儿子了!快点出来你这个群主!再这样我去报警了啊!我跟你们说你们这一帮人啊,一天到晚啊,
56
+ 搞这些什么游戏啊,动漫啊,会害死你们的,你们没有前途我跟你说。你们这九百多个人,好好学习不好吗?
57
+ 一天到晚在上网。有什么意思啊?麻烦你重视一下你们的生活的目标啊?有一点学习目标行不行?一天到晚上网是不是人啊?
58
+ ```
59
+
60
+ <table>
61
+ <thead>
62
+ <tr>
63
+ <th>说话人</th>
64
+ <th>输入音频</th>
65
+ <th>合成音频</th>
66
+ </tr>
67
+ </thead>
68
+ <tbody>
69
+ <tr>
70
+ <td>纳西妲 (原神)</td>
71
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/0_input.wav" /></td>
72
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/6_output.wav" /></td>
73
+ </tr>
74
+ <tr>
75
+ <td>随机说话人</td>
76
+ <td> - </td>
77
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/7_output.wav" /></td>
78
+ </tr>
79
+ </tbody>
80
+ </table>
81
+
82
+
83
+ ## 中文句子 3
84
+ ```
85
+ 大家好,我是 Fish Audio 开发的开源文本转语音模型。经过十五万小时的数据训练,
86
+ 我已经能够熟练掌握中文、日语和英语,我的语言处理能力接近人类水平,声音表现形式丰富多变。
87
+ 作为一个仅有亿级参数的模型,我相信社区成员能够在个人设备上轻松运行和微调,让我成为您的私人语音助手。
88
+ ```
89
+
90
+
91
+ <table>
92
+ <thead>
93
+ <tr>
94
+ <th>说话人</th>
95
+ <th>输入音频</th>
96
+ <th>合成音频</th>
97
+ </tr>
98
+ </thead>
99
+ <tbody>
100
+ <tr>
101
+ <td>随机说话人</td>
102
+ <td> - </td>
103
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/zh/8_output.wav" /></td>
104
+ </tr>
105
+ </tbody>
106
+ </table>
107
+
108
+ ## 英文句子 1
109
+
110
+ ```
111
+ In the realm of advanced technology, the evolution of artificial intelligence stands as a
112
+ monumental achievement. This dynamic field, constantly pushing the boundaries of what
113
+ machines can do, has seen rapid growth and innovation. From deciphering complex data
114
+ patterns to driving cars autonomously, AI's applications are vast and diverse.
115
+ ```
116
+
117
+ <table>
118
+ <thead>
119
+ <tr>
120
+ <th>说话人</th>
121
+ <th>输入音频</th>
122
+ <th>合成音频</th>
123
+ </tr>
124
+ </thead>
125
+ <tbody>
126
+ <tr>
127
+ <td>随机说话人 1</td>
128
+ <td> - </td>
129
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/0_output.wav" /></td>
130
+ </tr>
131
+ <tr>
132
+ <td>随机说话人 2</td>
133
+ <td> - </td>
134
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/1_output.wav" /></td>
135
+ </tr>
136
+ </tbody>
137
+ </table>
138
+
139
+ ## 英文句子 2
140
+ ```
141
+ Hello everyone, I am an open-source text-to-speech model developed by
142
+ Fish Audio. After training with 150,000 hours of data, I have become proficient
143
+ in Chinese, Japanese, and English, and my language processing abilities
144
+ are close to human level. My voice is capable of a wide range of expressions.
145
+ As a model with only hundreds of millions of parameters, I believe community
146
+ members can easily run and fine-tune me on their personal devices, allowing
147
+ me to serve as your personal voice assistant.
148
+ ```
149
+
150
+ <table>
151
+ <thead>
152
+ <tr>
153
+ <th>说话人</th>
154
+ <th>输入音频</th>
155
+ <th>合成音频</th>
156
+ </tr>
157
+ </thead>
158
+ <tbody>
159
+ <tr>
160
+ <td>随机说话人</td>
161
+ <td> - </td>
162
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/en/2_output.wav" /></td>
163
+ </tr>
164
+ </tbody>
165
+ </table>
166
+
167
+ ## 日文句子 1
168
+
169
+ ```
170
+ 先進技術の領域において、人工知能の進化は画期的な成果として立っています。常に機械ができることの限界を
171
+ 押し広げているこのダイナミックな分野は、急速な成長と革新を見せています。複雑なデータパターンの解読か
172
+ ら自動運転車の操縦まで、AIの応用は広範囲に及びます。
173
+ ```
174
+
175
+
176
+ <table>
177
+ <thead>
178
+ <tr>
179
+ <th>说话人</th>
180
+ <th>输入音频</th>
181
+ <th>合成音频</th>
182
+ </tr>
183
+ </thead>
184
+ <tbody>
185
+ <tr>
186
+ <td>随机说话人 1</td>
187
+ <td> - </td>
188
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/0_output.wav" /></td>
189
+ </tr>
190
+ <tr>
191
+ <td>随机说话人 2</td>
192
+ <td> - </td>
193
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/1_output.wav" /></td>
194
+ </tr>
195
+ </tbody>
196
+ </table>
197
+
198
+ ## 日文句子 2
199
+ ```
200
+ 皆さん、こんにちは。私はフィッシュオーディオによって開発されたオープンソースのテ
201
+ キストから音声への変換モデルです。15万時間のデータトレーニングを経て、
202
+ 中国語、日本語、英語を熟知しており、言語処理能力は人間に近いレベルです。
203
+ 声の表現も多彩で豊かです。数億のパラメータを持つこのモデルは、コミュニティ
204
+ のメンバーが個人のデバイスで簡単に実行し、微調整することができると
205
+ 信じています。これにより、私を個人の音声アシスタントとして活用できます。
206
+ ```
207
+
208
+ <table>
209
+ <thead>
210
+ <tr>
211
+ <th>说话人</th>
212
+ <th>输入音频</th>
213
+ <th>合成音频</th>
214
+ </tr>
215
+ </thead>
216
+ <tbody>
217
+ <tr>
218
+ <td>随机说话人</td>
219
+ <td> - </td>
220
+ <td><audio controls preload="auto" src="https://demo-r2.speech.fish.audio/v1.1-sft-large/ja/2_output.wav" /></td>
221
+ </tr>
222
+ </tbody>
223
+ </table>
fish_speech/callbacks/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .grad_norm import GradNormMonitor
2
+
3
+ __all__ = ["GradNormMonitor"]
fish_speech/callbacks/grad_norm.py ADDED
@@ -0,0 +1,113 @@
1
+ from typing import Optional, Union
2
+
3
+ import lightning.pytorch as pl
4
+ import torch
5
+ from lightning import LightningModule, Trainer
6
+ from lightning.pytorch.callbacks import Callback
7
+ from torch import Tensor, nn
8
+ from torch.utils._foreach_utils import (
9
+ _group_tensors_by_device_and_dtype,
10
+ _has_foreach_support,
11
+ )
12
+
13
+
14
+ @torch.no_grad()
15
+ def grad_norm(
16
+ parameters: Union[Tensor, list[Tensor]],
17
+ norm_type: float = 2.0,
18
+ ) -> Optional[Tensor]:
19
+ """
20
+ Returns the norm of the gradients of the given parameters.
21
+
22
+ Args:
23
+ parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
24
+ single Tensor that will have gradients normalized
25
+ norm_type (float): type of the used p-norm.
26
+
27
+ Returns:
28
+ Total norm of the parameter gradients (viewed as a single vector).
29
+ """ # noqa: E501
30
+
31
+ if isinstance(parameters, Tensor):
32
+ parameters = [parameters]
33
+
34
+ grads = [p.grad for p in parameters if p.grad is not None]
35
+ if len(grads) == 0:
36
+ return None
37
+
38
+ first_device = grads[0].device
39
+ grouped_grads: dict[
40
+ tuple[torch.device, torch.dtype], list[list[Tensor]]
41
+ ] = _group_tensors_by_device_and_dtype(
42
+ [[g.detach() for g in grads]]
43
+ ) # type: ignore[assignment]
44
+
45
+ norms = []
46
+ for (device, _), ([grads], _) in grouped_grads.items():
47
+ if _has_foreach_support(grads, device=device):
48
+ norms.extend(torch._foreach_norm(grads, norm_type))
49
+ else:
50
+ norms.extend([torch.norm(g, norm_type) for g in grads])
51
+
52
+ return torch.norm(torch.stack([norm.to(first_device) for norm in norms]), norm_type)
53
+
54
+
55
+ class GradNormMonitor(Callback):
56
+ """
57
+ Callback that computes the gradient norm of the model parameters.
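+ 
+ Example (illustrative sketch; assumes a standard Lightning ``Trainer``):
+ 
+     trainer = Trainer(callbacks=[GradNormMonitor(norm_type=2.0, logging_interval="step")])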
58
+ """
59
+
60
+ def __init__(
61
+ self,
62
+ norm_type: float = 2.0,
63
+ logging_interval: str = "step",
64
+ sub_module: Optional[Union[str, list[str]]] = None,
65
+ ) -> None:
66
+ """
67
+ Args:
68
+ norm_type (float): type of the used p-norm.
69
+ logging_interval (str): "step" or "epoch".
70
+ """
71
+ super().__init__()
72
+
73
+ self.norm_type = norm_type
74
+ self.logging_interval = logging_interval
75
+ self.sub_module = sub_module
76
+
77
+ def on_after_backward(self, trainer: Trainer, model: LightningModule) -> None:
78
+ """
79
+ Computes the gradient norm of the model parameters and logs it to the logger.
80
+
81
+ Args:
82
+ trainer (Trainer): The trainer object
83
+ model (LightningModule): The current lightningModule
84
+ """
85
+
86
+ lightning_model = model
87
+
88
+ if self.sub_module is None:
89
+ return self.log_sub_module_grad_norm(lightning_model, model, "")
90
+
91
+ sub_modules = self.sub_module
92
+ if isinstance(sub_modules, str):
93
+ sub_modules = [sub_modules]
94
+
95
+ for sub_module in sub_modules:
96
+ self.log_sub_module_grad_norm(
97
+ lightning_model, getattr(model, sub_module), f"/{sub_module}"
98
+ )
99
+
100
+ def log_sub_module_grad_norm(
101
+ self, lightning_model: LightningModule, model: nn.Module, path: str
102
+ ) -> None:
103
+ grad_norm_val = grad_norm(model.parameters(), self.norm_type)
104
+ if grad_norm_val is None:
105
+ return
106
+
107
+ on_step = self.logging_interval == "step"
108
+ lightning_model.log(
109
+ f"train{path}/grad_norm",
110
+ grad_norm_val,
111
+ on_step=on_step,
112
+ on_epoch=not on_step,
113
+ )
fish_speech/configs/base.yaml ADDED
@@ -0,0 +1,87 @@
1
+ # Base configuration for training a model
2
+ paths:
3
+ run_dir: results/${project}
4
+ ckpt_dir: ${paths.run_dir}/checkpoints
5
+
6
+ hydra:
7
+ run:
8
+ dir: ${paths.run_dir}
9
+
10
+ # Lightning Trainer
11
+ trainer:
12
+ _target_: lightning.pytorch.trainer.Trainer
13
+
14
+ default_root_dir: ${paths.run_dir}
15
+ accelerator: gpu
16
+ num_nodes: 1
17
+ devices: auto
18
+ strategy:
19
+ _target_: lightning.pytorch.strategies.DDPStrategy
20
+ process_group_backend: nccl # This should be override when training on windows
21
+
22
+ precision: bf16-mixed
23
+
24
+ # disable validation by epoch end
25
+ check_val_every_n_epoch: null
26
+ val_check_interval: 5000
27
+ max_steps: 100_000
28
+
29
+ # Use torch.backends.cudnn.benchmark to speed up training
30
+ benchmark: true
31
+
32
+ # Callbacks
33
+ callbacks:
34
+ model_checkpoint:
35
+ _target_: lightning.pytorch.callbacks.ModelCheckpoint
36
+ dirpath: ${paths.ckpt_dir}
37
+ filename: "step_{step:09d}"
38
+ save_last: false # additionally always save an exact copy of the last checkpoint to a file last.ckpt
39
+ save_top_k: 5 # save 5 latest checkpoints
40
+ monitor: step # use step to monitor checkpoints
41
+ mode: max # save the latest checkpoint with the highest global_step
42
+ every_n_epochs: null # don't save checkpoints by epoch end
43
+ every_n_train_steps: 5000 # save checkpoints every 5000 steps
44
+ auto_insert_metric_name: false
45
+
46
+ model_summary:
47
+ _target_: lightning.pytorch.callbacks.ModelSummary
48
+ max_depth: 2 # the maximum depth of layer nesting that the summary will include
49
+
50
+ learning_rate_monitor:
51
+ _target_: lightning.pytorch.callbacks.LearningRateMonitor
52
+ logging_interval: step
53
+ log_momentum: false
54
+
55
+ grad_norm_monitor:
56
+ _target_: fish_speech.callbacks.GradNormMonitor
57
+ norm_type: 2
58
+ logging_interval: step
59
+
60
+ # Logger
61
+ logger:
62
+ tensorboard:
63
+ _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
64
+ save_dir: "${paths.run_dir}/tensorboard/"
65
+ name: null
66
+ log_graph: false
67
+ default_hp_metric: true
68
+ prefix: ""
69
+
70
+ # wandb:
71
+ # _target_: lightning.pytorch.loggers.wandb.WandbLogger
72
+ # # name: "" # name of the run (normally generated by wandb)
73
+ # save_dir: "${paths.run_dir}"
74
+ # offline: False
75
+ # id: null # pass correct id to resume experiment!
76
+ # anonymous: null # enable anonymous logging
77
+ # project: "fish-speech"
78
+ # log_model: False # upload lightning ckpts
79
+ # prefix: "" # a string to put at the beginning of metric keys
80
+ # # entity: "" # set to name of your wandb team
81
+ # group: ""
82
+ # tags: ["vq", "hq", "finetune"]
83
+ # job_type: ""
84
+
85
+ # Loop
86
+ train: true
87
+ test: false
fish_speech/configs/firefly_gan_vq.yaml ADDED
@@ -0,0 +1,34 @@
1
+ _target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
2
+ spec_transform:
3
+ _target_: fish_speech.utils.spectrogram.LogMelSpectrogram
4
+ sample_rate: 44100
5
+ n_mels: 160
6
+ n_fft: 2048
7
+ hop_length: 512
8
+ win_length: 2048
9
+ backbone:
10
+ _target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
11
+ input_channels: 160
12
+ depths: [3, 3, 9, 3]
13
+ dims: [128, 256, 384, 512]
14
+ drop_path_rate: 0.2
15
+ kernel_size: 7
16
+ head:
17
+ _target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
18
+ hop_length: 512
19
+ upsample_rates: [8, 8, 2, 2, 2] # aka. strides
20
+ upsample_kernel_sizes: [16, 16, 4, 4, 4]
21
+ resblock_kernel_sizes: [3, 7, 11]
22
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
23
+ num_mels: 512
24
+ upsample_initial_channel: 512
25
+ use_template: false
26
+ pre_conv_kernel_size: 13
27
+ post_conv_kernel_size: 13
28
+ quantizer:
29
+ _target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
30
+ input_dim: 512
31
+ n_groups: 4
32
+ n_codebooks: 1
33
+ levels: [8, 5, 5, 5]
34
+ downsample_factor: [2]
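+ 
+ # Note: the `_target_` entries above follow the Hydra/OmegaConf instantiation convention,
+ # e.g. (illustrative sketch in Python):
+ #   from hydra.utils import instantiate
+ #   from omegaconf import OmegaConf
+ #   model = instantiate(OmegaConf.load("fish_speech/configs/firefly_gan_vq.yaml"))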
fish_speech/configs/lora/r_8_alpha_16.yaml ADDED
@@ -0,0 +1,4 @@
1
+ _target_: fish_speech.models.text2semantic.lora.LoraConfig
2
+ r: 8
3
+ lora_alpha: 16
4
+ lora_dropout: 0.01
fish_speech/configs/text2semantic_finetune.yaml ADDED
@@ -0,0 +1,83 @@
1
+ defaults:
2
+ - base
3
+ - _self_
4
+
5
+ project: text2semantic_finetune_dual_ar
6
+ max_length: 4096
7
+ pretrained_ckpt_path: checkpoints/fish-speech-1.2-sft
8
+
9
+ # Lightning Trainer
10
+ trainer:
11
+ accumulate_grad_batches: 1
12
+ gradient_clip_val: 1.0
13
+ gradient_clip_algorithm: "norm"
14
+ max_steps: 1000
15
+ precision: bf16-true
16
+ limit_val_batches: 10
17
+ val_check_interval: 100
18
+
19
+ # Dataset Configuration
20
+ tokenizer:
21
+ _target_: transformers.AutoTokenizer.from_pretrained
22
+ pretrained_model_name_or_path: ${pretrained_ckpt_path}
23
+
24
+ # Dataset Configuration
25
+ train_dataset:
26
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
27
+ proto_files:
28
+ - data/protos
29
+ tokenizer: ${tokenizer}
30
+ causal: true
31
+ max_length: ${max_length}
32
+ use_speaker: false
33
+ interactive_prob: 0.7
34
+
35
+ val_dataset:
36
+ _target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionDataset
37
+ proto_files:
38
+ - data/protos
39
+ tokenizer: ${tokenizer}
40
+ causal: true
41
+ max_length: ${max_length}
42
+ use_speaker: false
43
+ interactive_prob: 0.7
44
+
45
+ data:
46
+ _target_: fish_speech.datasets.semantic.SemanticDataModule
47
+ train_dataset: ${train_dataset}
48
+ val_dataset: ${val_dataset}
49
+ num_workers: 4
50
+ batch_size: 8
51
+ tokenizer: ${tokenizer}
52
+ max_length: ${max_length}
53
+
54
+ # Model Configuration
55
+ model:
56
+ _target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
57
+ model:
58
+ _target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
59
+ path: ${pretrained_ckpt_path}
60
+ load_weights: true
61
+ max_length: ${max_length}
62
+ lora_config: null
63
+
64
+ optimizer:
65
+ _target_: torch.optim.AdamW
66
+ _partial_: true
67
+ lr: 1e-4
68
+ weight_decay: 0
69
+ betas: [0.9, 0.95]
70
+ eps: 1e-5
71
+
72
+ lr_scheduler:
73
+ _target_: torch.optim.lr_scheduler.LambdaLR
74
+ _partial_: true
75
+ lr_lambda:
76
+ _target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
77
+ _partial_: true
78
+ num_warmup_steps: 10
79
+
80
+ # Callbacks
81
+ callbacks:
82
+ model_checkpoint:
83
+ every_n_train_steps: ${trainer.val_check_interval}
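+ 
+ # Any value above can also be overridden from the Hydra command line when launching training,
+ # e.g. (illustrative values): data.batch_size=4 trainer.accumulate_grad_batches=2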
fish_speech/conversation.py ADDED
@@ -0,0 +1,2 @@
1
+ SEMANTIC_TOKEN = "<|semantic|>"
2
+ CODEBOOK_PAD_TOKEN_ID = 0
fish_speech/datasets/concat_repeat.py ADDED
@@ -0,0 +1,53 @@
1
+ import bisect
2
+ import random
3
+ from typing import Iterable
4
+
5
+ from torch.utils.data import Dataset, IterableDataset
6
+
7
+
8
+ class ConcatRepeatDataset(Dataset):
9
+ datasets: list[Dataset]
10
+ cumulative_sizes: list[int]
11
+ repeats: list[int]
12
+
13
+ @staticmethod
14
+ def cumsum(sequence, repeats):
15
+ r, s = [], 0
16
+ for dataset, repeat in zip(sequence, repeats):
17
+ l = len(dataset) * repeat
18
+ r.append(l + s)
19
+ s += l
20
+ return r
21
+
22
+ def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
23
+ super().__init__()
24
+
25
+ self.datasets = list(datasets)
26
+ self.repeats = repeats
27
+
28
+ assert len(self.datasets) > 0, "datasets should not be an empty iterable"
29
+ assert len(self.datasets) == len(
30
+ repeats
31
+ ), "datasets and repeats should have the same length"
32
+
33
+ for d in self.datasets:
34
+ assert not isinstance(
35
+ d, IterableDataset
36
+ ), "ConcatRepeatDataset does not support IterableDataset"
37
+
38
+ self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
39
+
40
+ def __len__(self):
41
+ return self.cumulative_sizes[-1]
42
+
43
+ def __getitem__(self, idx):
44
+ dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
45
+
46
+ if dataset_idx == 0:
47
+ sample_idx = idx
48
+ else:
49
+ sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
50
+
51
+ dataset = self.datasets[dataset_idx]
52
+
53
+ return dataset[sample_idx % len(dataset)]
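As a quick illustration of the index arithmetic above (not part of the uploaded files; `ConcatRepeatDataset` is assumed to be in scope from the module shown), two toy map-style datasets with the second repeated twice yield a virtual length of 3 + 2 × 2 = 7, and indices past each boundary wrap around the underlying dataset:

```python
from torch.utils.data import Dataset


class ToyDataset(Dataset):
    """Tiny in-memory dataset used only for this sketch."""

    def __init__(self, items):
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]


ds = ConcatRepeatDataset(
    datasets=[ToyDataset(["a", "b", "c"]), ToyDataset(["x", "y"])],
    repeats=[1, 2],
)
print(len(ds))                    # 7
print([ds[i] for i in range(7)])  # ['a', 'b', 'c', 'x', 'y', 'x', 'y']
```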
fish_speech/datasets/protos/text-data.proto ADDED
@@ -0,0 +1,24 @@
+ syntax = "proto3";
+
+ package text_data;
+
+ message Semantics {
+   repeated uint32 values = 1;
+ }
+
+ message Sentence {
+   repeated string texts = 1;
+   repeated Semantics semantics = 3;
+ }
+
+ message TextData {
+   string source = 1;
+   string name = 2;
+   repeated Sentence sentences = 4;
+ }
+
+ message SampledData {
+   string source = 1;
+   string name = 2;
+   repeated Sentence samples = 3;
+ }
fish_speech/datasets/protos/text_data_pb2.py ADDED
@@ -0,0 +1,33 @@
+ # -*- coding: utf-8 -*-
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
+ # source: text-data.proto
+ # Protobuf Python Version: 4.25.1
+ """Generated protocol buffer code."""
+ from google.protobuf import descriptor as _descriptor
+ from google.protobuf import descriptor_pool as _descriptor_pool
+ from google.protobuf import symbol_database as _symbol_database
+ from google.protobuf.internal import builder as _builder
+
+ # @@protoc_insertion_point(imports)
+
+ _sym_db = _symbol_database.Default()
+
+
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
+     b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
+ )
+
+ _globals = globals()
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+ _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
+ if _descriptor._USE_C_DESCRIPTORS == False:
+     DESCRIPTOR._options = None
+     _globals["_SEMANTICS"]._serialized_start = 30
+     _globals["_SEMANTICS"]._serialized_end = 57
+     _globals["_SENTENCE"]._serialized_start = 59
+     _globals["_SENTENCE"]._serialized_end = 125
+     _globals["_TEXTDATA"]._serialized_start = 127
+     _globals["_TEXTDATA"]._serialized_end = 207
+     _globals["_SAMPLEDDATA"]._serialized_start = 209
+     _globals["_SAMPLEDDATA"]._serialized_end = 290
+ # @@protoc_insertion_point(module_scope)
fish_speech/datasets/protos/text_data_stream.py ADDED
@@ -0,0 +1,36 @@
+ import struct
+
+ from .text_data_pb2 import TextData
+
+
+ def read_pb_stream(f):
+     while True:
+         buf = f.read(4)
+         if len(buf) == 0:
+             break
+         size = struct.unpack("I", buf)[0]
+         buf = f.read(size)
+         text_data = TextData()
+         text_data.ParseFromString(buf)
+         yield text_data
+
+
+ def write_pb_stream(f, text_data):
+     buf = text_data.SerializeToString()
+     f.write(struct.pack("I", len(buf)))
+     f.write(buf)
+
+
+ def pack_pb_stream(text_data):
+     buf = text_data.SerializeToString()
+     return struct.pack("I", len(buf)) + buf
+
+
+ def split_pb_stream(f):
+     while True:
+         head = f.read(4)
+         if len(head) == 0:
+             break
+         size = struct.unpack("I", head)[0]
+         buf = f.read(size)
+         yield head + buf
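A small round-trip sketch (illustrative only, assuming the `protobuf` runtime and the generated `text_data_pb2` module above) showing how one `TextData` message is written as a 4-byte length prefix plus serialized payload and read back:

```python
from io import BytesIO

from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData
from fish_speech.datasets.protos.text_data_stream import (
    read_pb_stream,
    write_pb_stream,
)

sample = TextData(
    source="demo",
    name="speaker_0",
    sentences=[
        Sentence(texts=["Hello world."], semantics=[Semantics(values=[1, 2, 3])])
    ],
)

buffer = BytesIO()
write_pb_stream(buffer, sample)  # length prefix (struct "I") + serialized message
buffer.seek(0)

for message in read_pb_stream(buffer):
    print(message.name, list(message.sentences[0].semantics[0].values))
    # speaker_0 [1, 2, 3]
```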
fish_speech/datasets/semantic.py ADDED
@@ -0,0 +1,496 @@
1
+ import random
2
+ from dataclasses import dataclass
3
+ from itertools import chain
4
+ from pathlib import Path
5
+ from random import Random
6
+ from typing import Optional, Union
7
+
8
+ import numpy as np
9
+ import pyarrow.parquet as pq
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from datasets.download.streaming_download_manager import xopen
13
+ from huggingface_hub import HfApi
14
+ from lightning import LightningDataModule
15
+ from torch.distributed import get_rank, get_world_size, is_initialized
16
+ from torch.utils.data import DataLoader, IterableDataset, get_worker_info
17
+ from transformers import AutoTokenizer
18
+
19
+ from fish_speech.conversation import CODEBOOK_PAD_TOKEN_ID
20
+ from fish_speech.datasets.protos.text_data_pb2 import SampledData
21
+ from fish_speech.datasets.protos.text_data_stream import read_pb_stream
22
+ from fish_speech.text.clean import clean_text
23
+ from fish_speech.utils import RankedLogger
24
+ from fish_speech.utils.braceexpand import braceexpand
25
+
26
+ log = RankedLogger(__name__, rank_zero_only=True)
27
+
28
+
29
+ def split_by_rank_worker(files):
30
+ # We need to know the total number of devices
31
+ # to split the data properly
32
+
33
+ total_devices = 1
34
+ if is_initialized():
35
+ total_devices = get_world_size()
36
+
37
+ worker_info = get_worker_info()
38
+ if worker_info is not None:
39
+ total_devices *= worker_info.num_workers
40
+
41
+ if len(files) < total_devices:
42
+ # Repeat the files N times to match the number of devices
43
+ files = files * (total_devices // len(files) + 1)
44
+
45
+ # DDP
46
+ if is_initialized():
47
+ files = files[get_rank() :: get_world_size()]
48
+
49
+ # Split by worker
50
+ if worker_info is not None:
51
+ files = files[worker_info.id :: worker_info.num_workers]
52
+
53
+ return files
54
+
55
+
56
+ class AutoTextSemanticInstructionDataset(IterableDataset):
57
+ """
58
+ Auto Augment Dataset by Speaker
59
+
60
+ 1. Random concatenate multiple sentences from the same speaker to form a longer sentence
61
+ 2. Automatically normalize the text
62
+
63
+ For interactive mode, we use the following format (multiple sequences):
64
+ <s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
65
+
66
+ For non-interactive mode, we use the following format (one long sequence):
67
+ <s> [INST] text [/INST] ... </s>
68
+ """
69
+
70
+ def __init__(
71
+ self,
72
+ proto_files: list[str],
73
+ seed: int = 42,
74
+ interactive_prob: float = 0.5,
75
+ max_length: int = 1024,
76
+ tokenizer: AutoTokenizer = None,
77
+ use_speaker: bool | float = True,
78
+ causal: bool = True,
79
+ num_codebooks: Optional[int] = None,
80
+ skip_text_prob: float = 0.0,
81
+ ):
82
+ """
83
+ Args:
84
+ proto_files: proto buf files if using local data
85
+ seed: random seed
86
+ interactive_prob: probability to use interactive mode
87
+ max_length: max length of the text
88
+ tokenizer: tokenizer
89
+ use_speaker: include speaker information in the prompt
90
+ causal: use causal sampling when using local data, disable will lead to random sampling
91
+ num_codebooks: number of codebooks, if None, it will be automatically detected
92
+ skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
93
+ """
94
+
95
+ super().__init__()
96
+
97
+ assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
98
+
99
+ self.seed = seed
100
+ self.max_length = max_length
101
+ self.tokenizer = tokenizer
102
+ self.interactive_prob = interactive_prob
103
+ self.use_speaker = use_speaker
104
+ self.proto_files = proto_files
105
+ self.causal = causal
106
+ self.num_codebooks = num_codebooks
107
+ self.skip_text_prob = skip_text_prob
108
+
109
+ self.semantic_token_id = self.tokenizer.convert_tokens_to_ids("<|semantic|>")
110
+ self.groups = None
111
+
112
+ def init_mock_data_server(self):
113
+ if self.groups is not None:
114
+ return
115
+
116
+ # Expand the proto files
117
+ expanded_proto_files = []
118
+ for filename in self.proto_files:
119
+ for i in braceexpand(filename):
120
+ i = Path(i)
121
+ if i.is_file():
122
+ expanded_proto_files.append(i)
123
+ elif i.is_dir():
124
+ expanded_proto_files.extend(i.rglob("*.proto"))
125
+ expanded_proto_files.extend(i.rglob("*.protos"))
126
+ else:
127
+ raise ValueError(f"{i} is not a file or directory")
128
+
129
+ expanded_proto_files = sorted(expanded_proto_files)
130
+ Random(self.seed).shuffle(expanded_proto_files)
131
+
132
+ self.groups = []
133
+ shard_proto_files = split_by_rank_worker(expanded_proto_files)
134
+ log.info(
135
+ f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
136
+ )
137
+
138
+ count = 0
139
+ for filename in shard_proto_files:
140
+ with open(filename, "rb") as f:
141
+ for text_data in read_pb_stream(f):
142
+ self.groups.append(text_data)
143
+ count += 1
144
+
145
+ log.info(f"Read total {count} groups of data")
146
+
147
+ # Shuffle the lines
148
+ Random(self.seed).shuffle(self.groups)
149
+ self.group_weights = [len(i.sentences) for i in self.groups]
150
+
151
+ def __iter__(self):
152
+ while True:
153
+ yield self.augment()
154
+
155
+ def tokenize_sentence(self, sentence: str):
156
+ sentence = clean_text(sentence)
157
+ tokens = self.tokenizer.encode(
158
+ f"{sentence}",
159
+ max_length=10**6,
160
+ add_special_tokens=False,
161
+ truncation=False,
162
+ )
163
+ return sentence, len(tokens)
164
+
165
+ def sample_data(self):
166
+ if self.groups is None:
167
+ self.init_mock_data_server()
168
+
169
+ # Shuffle unique lines, estimate that each sample is at least 20 tokens
170
+ num_samples = self.max_length // 20
171
+
172
+ # choice group based on their number of samples
173
+ group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
174
+
175
+ if self.causal:
176
+ # Sample in order
177
+ if num_samples >= len(group.sentences):
178
+ samples = group.sentences
179
+ else:
180
+ begin = random.randint(0, len(group.sentences) - num_samples)
181
+ samples = group.sentences[begin : begin + num_samples]
182
+ else:
183
+ samples = random.choices(
184
+ group.sentences, k=min(num_samples, len(group.sentences))
185
+ )
186
+
187
+ return SampledData(
188
+ source=group.source,
189
+ name=group.name,
190
+ samples=samples,
191
+ )
192
+
193
+ def augment(self):
194
+ final_text, final_semantic = [], []
195
+ response = self.sample_data()
196
+ if len(response.samples) == 0:
197
+ # Invalid group
198
+ return None
199
+
200
+ samples = list(response.samples)
201
+ idx = 0
202
+ use_interactive = random.random() < self.interactive_prob
203
+
204
+ if use_interactive is False:
205
+ # Random sample based on speaker using a truncated normal distribution
206
+ a = torch.tensor([0], dtype=torch.float32)
207
+ torch.nn.init.trunc_normal_(
208
+ a,
209
+ mean=self.max_length // 2,
210
+ std=self.max_length // 4,
211
+ a=10,
212
+ b=self.max_length,
213
+ )
214
+ remaining_tokens = a.long().item() - 4
215
+ else:
216
+ remaining_tokens = self.max_length
217
+
218
+ # Use speaker
219
+ if isinstance(self.use_speaker, float):
220
+ use_speaker = random.random() < self.use_speaker
221
+ else:
222
+ use_speaker = self.use_speaker
223
+
224
+ all_tokens, all_labels = [], []
225
+ while remaining_tokens > 0 and len(samples) > 0:
226
+ sentence = samples.pop(0)
227
+
228
+ text = random.choice(sentence.texts)
229
+ text, length = self.tokenize_sentence(text)
230
+ remaining_tokens -= length + len(sentence.semantics[0].values)
231
+
232
+ if use_interactive is False:
233
+ final_text.append(text)
234
+ final_semantic.append(sentence.semantics)
235
+ else:
236
+ # For interactive mode, we only apply speaker for the first sentence
237
+ # [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST]
238
+ tokens, labels = self.pack_sentences(
239
+ sentences=[text],
240
+ semantics=[sentence.semantics],
241
+ speaker=response.name if use_speaker else None,
242
+ skip_text=random.random() < self.skip_text_prob,
243
+ )
244
+
245
+ all_tokens.append(tokens)
246
+ all_labels.append(labels)
247
+
248
+ idx += 1
249
+
250
+ if use_interactive is False:
251
+ tokens, labels = self.pack_sentences(
252
+ final_text,
253
+ semantics=final_semantic,
254
+ speaker=response.name if use_speaker else None,
255
+ )
256
+ all_tokens.append(tokens)
257
+ all_labels.append(labels)
258
+
259
+ tokens = torch.cat(all_tokens, dim=1)
260
+ labels = torch.cat(all_labels, dim=1)
261
+
262
+ # Verify that the length is correct
263
+ assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
264
+
265
+ data = {"tokens": tokens, "labels": labels}
266
+
267
+ return data
268
+
269
+ def pack_sentences(
270
+ self,
271
+ sentences: list[str],
272
+ semantics: list,
273
+ speaker: Optional[str] = None,
274
+ skip_text: bool = False,
275
+ ):
276
+ if speaker is None:
277
+ speaker = "assistant"
278
+
279
+ cated_sentences = " ".join(sentences)
280
+ if skip_text:
281
+ cated_sentences = "<|skip_text|>"
282
+
283
+ final_text = "<|im_start|>user\n" + cated_sentences + "<|im_end|>"
284
+ final_text = final_text + f"<|im_start|>{speaker}\n"
285
+
286
+ encoded = self.tokenizer.encode(
287
+ final_text,
288
+ add_special_tokens=False,
289
+ truncation=False,
290
+ max_length=10**6,
291
+ )
292
+ semantic_length = sum([len(i[0].values) for i in semantics])
293
+ prompt_length = len(encoded)
294
+ num_codebooks = (
295
+ len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
296
+ )
297
+
298
+ # Pack the tokens and semantics (add <s> and </s> to semantic tokens)
299
+ tokens = (
300
+ encoded
301
+ + [self.semantic_token_id] * semantic_length
302
+ + self.tokenizer.convert_tokens_to_ids(["<|im_end|>"])
303
+ )
304
+
305
+ # Codebook bos/padding: 0, eos: 1
306
+ codes = [[CODEBOOK_PAD_TOKEN_ID] * prompt_length for _ in range(num_codebooks)]
307
+ for segment in semantics:
308
+ for book_idx, book in zip(range(num_codebooks), segment):
309
+ for j in book.values:
310
+ codes[book_idx].append(int(j) + 1)
311
+
312
+ for book in codes:
313
+ book.extend([CODEBOOK_PAD_TOKEN_ID] * 1)
314
+
315
+ tokens = [tokens] + codes
316
+
317
+ tokens = torch.tensor(tokens, dtype=torch.long)
318
+ labels = tokens.clone()
319
+
320
+ if skip_text:
321
+ # If text is not provided, the sentence is used for condition only, all labels are -100
322
+ torch.fill_(labels, -100)
323
+ return tokens, labels
324
+
325
+ # Mask out the <s> tokens for semantic, predict semantic tokens only
326
+ # Since we don't mask out the input tokens, the language modeling still works
327
+ labels[1:, :prompt_length] = -100
328
+
329
+ tokens = tokens[:, :-1]
330
+ labels = labels[:, 1:]
331
+
332
+ # Verify the padding is correct, and the last token is eos
333
+ assert (tokens[1:, :prompt_length] == CODEBOOK_PAD_TOKEN_ID).all()
334
+ assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
335
+
336
+ return tokens, labels
337
+
338
+
339
+ @dataclass
340
+ class TextDataCollator:
341
+ tokenizer: AutoTokenizer
342
+ max_length: int = 1024
343
+
344
+ def __call__(self, examples):
345
+ if "negative_tokens" in examples:
346
+ positive_examples = []
347
+ negative_examples = []
348
+
349
+ for i in examples:
350
+ positive_examples.append(
351
+ {
352
+ "tokens": i["tokens"],
353
+ "labels": i["labels"],
354
+ }
355
+ )
356
+ negative_examples.append(
357
+ {
358
+ "tokens": i["negative_tokens"],
359
+ "labels": i["negative_labels"],
360
+ }
361
+ )
362
+
363
+ examples = positive_examples + negative_examples
364
+
365
+ return self.batchify(examples)
366
+
367
+ def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
368
+ tokens, attention_masks, labels = [], [], []
369
+
370
+ # Calculate the max length
371
+ max_tokens_length = 0
372
+ for example in examples:
373
+ max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
374
+ max_tokens_length = min(max_tokens_length, self.max_length)
375
+
376
+ for example in examples:
377
+ _tokens = example[tokens_key][:, :max_tokens_length]
378
+ _labels = example[labels_key][:, :max_tokens_length]
379
+ _attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
380
+ tokens_length = _tokens.size(1)
381
+ _attention_mask[:tokens_length] = False
382
+
383
+ assert tokens_length == _labels.size(
384
+ 1
385
+ ), f"{tokens_length} != {_labels.size(1)}"
386
+
387
+ if tokens_length < max_tokens_length:
388
+ _tokens = F.pad(
389
+ _tokens,
390
+ (0, max_tokens_length - tokens_length),
391
+ value=self.tokenizer.eos_token_id,
392
+ )
393
+ _tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
394
+ _labels = F.pad(
395
+ _labels, (0, max_tokens_length - _labels.size(1)), value=-100
396
+ )
397
+
398
+ tokens.append(_tokens)
399
+ attention_masks.append(_attention_mask)
400
+ labels.append(_labels)
401
+
402
+ tokens = torch.stack(tokens, dim=0)
403
+ attention_masks = torch.stack(attention_masks, dim=0)
404
+ labels = torch.stack(labels, dim=0)
405
+
406
+ return {
407
+ "inputs": tokens,
408
+ "attention_masks": attention_masks,
409
+ "labels": labels,
410
+ }
411
+
412
+
413
+ class InterleaveDataset(IterableDataset):
414
+ def __init__(
415
+ self,
416
+ datasets: list[IterableDataset],
417
+ probabilities: list[float],
418
+ seed: int = 42,
419
+ ):
420
+ super().__init__()
421
+
422
+ self.datasets = datasets
423
+ self.probabilities = probabilities
424
+ self.seed = seed
425
+
426
+ def __iter__(self):
427
+ rng = np.random.default_rng(self.seed)
428
+ dataset_iterators = [iter(dataset) for dataset in self.datasets]
429
+
430
+ while True:
431
+ # Random choice one
432
+ dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
433
+ dataset_iterator = dataset_iterators[dataset_idx]
434
+
435
+ try:
436
+ yield next(dataset_iterator)
437
+ except StopIteration:
438
+ # Exhausted, create a new iterator
439
+ dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
440
+ yield next(dataset_iterators[dataset_idx])
441
+
442
+
443
+ class SemanticDataModule(LightningDataModule):
444
+ def __init__(
445
+ self,
446
+ train_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
447
+ val_dataset: Union[AutoTextSemanticInstructionDataset, InterleaveDataset],
448
+ batch_size: int = 32,
449
+ tokenizer: AutoTokenizer = None,
450
+ max_length: int = 1024,
451
+ num_workers: int = 4,
452
+ ):
453
+ super().__init__()
454
+
455
+ self.train_dataset = train_dataset
456
+ self.val_dataset = val_dataset
457
+ self.batch_size = batch_size
458
+ self.tokenizer = tokenizer
459
+ self.max_length = max_length
460
+ self.num_workers = num_workers
461
+
462
+ def train_dataloader(self):
463
+ return DataLoader(
464
+ self.train_dataset,
465
+ batch_size=self.batch_size,
466
+ collate_fn=TextDataCollator(self.tokenizer, self.max_length),
467
+ num_workers=self.num_workers,
468
+ persistent_workers=True,
469
+ )
470
+
471
+ def val_dataloader(self):
472
+ return DataLoader(
473
+ self.val_dataset,
474
+ batch_size=self.batch_size,
475
+ collate_fn=TextDataCollator(self.tokenizer, self.max_length),
476
+ num_workers=self.num_workers,
477
+ persistent_workers=True,
478
+ )
479
+
480
+
481
+ if __name__ == "__main__":
482
+ from tqdm import tqdm
483
+
484
+ ds = AutoTextSemanticInstructionDataset(
485
+ ["data/protos"],
486
+ tokenizer=AutoTokenizer.from_pretrained("fishaudio/fish-speech-1"),
487
+ use_speaker=False,
488
+ interactive_prob=1.0,
489
+ skip_text_prob=0.5,
490
+ )
491
+
492
+ for i in ds:
493
+ print(ds.tokenizer.decode(i["tokens"][0], skip_special_tokens=False))
494
+ # i["labels"][0][i["labels"][0] == -100] = 0
495
+ # print(ds.tokenizer.decode(i["labels"][0], skip_special_tokens=False))
496
+ break
fish_speech/datasets/vqgan.py ADDED
@@ -0,0 +1,147 @@
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Optional
+
+ import librosa
+ import numpy as np
+ import torch
+ from lightning import LightningDataModule
+ from torch.utils.data import DataLoader, Dataset
+
+ from fish_speech.utils import RankedLogger
+
+ logger = RankedLogger(__name__, rank_zero_only=False)
+
+
+ class VQGANDataset(Dataset):
+     def __init__(
+         self,
+         filelist: str,
+         sample_rate: int = 32000,
+         hop_length: int = 640,
+         slice_frames: Optional[int] = None,
+     ):
+         super().__init__()
+
+         filelist = Path(filelist)
+         root = filelist.parent
+
+         self.files = [
+             root / line.strip()
+             for line in filelist.read_text(encoding="utf-8").splitlines()
+             if line.strip()
+         ]
+         self.sample_rate = sample_rate
+         self.hop_length = hop_length
+         self.slice_frames = slice_frames
+
+     def __len__(self):
+         return len(self.files)
+
+     def get_item(self, idx):
+         file = self.files[idx]
+
+         audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)
+
+         # Slice audio and features
+         if (
+             self.slice_frames is not None
+             and audio.shape[0] > self.slice_frames * self.hop_length
+         ):
+             start = np.random.randint(
+                 0, audio.shape[0] - self.slice_frames * self.hop_length
+             )
+             audio = audio[start : start + self.slice_frames * self.hop_length]
+
+         if len(audio) == 0:
+             return None
+
+         max_value = np.abs(audio).max()
+         if max_value > 1.0:
+             audio = audio / max_value
+
+         return {
+             "audio": torch.from_numpy(audio),
+         }
+
+     def __getitem__(self, idx):
+         try:
+             return self.get_item(idx)
+         except Exception as e:
+             import traceback
+
+             traceback.print_exc()
+             logger.error(f"Error loading {self.files[idx]}: {e}")
+             return None
+
+
+ @dataclass
+ class VQGANCollator:
+     def __call__(self, batch):
+         batch = [x for x in batch if x is not None]
+
+         audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
+         audio_maxlen = audio_lengths.max()
+
+         # Rounds up to nearest multiple of 2 (audio_lengths)
+         audios = []
+         for x in batch:
+             audios.append(
+                 torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
+             )
+
+         return {
+             "audios": torch.stack(audios),
+             "audio_lengths": audio_lengths,
+         }
+
+
+ class VQGANDataModule(LightningDataModule):
+     def __init__(
+         self,
+         train_dataset: VQGANDataset,
+         val_dataset: VQGANDataset,
+         batch_size: int = 32,
+         num_workers: int = 4,
+         val_batch_size: Optional[int] = None,
+     ):
+         super().__init__()
+
+         self.train_dataset = train_dataset
+         self.val_dataset = val_dataset
+         self.batch_size = batch_size
+         self.val_batch_size = val_batch_size or batch_size
+         self.num_workers = num_workers
+
+     def train_dataloader(self):
+         return DataLoader(
+             self.train_dataset,
+             batch_size=self.batch_size,
+             collate_fn=VQGANCollator(),
+             num_workers=self.num_workers,
+             shuffle=True,
+             persistent_workers=True,
+         )
+
+     def val_dataloader(self):
+         return DataLoader(
+             self.val_dataset,
+             batch_size=self.val_batch_size,
+             collate_fn=VQGANCollator(),
+             num_workers=self.num_workers,
+             persistent_workers=True,
+         )
+
+
+ if __name__ == "__main__":
+     dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
+     dataloader = DataLoader(
+         dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
+     )
+
+     for batch in dataloader:
+         print(batch["audios"].shape)
+         print(batch["features"].shape)
+         print(batch["audio_lengths"])
+         print(batch["feature_lengths"])
+         break
fish_speech/i18n/README.md ADDED
@@ -0,0 +1,27 @@
+ ## i18n Folder Attribution
+
+ The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
+
+ ### fish_speech/i18n/core.py
+
+ **Related code from RVC:**
+ [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
+
+ **Initial commit:**
+ add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
+
+ **Initial author:**
+ [@L4Ph](https://github.com/L4Ph)
+
+ ### fish_speech/i18n/scan.py
+
+ **Related code from RVC:**
+ [https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
+
+ **Initial commit:**
+ File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
+
+ **Initial author:**
+ [@towzeur](https://github.com/towzeur)
+
+ We appreciate the contributions of the RVC project and its authors.
fish_speech/i18n/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .core import i18n
+
+ __all__ = ["i18n"]
fish_speech/i18n/core.py ADDED
@@ -0,0 +1,40 @@
+ import json
+ import locale
+ from pathlib import Path
+
+ I18N_FILE_PATH = Path(__file__).parent / "locale"
+ DEFAULT_LANGUAGE = "en_US"
+
+
+ def load_language_list(language):
+     with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
+         language_list = json.load(f)
+
+     return language_list
+
+
+ class I18nAuto:
+     def __init__(self):
+         i18n_file = Path(".locale")
+
+         if i18n_file.exists():
+             with open(i18n_file, "r", encoding="utf-8") as f:
+                 language = f.read().strip()
+         else:
+             # getlocale can't identify the system's language ((None, None))
+             language = locale.getdefaultlocale()[0]
+
+         if (I18N_FILE_PATH / f"{language}.json").exists() is False:
+             language = DEFAULT_LANGUAGE
+
+         self.language = language
+         self.language_map = load_language_list(language)
+
+     def __call__(self, key):
+         return self.language_map.get(key, key)
+
+     def __repr__(self):
+         return "Use Language: " + self.language
+
+
+ i18n = I18nAuto()
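Minimal usage sketch (illustrative only): keys present in the active locale file are translated, and unknown keys fall back to themselves:

```python
from fish_speech.i18n import i18n

print(i18n)                         # e.g. "Use Language: en_US"
print(i18n("Start Training"))       # localized string for the detected locale
print(i18n("a key with no entry"))  # falls back to the key itself
```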
fish_speech/i18n/locale/en_US.json ADDED
@@ -0,0 +1,122 @@
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
5
+ "Accumulate Gradient Batches": "Accumulate Gradient Batches",
6
+ "Add to Processing Area": "Add to Processing Area",
7
+ "Added path successfully!": "Added path successfully!",
8
+ "Advanced Config": "Advanced Config",
9
+ "Base LLAMA Model": "Base LLAMA Model",
10
+ "Batch Inference": "Batch Inference",
11
+ "Batch Size": "Batch Size",
12
+ "Changing with the Model Path": "Changing with the Model Path",
13
+ "Chinese": "Chinese",
14
+ "Compile Model": "Compile Model",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
16
+ "Copy": "Copy",
17
+ "Data Preprocessing": "Data Preprocessing",
18
+ "Data Preprocessing Path": "Data Preprocessing Path",
19
+ "Data Source": "Data Source",
20
+ "Decoder Model Config": "Decoder Model Config",
21
+ "Decoder Model Path": "Decoder Model Path",
22
+ "Disabled": "Disabled",
23
+ "Enable Reference Audio": "Enable Reference Audio",
24
+ "English": "English",
25
+ "Error Message": "Error Message",
26
+ "File Preprocessing": "File Preprocessing",
27
+ "Generate": "Generate",
28
+ "Generated Audio": "Generated Audio",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
30
+ "Infer interface is closed": "Infer interface is closed",
31
+ "Inference Configuration": "Inference Configuration",
32
+ "Inference Server Configuration": "Inference Server Configuration",
33
+ "Inference Server Error": "Inference Server Error",
34
+ "Inferring interface is launched at {}": "Inferring interface is launched at {}",
35
+ "Initial Learning Rate": "Initial Learning Rate",
36
+ "Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
37
+ "Input Text": "Input Text",
38
+ "Invalid path: {}": "Invalid path: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
40
+ "Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
41
+ "Japanese": "Japanese",
42
+ "LLAMA Configuration": "LLAMA Configuration",
43
+ "LLAMA Model Config": "LLAMA Model Config",
44
+ "LLAMA Model Path": "LLAMA Model Path",
45
+ "Labeling Device": "Labeling Device",
46
+ "LoRA Model to be merged": "LoRA Model to be merged",
47
+ "Maximum Audio Duration": "Maximum Audio Duration",
48
+ "Maximum Length per Sample": "Maximum Length per Sample",
49
+ "Maximum Training Steps": "Maximum Training Steps",
50
+ "Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
51
+ "Merge": "Merge",
52
+ "Merge LoRA": "Merge LoRA",
53
+ "Merge successfully": "Merge successfully",
54
+ "Minimum Audio Duration": "Minimum Audio Duration",
55
+ "Model Output Path": "Model Output Path",
56
+ "Model Size": "Model Size",
57
+ "Move": "Move",
58
+ "Move files successfully": "Move files successfully",
59
+ "No audio generated, please check the input text.": "No audio generated, please check the input text.",
60
+ "No selected options": "No selected options",
61
+ "Number of Workers": "Number of Workers",
62
+ "Open Inference Server": "Open Inference Server",
63
+ "Open Labeler WebUI": "Open Labeler WebUI",
64
+ "Open Tensorboard": "Open Tensorboard",
65
+ "Opened labeler in browser": "Opened labeler in browser",
66
+ "Optional Label Language": "Optional Label Language",
67
+ "Optional online ver": "Optional online ver",
68
+ "Output Path": "Output Path",
69
+ "Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
70
+ "Precision": "Precision",
71
+ "Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
72
+ "Put your text here.": "Put your text here.",
73
+ "Reference Audio": "Reference Audio",
74
+ "Reference Text": "Reference Text",
75
+ "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.",
76
+ "Remove Selected Data": "Remove Selected Data",
77
+ "Removed path successfully!": "Removed path successfully!",
78
+ "Repetition Penalty": "Repetition Penalty",
79
+ "Save model every n steps": "Save model every n steps",
80
+ "Select LLAMA ckpt": "Select LLAMA ckpt",
81
+ "Select VITS ckpt": "Select VITS ckpt",
82
+ "Select VQGAN ckpt": "Select VQGAN ckpt",
83
+ "Select source file processing method": "Select source file processing method",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
85
+ "Selected: {}": "Selected: {}",
86
+ "Speaker": "Speaker",
87
+ "Speaker is identified by the folder name": "Speaker is identified by the folder name",
88
+ "Start Training": "Start Training",
89
+ "Streaming Audio": "Streaming Audio",
90
+ "Streaming Generate": "Streaming Generate",
91
+ "Tensorboard Host": "Tensorboard Host",
92
+ "Tensorboard Log Path": "Tensorboard Log Path",
93
+ "Tensorboard Port": "Tensorboard Port",
94
+ "Tensorboard interface is closed": "Tensorboard interface is closed",
95
+ "Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
96
+ "Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
98
+ "Training Configuration": "Training Configuration",
99
+ "Training Error": "Training Error",
100
+ "Training stopped": "Training stopped",
101
+ "Type name of the speaker": "Type name of the speaker",
102
+ "Type the path or select from the dropdown": "Type the path or select from the dropdown",
103
+ "Use LoRA": "Use LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
105
+ "Use filelist": "Use filelist",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
107
+ "VITS Configuration": "VITS Configuration",
108
+ "VQGAN Configuration": "VQGAN Configuration",
109
+ "Validation Batch Size": "Validation Batch Size",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
112
+ "WebUI Host": "WebUI Host",
113
+ "WebUI Port": "WebUI Port",
114
+ "Whisper Model": "Whisper Model",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
117
+ "latest": "latest",
118
+ "new": "new",
119
+ "Realtime Transform Text": "Realtime Transform Text",
120
+ "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
121
+ "Text Normalization": "Text Normalization"
122
+ }
fish_speech/i18n/locale/es_ES.json ADDED
@@ -0,0 +1,122 @@
1
+ {
2
+ "16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
3
+ "5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
4
+ "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
5
+ "Accumulate Gradient Batches": "Acumular lotes de gradientes",
6
+ "Add to Processing Area": "Agregar al Área de Procesamiento",
7
+ "Added path successfully!": "¡Ruta agregada exitosamente!",
8
+ "Advanced Config": "Configuración Avanzada",
9
+ "Base LLAMA Model": "Modelo Base LLAMA",
10
+ "Batch Inference": "Inferencia por Lote",
11
+ "Batch Size": "Tamaño del Lote",
12
+ "Changing with the Model Path": "Cambiando con la Ruta del Modelo",
13
+ "Chinese": "Chino",
14
+ "Compile Model": "Compilar Modelo",
15
+ "Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
16
+ "Copy": "Copiar",
17
+ "Data Preprocessing": "Preprocesamiento de Datos",
18
+ "Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
19
+ "Data Source": "Fuente de Datos",
20
+ "Decoder Model Config": "Configuración del modelo decodificador",
21
+ "Decoder Model Path": "Ruta del modelo decodificador",
22
+ "Disabled": "Desactivado",
23
+ "Enable Reference Audio": "Habilitar Audio de Referencia",
24
+ "English": "Inglés",
25
+ "Error Message": "Mensaje de Error",
26
+ "File Preprocessing": "Preprocesamiento de Archivos",
27
+ "Generate": "Generar",
28
+ "Generated Audio": "Audio Generado",
29
+ "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
30
+ "Infer interface is closed": "La interfaz de inferencia está cerrada",
31
+ "Inference Configuration": "Configuración de Inferencia",
32
+ "Inference Server Configuration": "Configuración del Servidor de Inferencia",
33
+ "Inference Server Error": "Error del Servidor de Inferencia",
34
+ "Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
35
+ "Initial Learning Rate": "Tasa de Aprendizaje Inicial",
36
+ "Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
37
+ "Input Text": "Texto de Entrada",
38
+ "Invalid path: {}": "Ruta inválida: {}",
39
+ "It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
40
+ "Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
41
+ "Japanese": "Japonés",
42
+ "LLAMA Configuration": "Configuración de LLAMA",
43
+ "LLAMA Model Config": "Configuración del Modelo LLAMA",
44
+ "LLAMA Model Path": "Ruta del Modelo LLAMA",
45
+ "Labeling Device": "Dispositivo de Etiquetado",
46
+ "LoRA Model to be merged": "Modelo LoRA a fusionar",
47
+ "Maximum Audio Duration": "Duración máxima de audio",
48
+ "Maximum Length per Sample": "Longitud Máxima por Muestra",
49
+ "Maximum Training Steps": "Pasos Máximos de Entrenamiento",
50
+ "Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
51
+ "Merge": "Fusionar",
52
+ "Merge LoRA": "Fusionar LoRA",
53
+ "Merge successfully": "Fusionado exitosamente",
54
+ "Minimum Audio Duration": "Duración mínima de audio",
55
+ "Model Output Path": "Ruta de Salida del Modelo",
56
+ "Model Size": "Tamaño del Modelo",
57
+ "Move": "Mover",
58
+ "Move files successfully": "Archivos movidos exitosamente",
59
+ "No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
60
+ "No selected options": "No hay opciones seleccionadas",
61
+ "Number of Workers": "Número de Trabajadores",
62
+ "Open Inference Server": "Abrir Servidor de Inferencia",
63
+ "Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
64
+ "Open Tensorboard": "Abrir Tensorboard",
65
+ "Opened labeler in browser": "Se abrió el etiquetador en el navegador",
66
+ "Optional Label Language": "Idioma de Etiquetado Opcional",
67
+ "Optional online ver": "Ver en línea opcional",
68
+ "Output Path": "Ruta de Salida",
69
+ "Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
70
+ "Precision": "Precisión",
71
+ "Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
72
+ "Put your text here.": "Ponga su texto aquí.",
73
+ "Reference Audio": "Audio de Referencia",
74
+ "Reference Text": "Texto de Referencia",
75
+ "Related code are released under BSD-3-Clause License, and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado se publica bajo la Licencia BSD-3-Clause, y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
76
+ "Remove Selected Data": "Eliminar Datos Seleccionados",
77
+ "Removed path successfully!": "¡Ruta eliminada exitosamente!",
78
+ "Repetition Penalty": "Penalización por Repetición",
79
+ "Save model every n steps": "Guardar modelo cada n pasos",
80
+ "Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
81
+ "Select VITS ckpt": "Seleccionar punto de control VITS",
82
+ "Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
83
+ "Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
84
+ "Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
85
+ "Selected: {}": "Seleccionado: {}",
86
+ "Speaker": "Hablante",
87
+ "Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
88
+ "Start Training": "Iniciar Entrenamiento",
89
+ "Streaming Audio": "transmisión de audio",
90
+ "Streaming Generate": "síntesis en flujo",
91
+ "Tensorboard Host": "Host de Tensorboard",
92
+ "Tensorboard Log Path": "Ruta de Registro de Tensorboard",
93
+ "Tensorboard Port": "Puerto de Tensorboard",
94
+ "Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
95
+ "Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
96
+ "Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
97
+ "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
98
+ "Training Configuration": "Configuración de Entrenamiento",
99
+ "Training Error": "Error de Entrenamiento",
100
+ "Training stopped": "Entrenamiento detenido",
101
+ "Type name of the speaker": "Escriba el nombre del hablante",
102
+ "Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
103
+ "Use LoRA": "Usar LoRA",
104
+ "Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
105
+ "Use filelist": "Usar lista de archivos",
106
+ "Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
107
+ "VITS Configuration": "Configuración de VITS",
108
+ "VQGAN Configuration": "Configuración de VQGAN",
109
+ "Validation Batch Size": "Tamaño del Lote de Validación",
110
+ "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
111
+ "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
112
+ "WebUI Host": "Host de WebUI",
113
+ "WebUI Port": "Puerto de WebUI",
114
+ "Whisper Model": "Modelo Whisper",
115
+ "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
116
+ "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
117
+ "latest": "más reciente",
118
+ "new": "nuevo",
119
+ "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
120
+ "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
121
+ "Text Normalization": "Normalización de Texto"
122
+ }