Files changed (4)
  1. README.md +1 -1
  2. app.py +149 -67
  3. makefile +9 -0
  4. requirements.txt +1 -2
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🐸
 colorFrom: green
 colorTo: red
 sdk: gradio
-sdk_version: 3.44.2
+sdk_version: 3.44.3
 app_file: app.py
 pinned: false
 models:
app.py CHANGED
@@ -1,51 +1,40 @@
-import sys
-import os
-#os.system("pip uninstall -y gradio")
-#os.system("pip install --upgrade gradio==3.24.0")
+import torch

 import gradio as gr
 from TTS.api import TTS

 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
-tts.to("cuda")
-
-
-def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
-    if agree == True:
-        if use_mic == True:
-            if mic_file_path is not None:
-                speaker_wav=mic_file_path
-            else:
-                gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
-                return (
-                    None,
-                    None,
-                )
-
-        else:
-            speaker_wav=audio_file_pth
-
-        if len(prompt)<2:
-            gr.Warning("Please give a longer prompt text")
-            return (
-                None,
-                None,
-            )
-        try:
-            tts.tts_to_file(
-                text=prompt,
-                file_path="output.wav",
-                speaker_wav=speaker_wav,
-                language=language,
-            )
-        except RuntimeError as e:
-            if "device-side" in e.message:
-                # cannot do anything on cuda device side error, need tor estart
-                gr.Warning("Unhandled Exception encounter, please retry in a minute")
-                print("Cuda device-assert Runtime encountered need restart")
-                print(e.message)
-                sys.exit("Exit due to cuda device-assert")
-            raise
+device = "cuda" if torch.cuda.is_available() else "cpu"
+tts.to(device)
+
+
+def predict(prompt, language, speaker_wav, agree=False):
+    """
+    Main body function to run inference, with light checks to ensure valid arguments are passed to the model.
+
+    Args:
+        prompt (`str`, required):
+            Text prompt to the model.
+        language (`str`, required):
+            Language for inference.
+        speaker_wav (`str`, required):
+            Path to the speaker prompt audio file.
+        agree (`bool`, required, defaults to `False`):
+            Whether or not the model terms have been agreed to.
+    Returns:
+        tuple of (waveform_visual, synthesised_audio):
+            Video animation of the output speech, and audio file.
+    """
+    if agree:
+        if len(prompt) < 2:
+            raise gr.Error("Please give a longer text prompt")
+
+        tts.tts_to_file(
+            text=prompt,
+            file_path="output.wav",
+            speaker_wav=speaker_wav,
+            language=language,
+        )

         return (
             gr.make_waveform(
@@ -54,11 +43,10 @@ def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, agree):
             "output.wav",
         )
     else:
-        gr.Warning("Please accept the Terms & Condition!")
-        return (
-            None,
-            None,
-        )
+        gr.Warning(
+            "Please accept the Terms & Conditions of the model by checking the box!"
+        )
+        return ()


 title = "Coqui🐸 XTTS"
@@ -66,10 +54,11 @@ title = "Coqui🐸 XTTS"
 description = """
 <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 3-second audio clip.
 <br/>
-Built on Tortoise, XTTS has important model changes that make cross-language voice cloning and multi-lingual speech generation super easy.
+XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
 <br/>
-This is the same model that powers Coqui Studio, and Coqui API, however we apply a few tricks to make it faster and support streaming inference.
+This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
 <br/>
+Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
 <br/>
 <p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
 <br/>
@@ -89,37 +78,77 @@ examples = [
         "Once when I was six years old I saw a magnificent picture",
         "en",
         "examples/female.wav",
-        None,
-        False,
         True,
     ],
     [
         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
         "fr",
         "examples/male.wav",
-        None,
-        False,
+        True,
+    ],
+    [
+        "Als ich sechs war, sah ich einmal ein wunderbares Bild",
+        "de",
+        "examples/female.wav",
+        True,
+    ],
+    [
+        "Cuando tenía seis años, vi una vez una imagen magnífica",
+        "es",
+        "examples/male.wav",
+        True,
+    ],
+    [
+        "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
+        "pt",
+        "examples/female.wav",
+        True,
+    ],
+    [
+        "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
+        "pl",
+        "examples/male.wav",
         True,
     ],
     [
         "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
         "it",
         "examples/female.wav",
-        None,
-        False,
         True,
     ],
     [
         "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
         "tr",
         "examples/female.wav",
-        None,
-        False,
+        True,
+    ],
+    [
+        "Когда мне было шесть лет, я увидел однажды удивительную картинку",
+        "ru",
+        "examples/female.wav",
+        True,
+    ],
+    [
+        "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
+        "nl",
+        "examples/male.wav",
+        True,
+    ],
+    [
+        "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
+        "cs",
+        "examples/female.wav",
+        True,
+    ],
+    [
+        "当我还只有六岁的时候, 看到了一副精彩的插画",
+        "zh-cn",
+        "examples/female.wav",
         True,
     ],
 ]

-gr.Interface(
+audio_upload = gr.Interface(
     fn=predict,
     inputs=[
         gr.Textbox(
@@ -141,7 +170,7 @@ gr.Interface(
             "tr",
             "ru",
             "nl",
-            "cz",
+            "cs",
             "ar",
             "zh-cn",
         ],
@@ -153,12 +182,8 @@ gr.Interface(
             info="Click on the ✎ button to upload your own target speaker audio",
             type="filepath",
             value="examples/female.wav",
+            source="upload",
         ),
-        gr.Audio(source="microphone",
-                 type="filepath",
-                 info="Use your microphone to record audio",
-                 label="Use Microphone for Reference"),
-        gr.Checkbox(label="Check to use Microphone as Reference", value=False),
         gr.Checkbox(
             label="Agree",
             value=False,
@@ -169,8 +194,65 @@ gr.Interface(
         gr.Video(label="Waveform Visual"),
         gr.Audio(label="Synthesised Audio"),
     ],
-    title=title,
     description=description,
     article=article,
     examples=examples,
-).queue().launch(debug=True)
+)
+
+microphone = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(
+            label="Text Prompt",
+            info="One or two sentences at a time is better",
+            value="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+        ),
+        gr.Dropdown(
+            label="Language",
+            info="Select an output language for the synthesised speech",
+            choices=[
+                "en",
+                "es",
+                "fr",
+                "de",
+                "it",
+                "pt",
+                "pl",
+                "tr",
+                "ru",
+                "nl",
+                "cs",
+                "ar",
+                "zh-cn",
+            ],
+            max_choices=1,
+            value="en",
+        ),
+        gr.Audio(
+            label="Reference Audio",
+            info="Record your own target speaker audio",
+            type="filepath",
+            source="microphone",
+        ),
+        gr.Checkbox(
+            label="Agree",
+            value=False,
+            info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
+        ),
+    ],
+    outputs=[
+        gr.Video(label="Waveform Visual"),
+        gr.Audio(label="Synthesised Audio"),
+    ],
+    description=description,
+    article=article,
+)
+
+demo = gr.Blocks()
+
+with demo:
+    gr.TabbedInterface(
+        [audio_upload, microphone], ["Audio file", "Microphone"], title=title
+    )
+
+demo.queue().launch(debug=True)
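
Note: the calls below are restated from the new app.py above as a minimal sketch for smoke-testing the XTTS model locally without the Gradio UI; it assumes the reference clip `examples/female.wav` from this repo is available.

```python
import torch
from TTS.api import TTS

# Instantiate the XTTS v1 model, exactly as the updated app.py does.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
tts.to("cuda" if torch.cuda.is_available() else "cpu")

# Clone the reference speaker and synthesise English speech to output.wav.
tts.tts_to_file(
    text="Once when I was six years old I saw a magnificent picture",
    file_path="output.wav",
    speaker_wav="examples/female.wav",  # short reference clip of the target voice
    language="en",
)
```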
makefile ADDED
@@ -0,0 +1,9 @@
+check_dirs := .
+
+quality:
+	black --check $(check_dirs)
+	ruff $(check_dirs)
+
+style:
+	black $(check_dirs)
+	ruff $(check_dirs) --fix
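
Usage note: with this makefile, `make quality` runs the checks (`black --check` and `ruff`) without modifying files, while `make style` applies the `black` and `ruff --fix` changes in place.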
requirements.txt CHANGED
@@ -1,2 +1 @@
-TTS==0.17.1
-gradio==3.41.2
+TTS==0.17.1