Files changed (3)
  1. README.md +3 -3
  2. app.py +192 -431
  3. requirements.txt +57 -10
README.md CHANGED
@@ -4,11 +4,11 @@ emoji: 🐸
 colorFrom: green
 colorTo: red
 sdk: gradio
-sdk_version: 3.48.0
+sdk_version: 3.44.3
 app_file: app.py
 pinned: false
 models:
-- coqui/XTTS-v2
+- coqui/XTTS-v1
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -3,26 +3,16 @@ import io, os, stat
 import subprocess
 import random
 from zipfile import ZipFile
-import uuid
+import uuid
 import time
 import torch
 import torchaudio
-
-
-#download for mecab
-os.system('python -m unidic download')
-
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 
 # langid is used to detect language for longer text
 # Most users expect text to be their own language, there is checkbox to disable it
-import langid
-import base64
-import csv
-from io import StringIO
-import datetime
-import re
+import langid
 
 import gradio as gr
 from scipy.io.wavfile import write
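Note on the `langid` import kept by this hunk: `langid.classify` returns a `(language_code, score)` tuple, which is why `predict()` below takes element `[0]`. A minimal sketch, assuming the `langid` package pinned in requirements.txt:

```python
# Minimal langid usage matching the guard in predict();
# classify() returns (language_code, score).
import langid

code, score = langid.classify("Lorsque j'avais six ans j'ai vu une magnifique image")
print(code)  # expected: "fr"
```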
@@ -45,215 +35,153 @@ repo_id = "coqui/xtts"
 print("Export newer ffmpeg binary for denoise filter")
 ZipFile("ffmpeg.zip").extractall()
 print("Make ffmpeg binary executable")
-st = os.stat("ffmpeg")
-os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
+st = os.stat('ffmpeg')
+os.chmod('ffmpeg', st.st_mode | stat.S_IEXEC)
 
-# This will trigger downloading model
-print("Downloading if not downloaded Coqui XTTS V2")
-from TTS.utils.manage import ModelManager
-
-model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
-ModelManager().download_model(model_name)
-model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
-print("XTTS downloaded")
+# Load TTS
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
 
+model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
 config = XttsConfig()
 config.load_json(os.path.join(model_path, "config.json"))
-
 model = Xtts.init_from_config(config)
 model.load_checkpoint(
     config,
     checkpoint_path=os.path.join(model_path, "model.pth"),
     vocab_path=os.path.join(model_path, "vocab.json"),
     eval=True,
-    use_deepspeed=True,
+    use_deepspeed=True
 )
 model.cuda()
 
 # This is for debugging purposes only
-DEVICE_ASSERT_DETECTED = 0
-DEVICE_ASSERT_PROMPT = None
-DEVICE_ASSERT_LANG = None
-
-supported_languages = config.languages
+DEVICE_ASSERT_DETECTED=0
+DEVICE_ASSERT_PROMPT=None
+DEVICE_ASSERT_LANG=None
 
-def predict(
-    prompt,
-    language,
-    audio_file_pth,
-    mic_file_path,
-    use_mic,
-    voice_cleanup,
-    no_lang_auto_detect,
-    agree,
-):
+def predict(prompt, language, audio_file_pth, mic_file_path, use_mic, voice_cleanup, no_lang_auto_detect, agree,):
     if agree == True:
+        supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
+
         if language not in supported_languages:
-            gr.Warning(
-                f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown"
-            )
-
+            gr.Warning(f"Language you put {language} in is not in is not in our Supported Languages, please choose from dropdown")
+
             return (
-                None,
-                None,
-                None,
-                None,
-            )
+                None,
+                None,
+                None,
+                None,
+            )
 
-        language_predicted = langid.classify(prompt)[
-            0
-        ].strip()  # strip need as there is space at end!
+        language_predicted = langid.classify(prompt)[0].strip() # strip need as there is space at end!
 
         # tts expects chinese as zh-cn
-        if language_predicted == "zh":
-            # we use zh-cn
+        if language_predicted == "zh":
+            #we use zh-cn
             language_predicted = "zh-cn"
-
         print(f"Detected language:{language_predicted}, Chosen language:{language}")
 
         # After text character length 15 trigger language detection
-        if len(prompt) > 15:
+        if len(prompt)>15:
             # allow any language for short text as some may be common
             # If user unchecks language autodetection it will not trigger
             # You may remove this completely for own use
             if language_predicted != language and not no_lang_auto_detect:
-                # Please duplicate and remove this check if you really want this
-                # Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
-                gr.Warning(
-                    f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox"
-                )
-
+                #Please duplicate and remove this check if you really want this
+                #Or auto-detector fails to identify language (which it can on pretty short text or mixed text)
+                gr.Warning(f"It looks like your text isn’t the language you chose , if you’re sure the text is the same language you chose, please check disable language auto-detection checkbox" )
+
                 return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
+                    None,
+                    None,
+                    None,
+                    None,
+                )
 
+
         if use_mic == True:
             if mic_file_path is not None:
-                speaker_wav = mic_file_path
+                speaker_wav=mic_file_path
             else:
-                gr.Warning(
-                    "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
-                )
+                gr.Warning("Please record your voice with Microphone, or uncheck Use Microphone to use reference audios")
                 return (
                     None,
                     None,
                     None,
                     None,
-                )
-
+                )
+
         else:
-            speaker_wav = audio_file_pth
+            speaker_wav=audio_file_pth
 
+
         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
         # This is fast filtering not perfect
 
         # Apply all on demand
-        lowpassfilter = denoise = trim = loudness = True
-
+        lowpassfilter=denoise=trim=loudness=True
+
         if lowpassfilter:
-            lowpass_highpass = "lowpass=8000,highpass=75,"
+            lowpass_highpass="lowpass=8000,highpass=75,"
         else:
-            lowpass_highpass = ""
+            lowpass_highpass=""
 
         if trim:
             # better to remove silence in beginning and end for microphone
-            trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
+            trim_silence="areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
         else:
-            trim_silence = ""
-
-        if voice_cleanup:
+            trim_silence=""
+
+        if (voice_cleanup):
             try:
-                out_filename = (
-                    speaker_wav + str(uuid.uuid4()) + ".wav"
-                )  # ffmpeg to know output format
-
-                # we will use newer ffmpeg as that has afftn denoise filter
-                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(
-                    " "
-                )
-
-                command_result = subprocess.run(
-                    [item for item in shell_command],
-                    capture_output=False,
-                    text=True,
-                    check=True,
-                )
-                speaker_wav = out_filename
+                out_filename = speaker_wav + str(uuid.uuid4()) + ".wav" #ffmpeg to know output format
+
+                #we will use newer ffmpeg as that has afftn denoise filter
+                shell_command = f"./ffmpeg -y -i {speaker_wav} -af {lowpass_highpass}{trim_silence} {out_filename}".split(" ")
+
+                command_result = subprocess.run([item for item in shell_command], capture_output=False,text=True, check=True)
+                speaker_wav=out_filename
                 print("Filtered microphone input")
             except subprocess.CalledProcessError:
                 # There was an error - command exited with non-zero code
                 print("Error: failed filtering, use original microphone input")
         else:
-            speaker_wav = speaker_wav
+            speaker_wav=speaker_wav
 
-        if len(prompt) < 2:
+        if len(prompt)<2:
             gr.Warning("Please give a longer prompt text")
             return (
-                None,
-                None,
-                None,
-                None,
-            )
-        if len(prompt) > 200:
-            gr.Warning(
-                "Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage"
-            )
+                None,
+                None,
+                None,
+                None,
+            )
+        if len(prompt)>200:
+            gr.Warning("Text length limited to 200 characters for this demo, please try shorter text. You can clone this space and edit code for your own usage")
             return (
-                None,
-                None,
-                None,
-                None,
-            )
+                None,
+                None,
+                None,
+                None,
+            )
         global DEVICE_ASSERT_DETECTED
         if DEVICE_ASSERT_DETECTED:
             global DEVICE_ASSERT_PROMPT
            global DEVICE_ASSERT_LANG
-            # It will likely never come here as we restart space on first unrecoverable error now
-            print(
-                f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}"
-            )
-
-            # HF Space specific.. This error is unrecoverable need to restart space
-            space = api.get_space_runtime(repo_id=repo_id)
-            if space.stage!="BUILDING":
-                api.restart_space(repo_id=repo_id)
-            else:
-                print("TRIED TO RESTART but space is building")
-
-        try:
-            metrics_text = ""
-            t_latent = time.time()
-
+            #It will likely never come here as we restart space on first unrecoverable error now
+            print(f"Unrecoverable exception caused by language:{DEVICE_ASSERT_LANG} prompt:{DEVICE_ASSERT_PROMPT}")
+
+        try:
+            metrics_text=""
+            t_latent=time.time()
+
             # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
-            try:
-                (
-                    gpt_cond_latent,
-                    speaker_embedding,
-                ) = model.get_conditioning_latents(audio_path=speaker_wav, gpt_cond_len=30, gpt_cond_chunk_len=4, max_ref_length=60)
-            except Exception as e:
-                print("Speaker encoding error", str(e))
-                gr.Warning(
-                    "It appears something wrong with reference, did you unmute your microphone?"
-                )
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
-
+            gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav)
             latent_calculation_time = time.time() - t_latent
-            # metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
-
-            # temporary comma fix
-            prompt= re.sub("([^\x00-\x7F]|\w)(\.|\。|\?)",r"\1 \2\2",prompt)
-
-            wav_chunks = []
-            ## Direct mode
+            #metrics_text=f"Embedding calculation time: {latent_calculation_time:.2f} seconds\n"
 
+            wav_chunks = []
+
             print("I: Generating new audio...")
             t0 = time.time()
             out = model.inference(
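The voice-cleanup block kept by this hunk builds an ffmpeg filter chain: `lowpass=8000,highpass=75` band-limits the reference audio, and each `areverse,silenceremove` pair trims silence from one end (reverse the clip, strip what is now leading silence, i.e. the original tail, then reverse back and strip the real leading silence). A standalone sketch of the same call, with illustrative file names, the binary resolved from PATH instead of `./ffmpeg`, and the arguments passed as a list, which avoids the fragile `.split(" ")` on paths containing spaces:

```python
# Sketch of the reference-audio cleanup performed above (assumes an
# ffmpeg binary on PATH; file names are illustrative).
import subprocess

def clean_reference(in_wav: str, out_wav: str) -> None:
    filters = (
        "lowpass=8000,highpass=75,"                 # band-limit the signal
        "areverse,"                                 # reverse: tail becomes head
        "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,"
        "areverse,"                                 # restore original order
        "silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
    )
    # check=True raises CalledProcessError on a non-zero exit code,
    # mirroring the except branch in predict().
    subprocess.run(["ffmpeg", "-y", "-i", in_wav, "-af", filters, out_wav],
                   check=True)
```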
@@ -261,8 +189,7 @@ def predict(
                 language,
                 gpt_cond_latent,
                 speaker_embedding,
-                repetition_penalty=5.0,
-                temperature=0.75,
+                diffusion_conditioning
             )
             inference_time = time.time() - t0
             print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
@@ -271,125 +198,24 @@ def predict(
             print(f"Real-time factor (RTF): {real_time_factor}")
             metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
-
-
-            """
-            print("I: Generating new audio in streaming mode...")
-            t0 = time.time()
-            chunks = model.inference_stream(
-                prompt,
-                language,
-                gpt_cond_latent,
-                speaker_embedding,
-                repetition_penalty=7.0,
-                temperature=0.85,
-            )
-
-            first_chunk = True
-            for i, chunk in enumerate(chunks):
-                if first_chunk:
-                    first_chunk_time = time.time() - t0
-                    metrics_text += f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
-                    first_chunk = False
-                wav_chunks.append(chunk)
-                print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-            inference_time = time.time() - t0
-            print(
-                f"I: Time to generate audio: {round(inference_time*1000)} milliseconds"
-            )
-            #metrics_text += (
-            #    f"Time to generate audio: {round(inference_time*1000)} milliseconds\n"
-            #)
-
-            wav = torch.cat(wav_chunks, dim=0)
-            print(wav.shape)
-            real_time_factor = (time.time() - t0) / wav.shape[0] * 24000
-            print(f"Real-time factor (RTF): {real_time_factor}")
-            metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
-
-            torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
-            """
-
-        except RuntimeError as e:
+
+        except RuntimeError as e :
             if "device-side assert" in str(e):
                 # cannot do anything on cuda device side error, need tor estart
-                print(
-                    f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}",
-                    flush=True,
-                )
+                print(f"Exit due to: Unrecoverable exception caused by language:{language} prompt:{prompt}", flush=True)
                 gr.Warning("Unhandled Exception encounter, please retry in a minute")
                 print("Cuda device-assert Runtime encountered need restart")
                 if not DEVICE_ASSERT_DETECTED:
-                    DEVICE_ASSERT_DETECTED = 1
-                    DEVICE_ASSERT_PROMPT = prompt
-                    DEVICE_ASSERT_LANG = language
-
-                    # just before restarting save what caused the issue so we can handle it in future
-                    # Uploading Error data only happens for unrecovarable error
-                    error_time = datetime.datetime.now().strftime("%d-%m-%Y-%H:%M:%S")
-                    error_data = [
-                        error_time,
-                        prompt,
-                        language,
-                        audio_file_pth,
-                        mic_file_path,
-                        use_mic,
-                        voice_cleanup,
-                        no_lang_auto_detect,
-                        agree,
-                    ]
-                    error_data = [str(e) if type(e) != str else e for e in error_data]
-                    print(error_data)
-                    print(speaker_wav)
-                    write_io = StringIO()
-                    csv.writer(write_io).writerows([error_data])
-                    csv_upload = write_io.getvalue().encode()
-
-                    filename = error_time + "_" + str(uuid.uuid4()) + ".csv"
-                    print("Writing error csv")
-                    error_api = HfApi()
-                    error_api.upload_file(
-                        path_or_fileobj=csv_upload,
-                        path_in_repo=filename,
-                        repo_id="coqui/xtts-flagged-dataset",
-                        repo_type="dataset",
-                    )
+                    DEVICE_ASSERT_DETECTED=1
+                    DEVICE_ASSERT_PROMPT=prompt
+                    DEVICE_ASSERT_LANG=language
 
-                # speaker_wav
-                print("Writing error reference audio")
-                speaker_filename = (
-                    error_time + "_reference_" + str(uuid.uuid4()) + ".wav"
-                )
-                error_api = HfApi()
-                error_api.upload_file(
-                    path_or_fileobj=speaker_wav,
-                    path_in_repo=speaker_filename,
-                    repo_id="coqui/xtts-flagged-dataset",
-                    repo_type="dataset",
-                )
-
-                # HF Space specific.. This error is unrecoverable need to restart space
-                space = api.get_space_runtime(repo_id=repo_id)
-                if space.stage!="BUILDING":
-                    api.restart_space(repo_id=repo_id)
-                else:
-                    print("TRIED TO RESTART but space is building")
-
+
+                # HF Space specific.. This error is unrecoverable need to restart space
+                api.restart_space(repo_id=repo_id)
             else:
-                if "Failed to decode" in str(e):
-                    print("Speaker encoding error", str(e))
-                    gr.Warning(
-                        "It appears something wrong with reference, did you unmute your microphone?"
-                    )
-                else:
-                    print("RuntimeError: non device-side assert error:", str(e))
-                    gr.Warning("Something unexpected happened please retry again.")
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
+                print("RuntimeError: non device-side assert error:", str(e))
+                raise e
         return (
             gr.make_waveform(
                 audio="output.wav",
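Both versions recover from CUDA device-side asserts by restarting the Space through `huggingface_hub`; the side being removed additionally skipped the restart while a build was in progress. A condensed sketch of that guard, assuming a write-capable token in the `HF_TOKEN` environment variable and that the code runs inside the Space itself:

```python
# Sketch of the Space self-restart path (huggingface_hub); repo_id
# mirrors the `repo_id = "coqui/xtts"` defined earlier in app.py.
import os
from huggingface_hub import HfApi

api = HfApi(token=os.environ.get("HF_TOKEN"))
runtime = api.get_space_runtime(repo_id="coqui/xtts")
if runtime.stage != "BUILDING":  # don't restart a Space that is mid-build
    api.restart_space(repo_id="coqui/xtts")
```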
@@ -401,45 +227,45 @@ def predict(
     else:
         gr.Warning("Please accept the Terms & Condition!")
         return (
-            None,
-            None,
-            None,
-            None,
-        )
+            None,
+            None,
+            None,
+            None,
+        )
 
 
 title = "Coqui🐸 XTTS"
 
 description = """
+<div>
+<a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
+<a style='display:inline-block' href='https://discord.gg/5eXr5seRrv'><img src='https://discord.com/api/guilds/1037326658807533628/widget.png?style=shield' /></a>
+<a href="https://huggingface.co/spaces/coqui/xtts?duplicate=true">
+<img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
+</div>
 
+<a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
 <br/>
-
-This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
-
+XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
 <br/>
-
-Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
-
+This is the same model that powers our creator application <a href="https://coqui.ai">Coqui Studio</a> as well as the <a href="https://docs.coqui.ai">Coqui API</a>. In production we apply modifications to make low-latency streaming possible.
 <br/>
-"""
-
-links = """
-<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
-
-| | |
-| ------------------------------- | --------------------------------------- |
-| 🐸💬 **CoquiTTS** | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
-| 💼 **Documentation** | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
-| 👩‍💻 **Questions** | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
-| 🗯 **Community** | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv) |
-
-
+Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
+<br/>
+<p>For faster inference without waiting in the queue, you should duplicate this space and upgrade to GPU via the settings.
+<br/>
+</p>
+<p>Language Selectors:
+Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
+Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
+Russian: ru, Spanish: es, Turkish: tr <br/>
+</p>
+<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=8946ef36-c454-4a8e-a9c9-8a8dd735fabd" />
 """
 
 article = """
 <div style='margin:20px auto;'>
     <p>By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml</p>
-    <p>We collect data only for error cases for improvement.</p>
 </div>
 """
 examples = [
@@ -452,6 +278,7 @@ examples = [
         False,
         False,
         True,
+
     ],
     [
         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
@@ -508,7 +335,7 @@ examples = [
         "it",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -528,7 +355,7 @@ examples = [
         "ru",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -538,7 +365,7 @@ examples = [
         "nl",
         "examples/male.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -548,7 +375,7 @@ examples = [
         "cs",
         "examples/female.wav",
         None,
-        False,
+        False,
         False,
         False,
         True,
@@ -558,146 +385,80 @@ examples = [
         "zh-cn",
         "examples/female.wav",
         None,
+        False,
         False,
         False,
-        False,
-        True,
-    ],
-    [
-        "かつて 六歳のとき、素晴らしい絵を見ました",
-        "ja",
-        "examples/female.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-    [
-        "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
-        "ko",
-        "examples/female.wav",
-        None,
-        False,
-        True,
-        False,
-        True,
-    ],
-    [
-        "Egyszer hat éves koromban láttam egy csodálatos képet",
-        "hu",
-        "examples/male.wav",
-        None,
-        False,
-        True,
-        False,
         True,
     ],
 ]
 
 
 
-with gr.Blocks(analytics_enabled=False) as demo:
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown(
-                """
-                ## <img src="https://raw.githubusercontent.com/coqui-ai/TTS/main/images/coqui-log-green-TTS.png" height="56"/>
-                """
-            )
-        with gr.Column():
-            # placeholder to align the image
-            pass
-
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown(description)
-        with gr.Column():
-            gr.Markdown(links)
-
-    with gr.Row():
-        with gr.Column():
-            input_text_gr = gr.Textbox(
-                label="Text Prompt",
-                info="One or two sentences at a time is better. Up to 200 text characters.",
-                value="Hi there, I'm your new voice clone. Try your best to upload quality audio.",
-            )
-            language_gr = gr.Dropdown(
-                label="Language",
-                info="Select an output language for the synthesised speech",
-                choices=[
-                    "en",
-                    "es",
-                    "fr",
-                    "de",
-                    "it",
-                    "pt",
-                    "pl",
-                    "tr",
-                    "ru",
-                    "nl",
-                    "cs",
-                    "ar",
-                    "zh-cn",
-                    "ja",
-                    "ko",
-                    "hu",
-                    "hi"
-                ],
-                max_choices=1,
-                value="en",
-            )
-            ref_gr = gr.Audio(
-                label="Reference Audio",
-                info="Click on the button to upload your own target speaker audio",
-                type="filepath",
-                value="examples/female.wav",
-            )
-            mic_gr = gr.Audio(
-                source="microphone",
-                type="filepath",
-                info="Use your microphone to record audio",
-                label="Use Microphone for Reference",
-            )
-            use_mic_gr = gr.Checkbox(
-                label="Use Microphone",
-                value=False,
-                info="Notice: Microphone input may not work properly under traffic",
-            )
-            clean_ref_gr = gr.Checkbox(
-                label="Cleanup Reference Voice",
-                value=False,
-                info="This check can improve output if your microphone or reference voice is noisy",
-            )
-            auto_det_lang_gr = gr.Checkbox(
-                label="Do not use language auto-detect",
-                value=False,
-                info="Check to disable language auto-detection",
-            )
-            tos_gr = gr.Checkbox(
-                label="Agree",
-                value=False,
-                info="I agree to the terms of the CPML: https://coqui.ai/cpml",
-            )
-
-            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
-
-
-        with gr.Column():
-            video_gr = gr.Video(label="Waveform Visual")
-            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
-            out_text_gr = gr.Text(label="Metrics")
-            ref_audio_gr = gr.Audio(label="Reference Audio Used")
-
-    with gr.Row():
-        gr.Examples(examples,
-                    label="Examples",
-                    inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
-                    outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr],
-                    fn=predict,
-                    cache_examples=False,)
-
-    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[video_gr, audio_gr, out_text_gr, ref_audio_gr])
-
-demo.queue()
-demo.launch(debug=True, show_api=True)
+gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Textbox(
+            label="Text Prompt",
+            info="One or two sentences at a time is better. Up to 200 text characters.",
+            value="Hi there, I'm your new voice clone. Try your best to upload quality audio",
+        ),
+        gr.Dropdown(
+            label="Language",
+            info="Select an output language for the synthesised speech",
+            choices=[
+                "en",
+                "es",
+                "fr",
+                "de",
+                "it",
+                "pt",
+                "pl",
+                "tr",
+                "ru",
+                "nl",
+                "cs",
+                "ar",
+                "zh-cn",
+            ],
+            max_choices=1,
+            value="en",
+        ),
+        gr.Audio(
+            label="Reference Audio",
+            info="Click on the ✎ button to upload your own target speaker audio",
+            type="filepath",
+            value="examples/female.wav",
+        ),
+        gr.Audio(source="microphone",
+                 type="filepath",
+                 info="Use your microphone to record audio",
+                 label="Use Microphone for Reference"),
+        gr.Checkbox(label="Use Microphone",
+                    value=False,
+                    info="Notice: Microphone input may not work properly under traffic",),
+        gr.Checkbox(label="Cleanup Reference Voice",
+                    value=False,
+                    info="This check can improve output if your microphone or reference voice is noisy",
+        ),
+        gr.Checkbox(label="Do not use language auto-detect",
+                    value=False,
+                    info="Check to disable language auto-detection",),
+        gr.Checkbox(
+            label="Agree",
+            value=False,
+            info="I agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml",
+        ),
+
+
+    ],
+    outputs=[
+        gr.Video(label="Waveform Visual"),
+        gr.Audio(label="Synthesised Audio",autoplay=True),
+        gr.Text(label="Metrics"),
+        gr.Audio(label="Reference Audio Used"),
+    ],
+    title=title,
+    description=description,
+    article=article,
+    examples=examples,
+).queue().launch(debug=True,show_api=False)
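This final hunk swaps the hand-rolled `gr.Blocks` layout for a single `gr.Interface`, which wires an input list and an output list to one function and renders a default layout, with `.queue()` providing the same request queueing the Blocks version got from `demo.queue()`. The minimal Gradio 3.x shape, with a stand-in function instead of `predict`:

```python
# Minimal sketch of the gr.Interface wiring used above (Gradio 3.x).
import gradio as gr

def shout(text: str) -> str:  # stand-in for predict()
    return text.upper()

gr.Interface(
    fn=shout,
    inputs=gr.Textbox(label="Text Prompt"),
    outputs=gr.Text(label="Result"),
).queue().launch()
```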
requirements.txt CHANGED
@@ -1,12 +1,59 @@
 # Preinstall requirements from TTS
-TTS @ git+https://github.com/coqui-ai/TTS@v0.21.1
-pydantic==1.10.13
-python-multipart==0.0.6
-typing-extensions>=4.8.0
-cutlet
-mecab-python3==1.0.6
-unidic-lite==1.0.8
-unidic==1.1.0
+torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
+torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118
+torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+numpy==1.22.0;python_version<="3.10"
+numpy==1.24.3;python_version>"3.10"
+cython==0.29.30
+scipy>=1.11.2
+soundfile==0.12.*
+librosa==0.10.*
+scikit-learn==1.3.0
+numba==0.55.1;python_version<"3.9"
+numba==0.57.0;python_version>="3.9"
+inflect==5.6.*
+tqdm==4.64.*
+anyascii==0.3.*
+pyyaml==6.*
+fsspec==2023.6.0 # <= 2023.9.1 makes aux tests fail
+aiohttp==3.8.*
+packaging==23.1
+# deps for examples
+flask==2.*
+# deps for inference
+pysbd==0.3.4
+# deps for notebooks
+umap-learn==0.5.*
+pandas>=1.4,<2.0
+# deps for training
+matplotlib==3.7.*
+# coqui stack
+trainer
+# config management
+coqpit>=0.0.16
+# chinese g2p deps
+jieba
+pypinyin==0.47.1
+# gruut+supported langs
+gruut[de,es,fr]==2.2.3
+# deps for korean
+jamo
+nltk
+g2pkk>=0.1.1
+# deps for bangla
+bangla
+bnnumerizer
+bnunicodenormalizer
+#deps for tortoise
+k_diffusion
+einops==0.6.*
+transformers==4.33.*
+#deps for bark
+encodec==0.1.*
+# deps for XTTS
+unidecode==1.3.*
 langid
-deepspeed
-pydub
+# Install tts
+git+https://github.com/coqui-ai/tts.git@v0.17.7
+deepspeed==0.8.3
+pydub
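The duplicate `numpy` and `numba` pins added above are disambiguated by PEP 508 environment markers: pip evaluates the clause after `;` against the running interpreter and installs only the matching line. A sketch of that resolution using the `packaging` library pinned in this file:

```python
# How pip resolves the ';python_version...' markers above (PEP 508).
from packaging.markers import Marker

print(Marker('python_version >= "3.9"').evaluate())   # True on 3.9+  -> numba==0.57.0
print(Marker('python_version <= "3.10"').evaluate())  # True on 3.10  -> numpy==1.22.0
```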