gorkemgoknar committed on
Commit
96324d6
·
1 Parent(s): e5753d7

xtts v2 with silence fix

Browse files
Files changed (1) hide show
  1. app.py +30 -16
app.py CHANGED
@@ -44,10 +44,10 @@ st = os.stat("ffmpeg")
44
  os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
45
 
46
  # This will trigger downloading model
47
- print("Downloading if not downloaded Coqui XTTS V1.1")
48
  from TTS.utils.manage import ModelManager
49
 
50
- model_name = "tts_models/multilingual/multi-dataset/xtts_v1.1"
51
  ModelManager().download_model(model_name)
52
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
53
  print("XTTS downloaded")
@@ -55,10 +55,6 @@ print("XTTS downloaded")
55
  config = XttsConfig()
56
  config.load_json(os.path.join(model_path, "config.json"))
57
 
58
- # it should be there just to be sure
59
- if "ja" not in config.languages:
60
- config.languages.append("ja")
61
-
62
  model = Xtts.init_from_config(config)
63
  model.load_checkpoint(
64
  config,
@@ -74,11 +70,8 @@ DEVICE_ASSERT_DETECTED = 0
74
  DEVICE_ASSERT_PROMPT = None
75
  DEVICE_ASSERT_LANG = None
76
 
77
-
78
- # supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
79
  supported_languages = config.languages
80
 
81
-
82
  def predict(
83
  prompt,
84
  language,
@@ -254,8 +247,7 @@ def predict(
254
  language,
255
  gpt_cond_latent,
256
  speaker_embedding,
257
- diffusion_conditioning,
258
- decoder="ne_hifigan",
259
  )
260
  inference_time = time.time() - t0
261
  print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
@@ -273,7 +265,8 @@ def predict(
273
  language,
274
  gpt_cond_latent,
275
  speaker_embedding,
276
- decoder="ne_hifigan",
 
277
  )
278
 
279
  first_chunk = True
@@ -403,7 +396,7 @@ description = """
403
  <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
404
  </div>
405
 
406
- <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
407
  <br/>
408
  XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
409
  <br/>
@@ -415,9 +408,8 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>
415
  <br/>
416
  </p>
417
  <p>Language Selectors:
418
- Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
419
- Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
420
- Russian: ru, Spanish: es, Turkish: tr, Japanese: ja <br/>
421
  </p>
422
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
423
  """
@@ -559,6 +551,26 @@ examples = [
559
  False,
560
  True,
561
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  ]
563
 
564
 
@@ -588,6 +600,8 @@ gr.Interface(
588
  "ar",
589
  "zh-cn",
590
  "ja",
 
 
591
  ],
592
  max_choices=1,
593
  value="en",
 
44
  os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
45
 
46
  # This will trigger downloading model
47
+ print("Downloading if not downloaded Coqui XTTS V2")
48
  from TTS.utils.manage import ModelManager
49
 
50
+ model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
51
  ModelManager().download_model(model_name)
52
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
53
  print("XTTS downloaded")
 
55
  config = XttsConfig()
56
  config.load_json(os.path.join(model_path, "config.json"))
57
 
 
 
 
 
58
  model = Xtts.init_from_config(config)
59
  model.load_checkpoint(
60
  config,
 
70
  DEVICE_ASSERT_PROMPT = None
71
  DEVICE_ASSERT_LANG = None
72
 
 
 
73
  supported_languages = config.languages
74
 
 
75
  def predict(
76
  prompt,
77
  language,
 
247
  language,
248
  gpt_cond_latent,
249
  speaker_embedding,
250
+ diffusion_conditioning
 
251
  )
252
  inference_time = time.time() - t0
253
  print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
 
265
  language,
266
  gpt_cond_latent,
267
  speaker_embedding,
268
+ repetition_penalty=5.0,
269
+ temperature=0.75,
270
  )
271
 
272
  first_chunk = True
 
396
  <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
397
  </div>
398
 
399
+ <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
400
  <br/>
401
  XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
402
  <br/>
 
408
  <br/>
409
  </p>
410
  <p>Language Selectors:
411
+ Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
412
+ Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu <br/>
 
413
  </p>
414
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
415
  """
 
551
  False,
552
  True,
553
  ],
554
+ [
555
+ "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
556
+ "ko",
557
+ "examples/female.wav",
558
+ None,
559
+ False,
560
+ True,
561
+ False,
562
+ True,
563
+ ],
564
+ [
565
+ "Egyszer hat éves koromban láttam egy csodálatos képet",
566
+ "hu",
567
+ "examples/male.wav",
568
+ None,
569
+ False,
570
+ True,
571
+ False,
572
+ True,
573
+ ],
574
  ]
575
 
576
 
 
600
  "ar",
601
  "zh-cn",
602
  "ja",
603
+ "ko",
604
+ "hu"
605
  ],
606
  max_choices=1,
607
  value="en",