gorkemgoknar committed on
Commit
e5753d7
·
1 Parent(s): d559922

use v1.1 model for now

Browse files
Files changed (1) hide show
  1. app.py +17 -30
app.py CHANGED
@@ -44,10 +44,10 @@ st = os.stat("ffmpeg")
44
  os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
45
 
46
  # This will trigger downloading model
47
- print("Downloading if not downloaded Coqui XTTS V2")
48
  from TTS.utils.manage import ModelManager
49
 
50
- model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
51
  ModelManager().download_model(model_name)
52
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
53
  print("XTTS downloaded")
@@ -55,6 +55,10 @@ print("XTTS downloaded")
55
  config = XttsConfig()
56
  config.load_json(os.path.join(model_path, "config.json"))
57
 
 
 
 
 
58
  model = Xtts.init_from_config(config)
59
  model.load_checkpoint(
60
  config,
@@ -70,8 +74,11 @@ DEVICE_ASSERT_DETECTED = 0
70
  DEVICE_ASSERT_PROMPT = None
71
  DEVICE_ASSERT_LANG = None
72
 
 
 
73
  supported_languages = config.languages
74
 
 
75
  def predict(
76
  prompt,
77
  language,
@@ -247,7 +254,8 @@ def predict(
247
  language,
248
  gpt_cond_latent,
249
  speaker_embedding,
250
- diffusion_conditioning
 
251
  )
252
  inference_time = time.time() - t0
253
  print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
@@ -264,7 +272,8 @@ def predict(
264
  prompt,
265
  language,
266
  gpt_cond_latent,
267
- speaker_embedding
 
268
  )
269
 
270
  first_chunk = True
@@ -394,7 +403,7 @@ description = """
394
  <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
395
  </div>
396
 
397
- <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
398
  <br/>
399
  XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
400
  <br/>
@@ -406,8 +415,9 @@ Leave a star on the Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>
406
  <br/>
407
  </p>
408
  <p>Language Selectors:
409
- Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
410
- Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu <br/>
 
411
  </p>
412
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
413
  """
@@ -549,26 +559,6 @@ examples = [
549
  False,
550
  True,
551
  ],
552
- [
553
- "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
554
- "ko",
555
- "examples/female.wav",
556
- None,
557
- False,
558
- True,
559
- False,
560
- True,
561
- ],
562
- [
563
- "Egyszer hat éves koromban láttam egy csodálatos képet",
564
- "hu",
565
- "examples/male.wav",
566
- None,
567
- False,
568
- True,
569
- False,
570
- True,
571
- ],
572
  ]
573
 
574
 
@@ -598,8 +588,6 @@ gr.Interface(
598
  "ar",
599
  "zh-cn",
600
  "ja",
601
- "ko",
602
- "hu"
603
  ],
604
  max_choices=1,
605
  value="en",
@@ -648,4 +636,3 @@ gr.Interface(
648
  article=article,
649
  examples=examples,
650
  ).queue().launch(debug=True, show_api=True)
651
-
 
44
  os.chmod("ffmpeg", st.st_mode | stat.S_IEXEC)
45
 
46
  # This will trigger downloading model
47
+ print("Downloading if not downloaded Coqui XTTS V1.1")
48
  from TTS.utils.manage import ModelManager
49
 
50
+ model_name = "tts_models/multilingual/multi-dataset/xtts_v1.1"
51
  ModelManager().download_model(model_name)
52
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
53
  print("XTTS downloaded")
 
55
  config = XttsConfig()
56
  config.load_json(os.path.join(model_path, "config.json"))
57
 
58
+ # "ja" should already be listed in config.languages; add it defensively just to be sure
59
+ if "ja" not in config.languages:
60
+ config.languages.append("ja")
61
+
62
  model = Xtts.init_from_config(config)
63
  model.load_checkpoint(
64
  config,
 
74
  DEVICE_ASSERT_PROMPT = None
75
  DEVICE_ASSERT_LANG = None
76
 
77
+
78
+ # supported_languages=["en","es","fr","de","it","pt","pl","tr","ru","nl","cs","ar","zh-cn"]
79
  supported_languages = config.languages
80
 
81
+
82
  def predict(
83
  prompt,
84
  language,
 
254
  language,
255
  gpt_cond_latent,
256
  speaker_embedding,
257
+ diffusion_conditioning,
258
+ decoder="ne_hifigan",
259
  )
260
  inference_time = time.time() - t0
261
  print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
 
272
  prompt,
273
  language,
274
  gpt_cond_latent,
275
+ speaker_embedding,
276
+ decoder="ne_hifigan",
277
  )
278
 
279
  first_chunk = True
 
403
  <img style="margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>
404
  </div>
405
 
406
+ <a href="https://huggingface.co/coqui/XTTS-v1">XTTS</a> is a Voice generation model that lets you clone voices into different languages by using just a quick 6-second audio clip.
407
  <br/>
408
  XTTS is built on previous research, like Tortoise, with additional architectural innovations and training to make cross-language voice cloning and multilingual speech generation possible.
409
  <br/>
 
415
  <br/>
416
  </p>
417
  <p>Language Selectors:
418
+ Arabic: ar, Brazilian Portuguese: pt , Chinese: zh-cn, Czech: cs,<br/>
419
+ Dutch: nl, English: en, French: fr, Italian: it, Polish: pl,<br/>
420
+ Russian: ru, Spanish: es, Turkish: tr, Japanese: ja <br/>
421
  </p>
422
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
423
  """
 
559
  False,
560
  True,
561
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
562
  ]
563
 
564
 
 
588
  "ar",
589
  "zh-cn",
590
  "ja",
 
 
591
  ],
592
  max_choices=1,
593
  value="en",
 
636
  article=article,
637
  examples=examples,
638
  ).queue().launch(debug=True, show_api=True)