Dionyssos committed on
Commit
6c9a684
·
1 Parent(s): d04a553
Files changed (1) hide show
  1. app.py +36 -136
app.py CHANGED
@@ -464,46 +464,45 @@ def audionar_tts(text=None,
464
  'romanian': 'ron',
465
  'serbian (approx.)': 'rmc-script_latin',
466
  }
467
- lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
468
-
469
- global cached_lang_code, cached_net_g, cached_tokenizer
470
-
471
- if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
472
- cached_lang_code = lang_code
473
- cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
474
- cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
475
-
476
- net_g = cached_net_g
477
- tokenizer = cached_tokenizer
478
-
479
- total_audio = []
480
 
481
- final_audio = None
482
- speech_audio = None
483
-
484
-
485
  if text and text.strip():
 
 
 
 
486
 
 
 
 
 
 
 
 
 
 
 
487
 
488
- text = only_greek_or_only_latin(text, lang=lang_code)
489
- text = transliterate_number(text, lang=lang_code)
490
- text = fix_vocals(text, lang=lang_code)
 
 
491
 
492
 
493
- sentences = textwrap.wrap(text, width=439)
494
 
495
- total_audio_parts = []
496
- for sentence in sentences:
497
- inputs = cached_tokenizer(sentence, return_tensors="pt")
498
- with torch.no_grad():
499
- audio_part = cached_net_g(
500
- input_ids=inputs.input_ids.to(device),
501
- attention_mask=inputs.attention_mask.to(device),
502
- lang_code=lang_code,
503
- )[0, :]
504
- total_audio_parts.append(audio_part)
505
 
506
- speech_audio = torch.cat(total_audio_parts).cpu().numpy()
507
 
508
  # AudioGen
509
  if soundscape and soundscape.strip():
@@ -1321,16 +1320,8 @@ def only_greek_or_only_latin(text, lang='grc'):
1321
  return ''.join(output_chars)
1322
 
1323
 
1324
- def other_tts(text='Hallov worlds Far over the',
1325
- ref_s='wav/af_ZA_google-nwu_0184.wav',
1326
- soundscape='birds fomig',
1327
- cache_lim=64):
1328
-
1329
- total_audio = []
1330
-
1331
- final_audio = None
1332
- speech_audio = None
1333
-
1334
 
1335
  if text and text.strip():
1336
 
@@ -1345,56 +1336,7 @@ def other_tts(text='Hallov worlds Far over the',
1345
  original_rate=24000,
1346
  target_rate=16000)[0, :] # 16 KHz
1347
 
1348
- # AudioGen
1349
- if soundscape and soundscape.strip():
1350
-
1351
-
1352
- speech_duration_secs = len(speech_audio) / 16000 if speech_audio is not None else 0
1353
- target_duration = max(speech_duration_secs + 0.74, 2.0)
1354
-
1355
-
1356
- background_audio = audiogen.generate(
1357
- soundscape,
1358
- duration=target_duration,
1359
- cache_lim=max(4, int(cache_lim)) # at least allow 10 A/R stEps
1360
- ).numpy()
1361
-
1362
- if speech_audio is not None:
1363
-
1364
- len_speech = len(speech_audio)
1365
- len_background = len(background_audio)
1366
-
1367
- if len_background > len_speech:
1368
- padding = np.zeros(len_background - len_speech,
1369
- dtype=np.float32)
1370
- speech_audio = np.concatenate([speech_audio, padding])
1371
- elif len_speech > len_background:
1372
- padding = np.zeros(len_speech - len_background,
1373
- dtype=np.float32)
1374
- background_audio = np.concatenate([background_audio, padding])
1375
-
1376
- # Convert to 2D arrays for stereo blending
1377
- speech_audio_stereo = speech_audio[None, :]
1378
- background_audio_stereo = background_audio[None, :]
1379
-
1380
-
1381
- final_audio = np.concatenate([
1382
- 0.49 * speech_audio_stereo + 0.51 * background_audio_stereo,
1383
- 0.51 * background_audio_stereo + 0.49 * speech_audio_stereo
1384
- ],0)
1385
- else:
1386
- final_audio = background_audio
1387
-
1388
- elif speech_audio is not None:
1389
- final_audio = speech_audio
1390
-
1391
- # If both inputs are empty, create a 2s silent audio file.
1392
- if final_audio is None:
1393
- final_audio = np.zeros(16000 * 2, dtype=np.float32)
1394
-
1395
- wavfile = '_audionar_.wav'
1396
- audiofile.write(wavfile, final_audio, 16000)
1397
- return wavfile
1398
 
1399
  def update_selected_voice(voice_filename):
1400
  return 'wav/' + voice_filename + '.wav'
@@ -1412,40 +1354,8 @@ description = (
1412
  )
1413
 
1414
 
1415
- def other_tts(text_input, selected_voice, soundscape_input, kv_input):
1416
- """
1417
- This function would handle the TTS generation for 'other TTS' voices.
1418
- """
1419
- print(f"Generating TTS for voice: {selected_voice}")
1420
- print(f"Text: {text_input}")
1421
- print(f"Soundscape: {soundscape_input}")
1422
- print(f"KV Period: {kv_input}")
1423
- # Replace with your actual TTS generation code
1424
- return "path/to/generated/audio.wav"
1425
-
1426
- def audionar_tts(text_input, lang_dropdown, soundscape_input, kv_input):
1427
- """
1428
- This function would handle the TTS generation for 'audionar TTS' languages.
1429
- """
1430
- print(f"Generating TTS for language: {lang_dropdown}")
1431
- print(f"Text: {text_input}")
1432
- print(f"Soundscape: {soundscape_input}")
1433
- print(f"KV Period: {kv_input}")
1434
- # Replace with your actual TTS generation code
1435
- return "path/to/generated/audio.wav"
1436
-
1437
- def recognize(audio):
1438
- """
1439
- This function handles speech analysis.
1440
- """
1441
- print(f"Analyzing audio from: {audio}")
1442
- # Replace with your actual speech analysis code
1443
- return "30", "Male", "Happy"
1444
-
1445
-
1446
-
1447
  with gr.Blocks(theme='huggingface') as demo:
1448
- with gr.Tab(label="TTS Generation"):
1449
  with gr.Row():
1450
  text_input = gr.Textbox(
1451
  label="Type text for TTS:",
@@ -1474,18 +1384,8 @@ with gr.Blocks(theme='huggingface') as demo:
1474
 
1475
  output_audio = gr.Audio(label="TTS Output")
1476
 
1477
- def generate_audio_unified(text, choice, soundscape, kv):
1478
- """
1479
- Unified function to call the correct TTS backend based on the dropdown choice.
1480
- """
1481
- # Logic to determine which function to call based on the choice
1482
- if choice in VOICES:
1483
- return other_tts(text, choice, soundscape, kv)
1484
- elif choice in language_names:
1485
- return audionar_tts(text, choice, soundscape, kv)
1486
-
1487
  generate_button.click(
1488
- fn=generate_audio_unified,
1489
  inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
1490
  outputs=output_audio
1491
  )
 
464
  'romanian': 'ron',
465
  'serbian (approx.)': 'rmc-script_latin',
466
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
467
 
 
 
 
 
468
  if text and text.strip():
469
+ if 'wav/' in lang:
470
+ # call StyleTTS2
471
+ speech_audio = _styletts2(text=text,
472
+ ref_s=lang)
473
 
474
+ else: # VITS
475
+
476
+ lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
477
+
478
+ global cached_lang_code, cached_net_g, cached_tokenizer
479
+
480
+ if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
481
+ cached_lang_code = lang_code
482
+ cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
483
+ cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
484
 
485
+ net_g = cached_net_g
486
+ tokenizer = cached_tokenizer
487
+ text = only_greek_or_only_latin(text, lang=lang_code)
488
+ text = transliterate_number(text, lang=lang_code)
489
+ text = fix_vocals(text, lang=lang_code)
490
 
491
 
492
+ sentences = textwrap.wrap(text, width=439)
493
 
494
+ total_audio_parts = []
495
+ for sentence in sentences:
496
+ inputs = cached_tokenizer(sentence, return_tensors="pt")
497
+ with torch.no_grad():
498
+ audio_part = cached_net_g(
499
+ input_ids=inputs.input_ids.to(device),
500
+ attention_mask=inputs.attention_mask.to(device),
501
+ lang_code=lang_code,
502
+ )[0, :]
503
+ total_audio_parts.append(audio_part)
504
 
505
+ speech_audio = torch.cat(total_audio_parts).cpu().numpy()
506
 
507
  # AudioGen
508
  if soundscape and soundscape.strip():
 
1320
  return ''.join(output_chars)
1321
 
1322
 
1323
+ def _styletts2(text='Hallov worlds Far over the',
1324
+ ref_s='wav/af_ZA_google-nwu_0184.wav'):
 
 
 
 
 
 
 
 
1325
 
1326
  if text and text.strip():
1327
 
 
1336
  original_rate=24000,
1337
  target_rate=16000)[0, :] # 16 KHz
1338
 
1339
+ return speech_audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1340
 
1341
  def update_selected_voice(voice_filename):
1342
  return 'wav/' + voice_filename + '.wav'
 
1354
  )
1355
 
1356
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1357
  with gr.Blocks(theme='huggingface') as demo:
1358
+ with gr.Tab(label="TTS"):
1359
  with gr.Row():
1360
  text_input = gr.Textbox(
1361
  label="Type text for TTS:",
 
1384
 
1385
  output_audio = gr.Audio(label="TTS Output")
1386
 
 
 
 
 
 
 
 
 
 
 
1387
  generate_button.click(
1388
+ fn=audionar_tts,
1389
  inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
1390
  outputs=output_audio
1391
  )