app.py
CHANGED
@@ -464,46 +464,45 @@ def audionar_tts(text=None,
         'romanian': 'ron',
         'serbian (approx.)': 'rmc-script_latin',
     }
-    lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
-
-    global cached_lang_code, cached_net_g, cached_tokenizer
-
-    if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
-        cached_lang_code = lang_code
-        cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
-        cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
-
-    net_g = cached_net_g
-    tokenizer = cached_tokenizer
-
-    total_audio = []

-    final_audio = None
-    speech_audio = None
-
-
     if text and text.strip():
+        if 'wav/' in lang:
+            # call StyleTTS2
+            speech_audio = _styletts2(text=text,
+                                      ref_s=lang)

+        else:  # VITS
+
+            lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
+
+            global cached_lang_code, cached_net_g, cached_tokenizer
+
+            if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
+                cached_lang_code = lang_code
+                cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
+                cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')

-
-
-
+            net_g = cached_net_g
+            tokenizer = cached_tokenizer
+            text = only_greek_or_only_latin(text, lang=lang_code)
+            text = transliterate_number(text, lang=lang_code)
+            text = fix_vocals(text, lang=lang_code)


-
+            sentences = textwrap.wrap(text, width=439)

-
-
-
-
-
-
-
-
-
-
+            total_audio_parts = []
+            for sentence in sentences:
+                inputs = cached_tokenizer(sentence, return_tensors="pt")
+                with torch.no_grad():
+                    audio_part = cached_net_g(
+                        input_ids=inputs.input_ids.to(device),
+                        attention_mask=inputs.attention_mask.to(device),
+                        lang_code=lang_code,
+                    )[0, :]
+                total_audio_parts.append(audio_part)

-
+            speech_audio = torch.cat(total_audio_parts).cpu().numpy()

     # AudioGen
     if soundscape and soundscape.strip():
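Note on the hunk above: the new VITS branch keeps one facebook/mms-tts-{lang_code} model and tokenizer in module-level globals so repeated calls for the same language skip reloading, and it wraps long text into ~439-character chunks before synthesis. The sketch below shows that cache-plus-chunking pattern in isolation, using the stock transformers VITS interface; load_mms_tts, synthesize and _vits_cache are illustrative names, not part of app.py, and the app's own forward call additionally passes a lang_code argument that the stock VitsModel does not take.

import textwrap
import torch
from transformers import VitsModel, VitsTokenizer

_vits_cache = {}  # lang_code -> (model, tokenizer), reused across calls

def load_mms_tts(lang_code, device='cpu'):
    # Load facebook/mms-tts-<lang_code> once and keep it for later calls.
    if lang_code not in _vits_cache:
        model = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
        tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
        _vits_cache[lang_code] = (model, tokenizer)
    return _vits_cache[lang_code]

def synthesize(text, lang_code='ron', device='cpu'):
    model, tokenizer = load_mms_tts(lang_code, device)
    parts = []
    # Mirror the hunk's chunking: split long input into ~439-character pieces.
    for chunk in textwrap.wrap(text, width=439):
        inputs = tokenizer(chunk, return_tensors='pt').to(device)
        with torch.no_grad():
            parts.append(model(**inputs).waveform[0])
    return torch.cat(parts).cpu().numpy() if parts else None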
@@ -1321,16 +1320,8 @@ def only_greek_or_only_latin(text, lang='grc'):
     return ''.join(output_chars)


-def other_tts(text='Hallov worlds Far over the',
-              ref_s='wav/af_ZA_google-nwu_0184.wav',
-              soundscape='birds fomig',
-              cache_lim=64):
-
-    total_audio = []
-
-    final_audio = None
-    speech_audio = None
-
+def _styletts2(text='Hallov worlds Far over the',
+               ref_s='wav/af_ZA_google-nwu_0184.wav'):

     if text and text.strip():

@@ -1345,56 +1336,7 @@ def other_tts(text='Hallov worlds Far over the',
                                     original_rate=24000,
                                     target_rate=16000)[0, :]  # 16 KHz

-
-    if soundscape and soundscape.strip():
-
-
-        speech_duration_secs = len(speech_audio) / 16000 if speech_audio is not None else 0
-        target_duration = max(speech_duration_secs + 0.74, 2.0)
-
-
-        background_audio = audiogen.generate(
-            soundscape,
-            duration=target_duration,
-            cache_lim=max(4, int(cache_lim))  # at least allow 10 A/R steps
-        ).numpy()
-
-        if speech_audio is not None:
-
-            len_speech = len(speech_audio)
-            len_background = len(background_audio)
-
-            if len_background > len_speech:
-                padding = np.zeros(len_background - len_speech,
-                                   dtype=np.float32)
-                speech_audio = np.concatenate([speech_audio, padding])
-            elif len_speech > len_background:
-                padding = np.zeros(len_speech - len_background,
-                                   dtype=np.float32)
-                background_audio = np.concatenate([background_audio, padding])
-
-            # Convert to 2D arrays for stereo blending
-            speech_audio_stereo = speech_audio[None, :]
-            background_audio_stereo = background_audio[None, :]
-
-
-            final_audio = np.concatenate([
-                0.49 * speech_audio_stereo + 0.51 * background_audio_stereo,
-                0.51 * background_audio_stereo + 0.49 * speech_audio_stereo
-            ], 0)
-        else:
-            final_audio = background_audio
-
-    elif speech_audio is not None:
-        final_audio = speech_audio
-
-    # If both inputs are empty, create a 2s silent audio file.
-    if final_audio is None:
-        final_audio = np.zeros(16000 * 2, dtype=np.float32)
-
-    wavfile = '_audionar_.wav'
-    audiofile.write(wavfile, final_audio, 16000)
-    return wavfile
+    return speech_audio

 def update_selected_voice(voice_filename):
     return 'wav/' + voice_filename + '.wav'
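Note on the removed block above: other_tts used to zero-pad the shorter of the speech and AudioGen background signals so both match in length, then stack two 0.49/0.51 weighted mixes as a pseudo-stereo pair (as written, both rows evaluate to the same mix). A standalone numpy sketch of that step, with illustrative names, assuming both inputs are mono float arrays at 16 kHz:

import numpy as np

def blend_speech_and_background(speech, background):
    # Zero-pad the shorter signal so both have the same number of samples.
    n = max(len(speech), len(background))
    speech = np.pad(speech.astype(np.float32), (0, n - len(speech)))
    background = np.pad(background.astype(np.float32), (0, n - len(background)))
    # Two weighted mixes stacked as channels, matching the removed 0.49/0.51 blend.
    left = 0.49 * speech + 0.51 * background
    right = 0.51 * background + 0.49 * speech
    return np.stack([left, right], axis=0)  # shape (2, n)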
@@ -1412,40 +1354,8 @@ description = (
 )


-def other_tts(text_input, selected_voice, soundscape_input, kv_input):
-    """
-    This function would handle the TTS generation for 'other TTS' voices.
-    """
-    print(f"Generating TTS for voice: {selected_voice}")
-    print(f"Text: {text_input}")
-    print(f"Soundscape: {soundscape_input}")
-    print(f"KV Period: {kv_input}")
-    # Replace with your actual TTS generation code
-    return "path/to/generated/audio.wav"
-
-def audionar_tts(text_input, lang_dropdown, soundscape_input, kv_input):
-    """
-    This function would handle the TTS generation for 'audionar TTS' languages.
-    """
-    print(f"Generating TTS for language: {lang_dropdown}")
-    print(f"Text: {text_input}")
-    print(f"Soundscape: {soundscape_input}")
-    print(f"KV Period: {kv_input}")
-    # Replace with your actual TTS generation code
-    return "path/to/generated/audio.wav"
-
-def recognize(audio):
-    """
-    This function handles speech analysis.
-    """
-    print(f"Analyzing audio from: {audio}")
-    # Replace with your actual speech analysis code
-    return "30", "Male", "Happy"
-
-
-
 with gr.Blocks(theme='huggingface') as demo:
-    with gr.Tab(label="TTS
+    with gr.Tab(label="TTS"):
         with gr.Row():
             text_input = gr.Textbox(
                 label="Type text for TTS:",
@@ -1474,18 +1384,8 @@ with gr.Blocks(theme='huggingface') as demo:

         output_audio = gr.Audio(label="TTS Output")

-        def generate_audio_unified(text, choice, soundscape, kv):
-            """
-            Unified function to call the correct TTS backend based on the dropdown choice.
-            """
-            # Logic to determine which function to call based on the choice
-            if choice in VOICES:
-                return other_tts(text, choice, soundscape, kv)
-            elif choice in language_names:
-                return audionar_tts(text, choice, soundscape, kv)
-
         generate_button.click(
-            fn=generate_audio_unified,
+            fn=audionar_tts,
             inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
             outputs=output_audio
         )
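Note on the UI hunks: with the placeholder backends deleted, generate_button.click is pointed straight at audionar_tts. A minimal, self-contained Gradio Blocks sketch in that spirit follows; the components and dropdown choices are illustrative stand-ins, and the placeholder audionar_tts here returns no audio.

import gradio as gr

def audionar_tts(text, choice, soundscape, kv):
    # Placeholder standing in for the real synthesis function.
    return None

with gr.Blocks(theme='huggingface') as demo:
    with gr.Tab(label="TTS"):
        with gr.Row():
            text_input = gr.Textbox(label="Type text for TTS:")
            choice_dropdown = gr.Dropdown(choices=["romanian", "serbian (approx.)"],
                                          label="Voice / language")
        soundscape_input = gr.Textbox(label="Soundscape")
        kv_input = gr.Number(label="KV period", value=64)
        generate_button = gr.Button("Generate")
        output_audio = gr.Audio(label="TTS Output")
        generate_button.click(
            fn=audionar_tts,
            inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
            outputs=output_audio,
        )

demo.launch()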