app.py
CHANGED
@@ -1191,6 +1191,7 @@ VOICES = ['jv_ID_google-gmu_04982.wav',
           'jv_ID_google-gmu_07765.wav',
           'en_US_vctk_p273.wav'
           ]
+VOICES = [t[:-4] for t in VOICES]  # crop .wav for visuals in gr.DropDown
 
 _tts = StyleTTS2().to('cpu')
 
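Editor's note: the added line strips the `.wav` suffix so the dropdown entries read cleanly, which means whatever consumes the selection later has to restore the suffix (and directory) before using it as a reference file. A minimal sketch of that round-trip, assuming the voices live in a local `wav/` folder; the helper name is hypothetical and not part of app.py:

```python
# Hypothetical helper, not in app.py: rebuild a reference-wav path from a
# cropped dropdown entry, assuming the files sit in a local "wav/" folder.
def voice_to_path(voice_name: str, root: str = "wav") -> str:
    return f"{root}/{voice_name}.wav"

# Example: voice_to_path("en_US_vctk_p273") -> "wav/en_US_vctk_p273.wav"
```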
@@ -1321,7 +1322,7 @@ def only_greek_or_only_latin(text, lang='grc'):
 
 
 def other_tts(text='Hallov worlds Far over the',
-              ref_s='af_ZA_google-nwu_0184.wav',
+              ref_s='wav/af_ZA_google-nwu_0184.wav',
               soundscape='birds fomig',
               cache_lim=64):
 
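The new default reference path is relative (`wav/...`), so it resolves against the working directory the Space is launched from. A small guard like the following (hypothetical, not in the diff) fails early with a clear message if that assumption breaks:

```python
from pathlib import Path

# Hypothetical guard, not in app.py: check the relative reference path before
# inference so a missing file fails loudly instead of deep inside StyleTTS2.
ref_s = 'wav/af_ZA_google-nwu_0184.wav'
if not Path(ref_s).is_file():
    raise FileNotFoundError(f"Reference voice not found: {ref_s}")
```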
@@ -1336,7 +1337,7 @@ def other_tts(text='Hallov worlds Far over the',
     text = only_greek_or_only_latin(text, lang='eng')
 
     speech_audio = _tts.inference(text,
-                                  ref_s=
+                                  ref_s=re_s)[0, 0, :].numpy()  # 24 Khz
 
     if speech_audio.shape[0] > 10:
 
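The comment tags the StyleTTS2 output as 24 kHz, while `other_tts` writes its final file at 16000 Hz (see the next hunk), so a resample presumably happens in the lines not shown here. A standalone sketch of a 24 kHz to 16 kHz conversion with `scipy.signal.resample_poly` (an assumption for illustration, not code from app.py):

```python
import numpy as np
from scipy.signal import resample_poly

# Assumed intermediate step, not visible in the diff: 24 kHz -> 16 kHz,
# since 24000 * 2 / 3 = 16000.
speech_24k = np.zeros(24000, dtype=np.float32)  # stand-in for speech_audio
speech_16k = resample_poly(speech_24k, up=2, down=3)
```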
@@ -1390,7 +1391,7 @@ def other_tts(text='Hallov worlds Far over the',
     # If both inputs are empty, create a 2s silent audio file.
     if final_audio is None:
         final_audio = np.zeros(16000 * 2, dtype=np.float32)
-
+
     wavfile = '_audionar_.wav'
     audiofile.write(wavfile, final_audio, 16000)
     return wavfile
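For reference, the fallback branch writes exactly two seconds of silence at 16 kHz to `_audionar_.wav`. The same behaviour as a self-contained snippet using `soundfile` (app.py itself uses the `audiofile` package for the write):

```python
import numpy as np
import soundfile as sf

# Fallback when neither TTS text nor a soundscape prompt produced audio:
# 2 seconds of silence at 16 kHz.
final_audio = np.zeros(16000 * 2, dtype=np.float32)
sf.write('_audionar_.wav', final_audio, 16000)
```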
@@ -1410,48 +1411,60 @@ description = (
     "recognises the expression dimensions arousal, dominance, and valence. "
 )
 
-css_buttons = """
-.cool-button {
-    background-color: #1a2a40; /* Slightly lighter dark blue */
-    color: white;
-    padding: 15px 32px;
-    text-align: center;
-    font-size: 16px;
-    border-radius: 12px;
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
-    transition: all 0.3s ease-in-out;
-    border: none;
-    cursor: pointer;
-}
-.cool-button:hover {
-    background-color: #1a2a40; /* Slightly lighter dark blue */
-    transform: scale(1.05);
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
-}
-.cool-row {
-    margin-bottom: 10px;
-}
-"""
 
-
-
+def other_tts(text_input, selected_voice, soundscape_input, kv_input):
+    """
+    This function would handle the TTS generation for 'other TTS' voices.
+    """
+    print(f"Generating TTS for voice: {selected_voice}")
+    print(f"Text: {text_input}")
+    print(f"Soundscape: {soundscape_input}")
+    print(f"KV Period: {kv_input}")
+    # Replace with your actual TTS generation code
+    return "path/to/generated/audio.wav"
+
+def audionar_tts(text_input, lang_dropdown, soundscape_input, kv_input):
+    """
+    This function would handle the TTS generation for 'audionar TTS' languages.
+    """
+    print(f"Generating TTS for language: {lang_dropdown}")
+    print(f"Text: {text_input}")
+    print(f"Soundscape: {soundscape_input}")
+    print(f"KV Period: {kv_input}")
+    # Replace with your actual TTS generation code
+    return "path/to/generated/audio.wav"
+
+def recognize(audio):
+    """
+    This function handles speech analysis.
+    """
+    print(f"Analyzing audio from: {audio}")
+    # Replace with your actual speech analysis code
+    return "30", "Male", "Happy"
 
-    selected_voice = gr.State(value='wav/en_US_m-ailabs_mary_ann.wav')
 
-    with gr.Row():
-        voice_info = gr.Markdown(f'Vox = `{selected_voice.value}`')
 
-
+with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
+    with gr.Tab(label="TTS Generation"):
         with gr.Row():
             text_input = gr.Textbox(
-                label="
-                placeholder="Type
+                label="Type text for TTS:",
+                placeholder="Type Text for TTS",
                 lines=4,
                 value="Farover the misty mountains cold too dungeons deep and caverns old.",
             )
-
-
-
+            # Unified dropdown for both voices and languages
+            # You'll need to handle the logic to determine if it's a voice or a language
+            # based on the selection. A single list of choices is used here.
+            choice_dropdown = gr.Dropdown(
+                choices=language_names + VOICES,
+                label="Select Voice or Language",
+                value=VOICES[0]  # Set a default value
+            )
+            soundscape_input = gr.Textbox(
+                lines=1,
+                value="frogs",
+                label="AudioGen Txt"
+            )
             kv_input = gr.Number(
                 label="kv Period",
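Editor's note: the unified dropdown concatenates `language_names` and the cropped `VOICES`, and the dispatcher added in the next hunk routes by membership in `VOICES` first, so any entry present in both lists would silently be treated as a voice. Also, `css_buttons` is deleted above yet still passed to `gr.Blocks(css=css_buttons)`, so it presumably still needs to be defined somewhere else in the file. A small sanity check on the choice lists (a sketch that assumes the `VOICES` and `language_names` lists from app.py, not code taken from the diff):

```python
# Sketch only: ensure dropdown entries are unambiguous, since the dispatcher
# checks membership in VOICES before falling back to language_names.
overlap = set(VOICES) & set(language_names)
assert not overlap, f"Ambiguous dropdown entries: {sorted(overlap)}"
```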
@@ -1461,32 +1474,21 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
 
         output_audio = gr.Audio(label="TTS Output")
 
-
-
-
-
-
-
-
-
-
-            fn=update_selected_voice,
-            inputs=[gr.Textbox(value=voice_filename, visible=False)],
-            outputs=[selected_voice]
-        )
-        button.click(
-            fn=lambda v=voice_filename: f'Vox = `{v}`',
-            inputs=None,
-            outputs=voice_info
-        )
-        voice_buttons.append(button)
+        def generate_audio_unified(text, choice, soundscape, kv):
+            """
+            Unified function to call the correct TTS backend based on the dropdown choice.
+            """
+            # Logic to determine which function to call based on the choice
+            if choice in VOICES:
+                return other_tts(text, choice, soundscape, kv)
+            elif choice in language_names:
+                return audionar_tts(text, choice, soundscape, kv)
 
         generate_button.click(
-            fn=
-            inputs=[text_input,
+            fn=generate_audio_unified,
+            inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
             outputs=output_audio
         )
-
     with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
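Two things worth flagging here. First, the stub `other_tts(text_input, selected_voice, soundscape_input, kv_input)` added in the previous hunk comes after the real `other_tts` defined around line 1324, so as written it appears to shadow it and the dispatcher would hit the placeholder. Second, `generate_audio_unified` has no `else` branch, so a selection matching neither list returns `None` and surfaces as an opaque error in the audio component. A hedged variant of the dispatcher with an explicit error (a sketch reusing the app.py names, not the committed code):

```python
import gradio as gr

# Sketch only: same routing as generate_audio_unified, plus an explicit
# error for selections that match neither list.
def generate_audio_unified_strict(text, choice, soundscape, kv):
    if choice in VOICES:
        return other_tts(text, choice, soundscape, kv)
    if choice in language_names:
        return audionar_tts(text, choice, soundscape, kv)
    raise gr.Error(f"Unknown voice/language selection: {choice}")
```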
@@ -1517,29 +1519,4 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
             outputs = [output_age, output_gender, output_expression]
             submit_btn.click(recognize, input, outputs)
 
-
-    with gr.Tab("audionar TTS"):
-        with gr.Row():
-            text_input = gr.Textbox(
-                lines=4,
-                value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
-                label="Type text for TTS"
-            )
-            lang_dropdown = gr.Dropdown(choices=language_names, label="TTS language", value="Ancient greek")
-            soundscape_input = gr.Textbox(lines=1, value="dogs barg", label="AudioGen Txt")
-            kv_input = gr.Number(label="kv Period", value=70)
-
-            # Create a button to trigger the TTS function
-            tts_button = gr.Button("Generate Audio")
-
-            # Create the output audio component
-            audio_output = gr.Audio(label="Generated Audio")
-
-            # Link the button click event to the mms_tts function
-            tts_button.click(
-                fn=audionar_tts,
-                inputs=[text_input, lang_dropdown, soundscape_input, kv_input],
-                outputs=audio_output
-            )
-
-demo.launch(debug=True)
+demo.launch(debug=True)