Dionyssos committed on
Commit
059ae13
·
1 Parent(s): e16f567

Dropslists 2 tabs

Browse files
Files changed (1) hide show
  1. app.py +61 -84
app.py CHANGED
@@ -1191,6 +1191,7 @@ VOICES = ['jv_ID_google-gmu_04982.wav',
1191
  'jv_ID_google-gmu_07765.wav',
1192
  'en_US_vctk_p273.wav'
1193
  ]
 
1194
 
1195
  _tts = StyleTTS2().to('cpu')
1196
 
@@ -1321,7 +1322,7 @@ def only_greek_or_only_latin(text, lang='grc'):
1321
 
1322
 
1323
  def other_tts(text='Hallov worlds Far over the',
1324
- ref_s='af_ZA_google-nwu_0184.wav',
1325
  soundscape='birds fomig',
1326
  cache_lim=64):
1327
 
@@ -1336,7 +1337,7 @@ def other_tts(text='Hallov worlds Far over the',
1336
  text = only_greek_or_only_latin(text, lang='eng')
1337
 
1338
  speech_audio = _tts.inference(text,
1339
- ref_s='wav/' + ref_s)[0, 0, :].numpy() # 24 Khz
1340
 
1341
  if speech_audio.shape[0] > 10:
1342
 
@@ -1390,7 +1391,7 @@ def other_tts(text='Hallov worlds Far over the',
1390
  # If both inputs are empty, create a 2s silent audio file.
1391
  if final_audio is None:
1392
  final_audio = np.zeros(16000 * 2, dtype=np.float32)
1393
- print('\n=============F I N A L\n', final_audio.shape, final_audio.dtype, final_audio.min(), np.isnan(final_audio).sum())
1394
  wavfile = '_audionar_.wav'
1395
  audiofile.write(wavfile, final_audio, 16000)
1396
  return wavfile
@@ -1410,48 +1411,60 @@ description = (
1410
  "recognises the expression dimensions arousal, dominance, and valence. "
1411
  )
1412
 
1413
- css_buttons = """
1414
- .cool-button {
1415
- background-color: #1a2a40; /* Slightly lighter dark blue */
1416
- color: white;
1417
- padding: 15px 32px;
1418
- text-align: center;
1419
- font-size: 16px;
1420
- border-radius: 12px;
1421
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
1422
- transition: all 0.3s ease-in-out;
1423
- border: none;
1424
- cursor: pointer;
1425
- }
1426
- .cool-button:hover {
1427
- background-color: #1a2a40; /* Slightly lighter dark blue */
1428
- transform: scale(1.05);
1429
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
1430
- }
1431
- .cool-row {
1432
- margin-bottom: 10px;
1433
- }
1434
- """
1435
 
1436
- with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
1437
- with gr.Tab(label="other TTS"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1438
 
1439
- selected_voice = gr.State(value='wav/en_US_m-ailabs_mary_ann.wav')
1440
 
1441
- with gr.Row():
1442
- voice_info = gr.Markdown(f'Vox = `{selected_voice.value}`')
1443
 
1444
- # Main input and output components
 
1445
  with gr.Row():
1446
  text_input = gr.Textbox(
1447
- label="TYpe text for TTS:",
1448
- placeholder="Type your message here...",
1449
  lines=4,
1450
  value="Farover the misty mountains cold too dungeons deep and caverns old.",
1451
  )
1452
- soundscape_input = gr.Textbox(lines=1,
1453
- value="frogs",
1454
- label="AudioGen Txt"
 
 
 
 
 
 
 
 
 
1455
  )
1456
  kv_input = gr.Number(
1457
  label="kv Period",
@@ -1461,32 +1474,21 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
1461
 
1462
  output_audio = gr.Audio(label="TTS Output")
1463
 
1464
- with gr.Column():
1465
- voice_buttons = []
1466
- for i in range(0, len(VOICES), 7):
1467
- with gr.Row(elem_classes=["cool-row"]):
1468
- for voice_filename in VOICES[i:i+7]:
1469
-
1470
- button = gr.Button(voice_filename, elem_classes=["cool-button"])
1471
-
1472
- button.click(
1473
- fn=update_selected_voice,
1474
- inputs=[gr.Textbox(value=voice_filename, visible=False)],
1475
- outputs=[selected_voice]
1476
- )
1477
- button.click(
1478
- fn=lambda v=voice_filename: f'Vox = `{v}`',
1479
- inputs=None,
1480
- outputs=voice_info
1481
- )
1482
- voice_buttons.append(button)
1483
 
1484
  generate_button.click(
1485
- fn=other_tts,
1486
- inputs=[text_input, selected_voice, soundscape_input, kv_input],
1487
  outputs=output_audio
1488
  )
1489
-
1490
  with gr.Tab(label="Speech Analysis"):
1491
  with gr.Row():
1492
  with gr.Column():
@@ -1517,29 +1519,4 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
1517
  outputs = [output_age, output_gender, output_expression]
1518
  submit_btn.click(recognize, input, outputs)
1519
 
1520
-
1521
- with gr.Tab("audionar TTS"):
1522
- with gr.Row():
1523
- text_input = gr.Textbox(
1524
- lines=4,
1525
- value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
1526
- label="Type text for TTS"
1527
- )
1528
- lang_dropdown = gr.Dropdown(choices=language_names, label="TTS language", value="Ancient greek")
1529
- soundscape_input = gr.Textbox(lines=1, value="dogs barg", label="AudioGen Txt")
1530
- kv_input = gr.Number(label="kv Period", value=70)
1531
-
1532
- # Create a button to trigger the TTS function
1533
- tts_button = gr.Button("Generate Audio")
1534
-
1535
- # Create the output audio component
1536
- audio_output = gr.Audio(label="Generated Audio")
1537
-
1538
- # Link the button click event to the mms_tts function
1539
- tts_button.click(
1540
- fn=audionar_tts,
1541
- inputs=[text_input, lang_dropdown, soundscape_input, kv_input],
1542
- outputs=audio_output
1543
- )
1544
-
1545
- demo.launch(debug=True)
 
1191
  'jv_ID_google-gmu_07765.wav',
1192
  'en_US_vctk_p273.wav'
1193
  ]
1194
+ VOICES = [t[:-4] for t in VOICES] # crop .wav for visuals in gr.DropDown
1195
 
1196
  _tts = StyleTTS2().to('cpu')
1197
 
 
1322
 
1323
 
1324
  def other_tts(text='Hallov worlds Far over the',
1325
+ ref_s='wav/af_ZA_google-nwu_0184.wav',
1326
  soundscape='birds fomig',
1327
  cache_lim=64):
1328
 
 
1337
  text = only_greek_or_only_latin(text, lang='eng')
1338
 
1339
  speech_audio = _tts.inference(text,
1340
+ ref_s=ref_s)[0, 0, :].numpy() # 24 kHz
1341
 
1342
  if speech_audio.shape[0] > 10:
1343
 
 
1391
  # If both inputs are empty, create a 2s silent audio file.
1392
  if final_audio is None:
1393
  final_audio = np.zeros(16000 * 2, dtype=np.float32)
1394
+
1395
  wavfile = '_audionar_.wav'
1396
  audiofile.write(wavfile, final_audio, 16000)
1397
  return wavfile
 
1411
  "recognises the expression dimensions arousal, dominance, and valence. "
1412
  )
1413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1414
 
1415
def other_tts(text_input, selected_voice, soundscape_input, kv_input):
    """Placeholder backend for the 'other TTS' voices.

    Logs the request parameters to stdout and returns a dummy audio
    path; the real StyleTTS2 synthesis call is meant to replace this.
    """
    for line in (
        f"Generating TTS for voice: {selected_voice}",
        f"Text: {text_input}",
        f"Soundscape: {soundscape_input}",
        f"KV Period: {kv_input}",
    ):
        print(line)
    # Replace with your actual TTS generation code
    return "path/to/generated/audio.wav"
1425
+
1426
def audionar_tts(text_input, lang_dropdown, soundscape_input, kv_input):
    """Placeholder backend for the 'audionar TTS' languages.

    Logs the request parameters to stdout and returns a dummy audio
    path; the real language-based synthesis call is meant to replace
    this stub.
    """
    for line in (
        f"Generating TTS for language: {lang_dropdown}",
        f"Text: {text_input}",
        f"Soundscape: {soundscape_input}",
        f"KV Period: {kv_input}",
    ):
        print(line)
    # Replace with your actual TTS generation code
    return "path/to/generated/audio.wav"
1436
+
1437
def recognize(audio):
    """Placeholder speech-analysis backend.

    Logs the incoming audio reference and returns fixed
    (age, gender, expression) predictions; real analysis should
    replace this stub.
    """
    print("Analyzing audio from: %s" % (audio,))
    # Replace with your actual speech analysis code
    return ("30", "Male", "Happy")
1444
 
 
1445
 
 
 
1446
 
1447
+ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
1448
+ with gr.Tab(label="TTS Generation"):
1449
  with gr.Row():
1450
  text_input = gr.Textbox(
1451
+ label="Type text for TTS:",
1452
+ placeholder="Type Text for TTS",
1453
  lines=4,
1454
  value="Farover the misty mountains cold too dungeons deep and caverns old.",
1455
  )
1456
+ # Unified dropdown for both voices and languages
1457
+ # You'll need to handle the logic to determine if it's a voice or a language
1458
+ # based on the selection. A single list of choices is used here.
1459
+ choice_dropdown = gr.Dropdown(
1460
+ choices=language_names + VOICES,
1461
+ label="Select Voice or Language",
1462
+ value=VOICES[0] # Set a default value
1463
+ )
1464
+ soundscape_input = gr.Textbox(
1465
+ lines=1,
1466
+ value="frogs",
1467
+ label="AudioGen Txt"
1468
  )
1469
  kv_input = gr.Number(
1470
  label="kv Period",
 
1474
 
1475
  output_audio = gr.Audio(label="TTS Output")
1476
 
1477
def generate_audio_unified(text, choice, soundscape, kv):
    """Dispatch TTS generation based on the unified dropdown choice.

    Args:
        text: Text to synthesize.
        choice: The dropdown selection — either a voice name (member of
            VOICES) or a language name (member of language_names).
        soundscape: AudioGen soundscape prompt forwarded to the backend.
        kv: kv-period parameter forwarded to the backend.

    Returns:
        The audio file path produced by the selected backend.

    Raises:
        ValueError: If `choice` matches neither a voice nor a language.
            The original implementation fell through and returned None,
            which made the Gradio audio output fail silently.
    """
    if choice in VOICES:
        return other_tts(text, choice, soundscape, kv)
    if choice in language_names:
        return audionar_tts(text, choice, soundscape, kv)
    raise ValueError(f"Unknown voice/language choice: {choice!r}")
 
 
 
 
 
 
 
 
 
 
1486
 
1487
  generate_button.click(
1488
+ fn=generate_audio_unified,
1489
+ inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
1490
  outputs=output_audio
1491
  )
 
1492
  with gr.Tab(label="Speech Analysis"):
1493
  with gr.Row():
1494
  with gr.Column():
 
1519
  outputs = [output_age, output_gender, output_expression]
1520
  submit_btn.click(recognize, input, outputs)
1521
 
1522
+ demo.launch(debug=True)