app.py
CHANGED
@@ -1191,6 +1191,7 @@ VOICES = ['jv_ID_google-gmu_04982.wav',
           'jv_ID_google-gmu_07765.wav',
           'en_US_vctk_p273.wav'
           ]
+VOICES = [t[:-4] for t in VOICES]  # crop .wav for visuals in gr.DropDown
 
 _tts = StyleTTS2().to('cpu')
 
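Editor's note: the added line strips the `.wav` suffix so the dropdown entries read cleanly, which means whatever consumes the selection later has to restore the suffix (and directory) before using it as a reference file. A minimal sketch of that round-trip, assuming the voices live in a local `wav/` folder; the helper name is hypothetical and not part of app.py:

```python
# Hypothetical helper, not in app.py: rebuild a reference-wav path from a
# cropped dropdown entry, assuming the files sit in a local "wav/" folder.
def voice_to_path(voice_name: str, root: str = "wav") -> str:
    return f"{root}/{voice_name}.wav"

# Example: voice_to_path("en_US_vctk_p273") -> "wav/en_US_vctk_p273.wav"
```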
@@ -1321,7 +1322,7 @@ def only_greek_or_only_latin(text, lang='grc'):
 
 
 def other_tts(text='Hallov worlds Far over the',
-              ref_s='af_ZA_google-nwu_0184.wav',
+              ref_s='wav/af_ZA_google-nwu_0184.wav',
               soundscape='birds fomig',
               cache_lim=64):
 
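The new default reference path is relative (`wav/...`), so it resolves against the working directory the Space is launched from. A small guard like the following (hypothetical, not in the diff) fails early with a clear message if that assumption breaks:

```python
from pathlib import Path

# Hypothetical guard, not in app.py: check the relative reference path before
# inference so a missing file fails loudly instead of deep inside StyleTTS2.
ref_s = 'wav/af_ZA_google-nwu_0184.wav'
if not Path(ref_s).is_file():
    raise FileNotFoundError(f"Reference voice not found: {ref_s}")
```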
@@ -1336,7 +1337,7 @@ def other_tts(text='Hallov worlds Far over the',
     text = only_greek_or_only_latin(text, lang='eng')
 
     speech_audio = _tts.inference(text,
-                                  ref_s=
+                                  ref_s=re_s)[0, 0, :].numpy()  # 24 Khz
 
     if speech_audio.shape[0] > 10:
 
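The comment tags the StyleTTS2 output as 24 kHz, while `other_tts` writes its final file at 16000 Hz (see the next hunk), so a resample presumably happens in the lines not shown here. A standalone sketch of a 24 kHz to 16 kHz conversion with `scipy.signal.resample_poly` (an assumption for illustration, not code from app.py):

```python
import numpy as np
from scipy.signal import resample_poly

# Assumed intermediate step, not visible in the diff: 24 kHz -> 16 kHz,
# since 24000 * 2 / 3 = 16000.
speech_24k = np.zeros(24000, dtype=np.float32)  # stand-in for speech_audio
speech_16k = resample_poly(speech_24k, up=2, down=3)
```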
@@ -1390,7 +1391,7 @@ def other_tts(text='Hallov worlds Far over the',
     # If both inputs are empty, create a 2s silent audio file.
     if final_audio is None:
         final_audio = np.zeros(16000 * 2, dtype=np.float32)
-
+
     wavfile = '_audionar_.wav'
     audiofile.write(wavfile, final_audio, 16000)
     return wavfile
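For reference, the fallback branch writes exactly two seconds of silence at 16 kHz to `_audionar_.wav`. The same behaviour as a self-contained snippet using `soundfile` (app.py itself uses the `audiofile` package for the write):

```python
import numpy as np
import soundfile as sf

# Fallback when neither TTS text nor a soundscape prompt produced audio:
# 2 seconds of silence at 16 kHz.
final_audio = np.zeros(16000 * 2, dtype=np.float32)
sf.write('_audionar_.wav', final_audio, 16000)
```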
@@ -1410,48 +1411,60 @@ description = (
     "recognises the expression dimensions arousal, dominance, and valence. "
 )
 
-css_buttons = """
-.cool-button {
-    background-color: #1a2a40; /* Slightly lighter dark blue */
-    color: white;
-    padding: 15px 32px;
-    text-align: center;
-    font-size: 16px;
-    border-radius: 12px;
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
-    transition: all 0.3s ease-in-out;
-    border: none;
-    cursor: pointer;
-}
-.cool-button:hover {
-    background-color: #1a2a40; /* Slightly lighter dark blue */
-    transform: scale(1.05);
-    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.4);
-}
-.cool-row {
-    margin-bottom: 10px;
-}
-"""
 
-
-
+def other_tts(text_input, selected_voice, soundscape_input, kv_input):
+    """
+    This function would handle the TTS generation for 'other TTS' voices.
+    """
+    print(f"Generating TTS for voice: {selected_voice}")
+    print(f"Text: {text_input}")
+    print(f"Soundscape: {soundscape_input}")
+    print(f"KV Period: {kv_input}")
+    # Replace with your actual TTS generation code
+    return "path/to/generated/audio.wav"
+
+def audionar_tts(text_input, lang_dropdown, soundscape_input, kv_input):
+    """
+    This function would handle the TTS generation for 'audionar TTS' languages.
+    """
+    print(f"Generating TTS for language: {lang_dropdown}")
+    print(f"Text: {text_input}")
+    print(f"Soundscape: {soundscape_input}")
+    print(f"KV Period: {kv_input}")
+    # Replace with your actual TTS generation code
+    return "path/to/generated/audio.wav"
+
+def recognize(audio):
+    """
+    This function handles speech analysis.
+    """
+    print(f"Analyzing audio from: {audio}")
+    # Replace with your actual speech analysis code
+    return "30", "Male", "Happy"
 
-    selected_voice = gr.State(value='wav/en_US_m-ailabs_mary_ann.wav')
 
-    with gr.Row():
-        voice_info = gr.Markdown(f'Vox = `{selected_voice.value}`')
 
-
+with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
+    with gr.Tab(label="TTS Generation"):
         with gr.Row():
             text_input = gr.Textbox(
-                label="
-                placeholder="Type
+                label="Type text for TTS:",
+                placeholder="Type Text for TTS",
                 lines=4,
                 value="Farover the misty mountains cold too dungeons deep and caverns old.",
             )
-
-
-
+            # Unified dropdown for both voices and languages
+            # You'll need to handle the logic to determine if it's a voice or a language
+            # based on the selection. A single list of choices is used here.
+            choice_dropdown = gr.Dropdown(
+                choices=language_names + VOICES,
+                label="Select Voice or Language",
+                value=VOICES[0]  # Set a default value
+            )
+            soundscape_input = gr.Textbox(
+                lines=1,
+                value="frogs",
+                label="AudioGen Txt"
+            )
             kv_input = gr.Number(
                 label="kv Period",
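Editor's note: the unified dropdown concatenates `language_names` and the cropped `VOICES`, and the dispatcher added in the next hunk routes by membership in `VOICES` first, so any entry present in both lists would silently be treated as a voice. Also, `css_buttons` is deleted above yet still passed to `gr.Blocks(css=css_buttons)`, so it presumably still needs to be defined somewhere else in the file. A small sanity check on the choice lists (a sketch that assumes the `VOICES` and `language_names` lists from app.py, not code taken from the diff):

```python
# Sketch only: ensure dropdown entries are unambiguous, since the dispatcher
# checks membership in VOICES before falling back to language_names.
overlap = set(VOICES) & set(language_names)
assert not overlap, f"Ambiguous dropdown entries: {sorted(overlap)}"
```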
@@ -1461,32 +1474,21 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
 
         output_audio = gr.Audio(label="TTS Output")
 
-
-
-
-
-
-
-
-
-
-            fn=update_selected_voice,
-            inputs=[gr.Textbox(value=voice_filename, visible=False)],
-            outputs=[selected_voice]
-        )
-        button.click(
-            fn=lambda v=voice_filename: f'Vox = `{v}`',
-            inputs=None,
-            outputs=voice_info
-        )
-        voice_buttons.append(button)
+        def generate_audio_unified(text, choice, soundscape, kv):
+            """
+            Unified function to call the correct TTS backend based on the dropdown choice.
+            """
+            # Logic to determine which function to call based on the choice
+            if choice in VOICES:
+                return other_tts(text, choice, soundscape, kv)
+            elif choice in language_names:
+                return audionar_tts(text, choice, soundscape, kv)
 
         generate_button.click(
-            fn=
-            inputs=[text_input,
+            fn=generate_audio_unified,
+            inputs=[text_input, choice_dropdown, soundscape_input, kv_input],
             outputs=output_audio
         )
-
     with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
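Two things worth flagging here. First, the stub `other_tts(text_input, selected_voice, soundscape_input, kv_input)` added in the previous hunk comes after the real `other_tts` defined around line 1324, so as written it appears to shadow it and the dispatcher would hit the placeholder. Second, `generate_audio_unified` has no `else` branch, so a selection matching neither list returns `None` and surfaces as an opaque error in the audio component. A hedged variant of the dispatcher with an explicit error (a sketch reusing the app.py names, not the committed code):

```python
import gradio as gr

# Sketch only: same routing as generate_audio_unified, plus an explicit
# error for selections that match neither list.
def generate_audio_unified_strict(text, choice, soundscape, kv):
    if choice in VOICES:
        return other_tts(text, choice, soundscape, kv)
    if choice in language_names:
        return audionar_tts(text, choice, soundscape, kv)
    raise gr.Error(f"Unknown voice/language selection: {choice}")
```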
@@ -1517,29 +1519,4 @@ with gr.Blocks(theme='huggingface', css=css_buttons) as demo:
             outputs = [output_age, output_gender, output_expression]
             submit_btn.click(recognize, input, outputs)
 
-
-    with gr.Tab("audionar TTS"):
-        with gr.Row():
-            text_input = gr.Textbox(
-                lines=4,
-                value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
-                label="Type text for TTS"
-            )
-            lang_dropdown = gr.Dropdown(choices=language_names, label="TTS language", value="Ancient greek")
-            soundscape_input = gr.Textbox(lines=1, value="dogs barg", label="AudioGen Txt")
-            kv_input = gr.Number(label="kv Period", value=70)
-
-            # Create a button to trigger the TTS function
-            tts_button = gr.Button("Generate Audio")
-
-            # Create the output audio component
-            audio_output = gr.Audio(label="Generated Audio")
-
-            # Link the button click event to the mms_tts function
-            tts_button.click(
-                fn=audionar_tts,
-                inputs=[text_input, lang_dropdown, soundscape_input, kv_input],
-                outputs=audio_output
-            )
-
-demo.launch(debug=True)
+demo.launch(debug=True)