Spaces:

Baghdad99
/

ha-en

Sleeping

Baghdad99 committed on Dec 21, 2023

Commit

ea3653e

•

1 Parent(s): 776c3e1

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 from transformers import pipeline, AutoTokenizer
 import numpy as np
 from pydub import AudioSegment
 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
@@ -12,18 +13,19 @@ pipe = pipeline(
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
-def translate_speech(audio_data_tuple):
     print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line
-    # Extract the audio data from the tuple
-    sample_rate, audio_data = audio_data_tuple
-    # Print the shape and type of the audio data
-    print(f"Audio data type: {type(audio_data)}, Audio data shape: {audio_data.shape}")
     # Normalize the audio data to the range [-1, 1]
     audio_data_normalized = audio_data / np.iinfo(audio_data.dtype).max
     # Convert the normalized audio data to float64
     audio_data_float64 = audio_data_normalized.astype(np.float64)
@@ -82,7 +84,7 @@ def translate_speech(audio_data_tuple):
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,
-    inputs=gr.inputs.Audio(source="microphone"),  # Change this line
     outputs=gr.outputs.Audio(type="numpy"),
     title="Hausa to English Translation",
     description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."

 from transformers import pipeline, AutoTokenizer
 import numpy as np
 from pydub import AudioSegment
+import librosa
 # Load the pipeline for speech recognition and translation
 pipe = pipeline(
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
+def translate_speech(audio_input):
     print(f"Type of audio: {type(audio_data_tuple)}, Value of audio: {audio_data_tuple}")  # Debug line
+    # Check if the input is a tuple (recorded audio) or a string (uploaded file)
+    if isinstance(audio_input, tuple):
+        # Extract the audio data from the tuple
+        sample_rate, audio_data = audio_input
+    else:
+        # Load the audio file as a floating point time series
+        audio_data, sample_rate = librosa.load(audio_input, sr=None)
     # Normalize the audio data to the range [-1, 1]
     audio_data_normalized = audio_data / np.iinfo(audio_data.dtype).max
     # Convert the normalized audio data to float64
     audio_data_float64 = audio_data_normalized.astype(np.float64)
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,
+    inputs=gr.inputs.Audio(source="microphone", type="file"),  # Change this line
     outputs=gr.outputs.Audio(type="numpy"),
     title="Hausa to English Translation",
     description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."