Spaces:

Baghdad99
/

ha-en

Running

App Files Community

Baghdad99 committed on Dec 21, 2023

Commit

5dbc4ea

1 Parent(s): 563f027

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -24

app.py CHANGED Viewed

@@ -1,15 +1,12 @@
 import gradio as gr
-from transformers import pipeline, AutoTokenizer
 import numpy as np
-from pydub import AudioSegment
 import librosa
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-# Load the model and processor
-model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
-processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
@@ -18,27 +15,17 @@ def translate_speech(audio_input):
     audio_data, sample_rate = librosa.load(audio_input, sr=None)
     # Prepare the input dictionary
-    input_dict = processor(audio_data, return_tensors="pt", padding=True)
-    # Use the model to get the logits
-    logits = model(input_dict.input_values.to("cpu")).logits
     # Get the predicted IDs
     pred_ids = torch.argmax(logits, dim=-1)[0]
     # Decode the predicted IDs to get the transcription
-    transcription = processor.decode(pred_ids)
-    # Use the speech recognition pipeline to transcribe the audio
-    output = pipe(audio_data)
-    # Check if the output contains 'text'
-    if 'text' in output:
-        transcription = output["text"]
-        print(f"Transcription: {transcription}")  # Print the transcription
-    else:
-        print("The output does not contain 'text'")
-        return
     # Use the translation pipeline to translate the transcription
     translated_text = translator(transcription, return_tensors="pt")
@@ -71,7 +58,6 @@ def translate_speech(audio_input):
     return 16000, synthesised_speech
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,

+import torch  # Add this line
 import gradio as gr
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline, AutoTokenizer
 import numpy as np
 import librosa
+# Load the models and processors
+asr_model = Wav2Vec2ForCTC.from_pretrained("Akashpb13/Hausa_xlsr")
+asr_processor = Wav2Vec2Processor.from_pretrained("Akashpb13/Hausa_xlsr")
 translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
 tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
     audio_data, sample_rate = librosa.load(audio_input, sr=None)
     # Prepare the input dictionary
+    input_dict = asr_processor(audio_data, return_tensors="pt", padding=True)
+    # Use the ASR model to get the logits
+    logits = asr_model(input_dict.input_values.to("cpu")).logits
     # Get the predicted IDs
     pred_ids = torch.argmax(logits, dim=-1)[0]
     # Decode the predicted IDs to get the transcription
+    transcription = asr_processor.decode(pred_ids)
+    print(f"Transcription: {transcription}")  # Print the transcription
     # Use the translation pipeline to translate the transcription
     translated_text = translator(transcription, return_tensors="pt")
     return 16000, synthesised_speech
 # Define the Gradio interface
 iface = gr.Interface(
     fn=translate_speech,