Spaces:
Sleeping
Sleeping
File size: 2,590 Bytes
5b74a4b 8a6097b ee193bb 83e3ccb 5b74a4b ee193bb f0d7e02 def416c a927d1d 393002d 3431153 8a6097b ee193bb 25fb027 393002d 8a6097b ee193bb 8a6097b cd0ec84 8a6097b cd0ec84 25fb027 8a6097b 25fb027 c58bd88 8c23bfa 25fb027 17cfe18 25fb027 a5ec736 b2c7d3a 5b74a4b ee37b95 8fe6fd5 5b74a4b ee37b95 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
import os
import tempfile
import wave

import gradio as gr
import numpy as np
from huggingsound import SpeechRecognitionModel
from transformers import pipeline, AutoTokenizer
# Hugging Face model identifiers, one per stage of the speech pipeline.
ASR_MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
TRANSLATION_MODEL_ID = "Baghdad99/saad-english-text-to-hausa-text"
TTS_MODEL_ID = "Baghdad99/hausa_voice_tts"

# Instantiate every stage once at import time so requests reuse the same objects:
# speech recognition, then text translation, then speech synthesis.
model = SpeechRecognitionModel(ASR_MODEL_ID)
translator = pipeline("text2text-generation", model=TRANSLATION_MODEL_ID)
tts = pipeline("text-to-speech", model=TTS_MODEL_ID)
def _to_int16_pcm(samples):
    """Convert an audio array to 16-bit PCM, clipping floats to [-1.0, 1.0]."""
    pcm = np.asarray(samples)
    if pcm.dtype == np.int16:
        return pcm
    if np.issubdtype(pcm.dtype, np.integer):
        # Rescale other integer widths instead of letting astype() wrap around.
        pcm = pcm.astype(np.float64) / float(np.iinfo(pcm.dtype).max)
    # Clip before scaling so out-of-range samples saturate rather than overflow.
    return (np.clip(pcm, -1.0, 1.0) * 32767).astype(np.int16)


def _write_temp_wav(sample_rate, samples):
    """Write ``samples`` to a temporary 16-bit PCM WAV file and return its path."""
    pcm = _to_int16_pcm(samples)
    # A 2-D array is treated as (frames, channels); 1-D audio is mono.
    channels = 1 if pcm.ndim == 1 else pcm.shape[1]
    handle, path = tempfile.mkstemp(suffix=".wav")
    os.close(handle)
    with wave.open(path, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(2)
        wav_file.setframerate(int(sample_rate))
        # WAV stores little-endian samples; C-order bytes interleave channels.
        wav_file.writeframes(pcm.astype("<i2").tobytes())
    return path


def translate_speech(audio_data_tuple):
    """Transcribe spoken audio, translate the text and synthesise the result.

    Args:
        audio_data_tuple: Either a ``(sample_rate, samples)`` pair, where
            ``samples`` is array-like audio, or a string path to an audio
            file. ``None`` (nothing recorded) is accepted.

    Returns:
        A ``(sampling_rate, int16_samples)`` tuple holding the synthesised
        speech, or ``None`` when there is no input audio, nothing was
        recognised, the translation is empty, or the text-to-speech output
        has no ``'audio'`` entry.
    """
    if audio_data_tuple is None:
        print("No audio was provided")
        return None

    temp_path = None
    if isinstance(audio_data_tuple, str):
        audio_path = audio_data_tuple
    else:
        sample_rate, audio_data = audio_data_tuple
        audio_data = np.asarray(audio_data)
        print(f"Received audio: sample_rate={sample_rate}, shape={audio_data.shape}, dtype={audio_data.dtype}")
        if audio_data.size == 0:
            print("The audio input is empty")
            return None
        # SpeechRecognitionModel.transcribe expects a list of audio file
        # paths, so in-memory samples are written to a temporary WAV first.
        temp_path = _write_temp_wav(sample_rate, audio_data)
        audio_path = temp_path

    try:
        transcriptions = model.transcribe([audio_path])
    finally:
        # Always remove the temporary file, even if transcription fails.
        if temp_path is not None:
            os.remove(temp_path)
    print(f"Output: {transcriptions}")

    # One result dict is returned per input path; only one path was sent.
    transcription = ""
    if transcriptions:
        transcription = (transcriptions[0].get("transcription") or "").strip()
    if not transcription:
        print("No speech was recognised in the audio")
        return None

    # The default pipeline output is already-decoded text without special
    # tokens, so there is no need to request token ids and decode manually.
    translated_text = translator(transcription)
    print(f"Translated text: {translated_text}")
    translated_text_str = ""
    if translated_text:
        translated_text_str = (translated_text[0].get("generated_text") or "").strip()
    if not translated_text_str:
        print("The translation result does not contain any text")
        return None

    # Use the text-to-speech pipeline to synthesise the translated text.
    synthesised_speech = tts(translated_text_str)
    if 'audio' not in synthesised_speech:
        print("The synthesised speech does not contain 'audio'")
        return None

    waveform = np.asarray(synthesised_speech['audio']).flatten()
    # Prefer the rate reported by the pipeline; 16000 was the previous
    # hard-coded value and remains the fallback.
    sampling_rate = synthesised_speech.get('sampling_rate', 16000)
    return sampling_rate, _to_int16_pcm(waveform)
# Define the Gradio interface.
# The models loaded by this app are an English speech recogniser, an
# English-to-Hausa text translator and a Hausa text-to-speech voice, so the
# user-facing labels describe English -> Hausa (they previously had the
# direction reversed).
iface = gr.Interface(
    fn=translate_speech,
    # Microphone recording is handed to translate_speech as its only argument.
    inputs=gr.inputs.Audio(source="microphone"),
    # translate_speech returns a (sampling_rate, samples) tuple for playback.
    outputs=gr.outputs.Audio(type="numpy"),
    title="English to Hausa Translation",
    description="Realtime demo for English to Hausa translation using speech recognition and text-to-speech synthesis.",
)
iface.launch()
|