# speech2speech/app.py
import numpy as np
import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
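
# Pipeline: record speech -> transcribe with Whisper (STT) -> translate the
# transcript with SMaLL-100 (MT) -> synthesize the translation with gTTS (TTS).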
# Load Whisper STT model
whisper_model = whisper.load_model("base")
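# "base" is a small, fast checkpoint; larger ones ("small", "medium", "large")
# transcribe more accurately at the cost of memory and latency.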
# Load translation models
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
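# Note: the SMaLL-100 model card loads its custom SMALL100Tokenizer and selects
# the output language via tokenizer.tgt_lang; we follow that convention below.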
def translate_speech(audio):
    # Gradio delivers numpy audio as a (sample_rate, samples) tuple.
    sr, data = audio
    if data.ndim > 1:  # downmix stereo to mono
        data = data.mean(axis=1)
    # Whisper expects float32 samples in [-1, 1]; Gradio records int16.
    data = data.astype(np.float32) / 32768.0
    if sr != 16000:  # Whisper expects 16 kHz; a simple linear resample suffices here
        new_len = int(len(data) * 16000 / sr)
        data = np.interp(np.linspace(0, len(data), new_len, endpoint=False),
                         np.arange(len(data)), data).astype(np.float32)
    audio = whisper.pad_or_trim(data)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    # Detect the spoken language and decode with it.
    _, probs = whisper_model.detect_language(mel)
    detected_lang = max(probs, key=probs.get)
    options = whisper.DecodingOptions(fp16=False, language=detected_lang)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate text: SMaLL-100 takes the *target* language on the tokenizer.
    tokenizer.tgt_lang = 'en'  # assuming the desired output language is English
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-speech (TTS)
    tts = gTTS(text=translated_text, lang='en')  # speak the English translation
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)
    return audio_path
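
# Quick local check (hypothetical; bypasses the UI). Assumes a mono 16 kHz
# "sample.wav" sits next to this file:
#   import soundfile as sf
#   data, sr = sf.read("sample.wav", dtype="int16")
#   print(translate_speech((sr, data)))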
def translate_speech_interface(audio):
    # Return the saved mp3's path; a "filepath" output component reads the file itself.
    return translate_speech(audio)
# Gradio 3.x style: gr.Audio replaces the deprecated gr.inputs/gr.outputs namespaces.
audio_recording = gr.Audio(source="microphone", type="numpy", label="Record your speech")
output_audio = gr.Audio(type="filepath", label="Translated Audio")
iface = gr.Interface(fn=translate_speech_interface, inputs=audio_recording, outputs=output_audio, title="Speech Translator")
iface.launch()