import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS

# Load the Whisper speech-to-text model
whisper_model = whisper.load_model("base")

# Load the SMaLL-100 translation model.
# NOTE: the SMaLL-100 model card recommends its custom SMALL100Tokenizer
# (tokenization_small100.py from the model repo); AutoTokenizer falls back
# to the stock M2M100 tokenizer, which works as a sketch but may not
# prepend the target-language token exactly as SMaLL-100 expects.
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_speech(audio_path, target_lang):
    # Load the recorded audio and convert it to a log-Mel spectrogram
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Decode the audio into text (fp16=False avoids the FP16 warning on CPU)
    options = whisper.DecodingOptions(language=lang, fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate the text. SMaLL-100 is conditioned on the *target* language,
    # so set tgt_lang here; setting src_lang alone never tells the model
    # which language to produce.
    tokenizer.tgt_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize speech from the translated text (gTTS)
    tts = gTTS(text=translated_text, lang=target_lang)
    output_path = "translated_audio.mp3"
    tts.save(output_path)

    return output_path


# Define the Gradio interface (Gradio 4.x API). With type="filepath",
# Gradio passes the microphone recording to the function as a WAV file
# path and plays whatever file path the function returns, so no manual
# byte-writing or file-handle wrapper is needed.
audio_recording = gr.Audio(sources=["microphone"], type="filepath", label="Record your speech")
target_language = gr.Dropdown(["en", "ru", "fr"], label="Target Language")
output_audio = gr.Audio(type="filepath", label="Translated Audio")

gr.Interface(
    fn=translate_speech,
    inputs=[audio_recording, target_language],
    outputs=output_audio,
    title="Speech Translator",
).launch()
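
# ---------------------------------------------------------------------
# Quick local test (a minimal sketch, not part of the original script):
# exercises the full STT -> translation -> TTS pipeline on an existing
# recording without the browser UI. "sample.wav" is a hypothetical path;
# replace it with any short speech clip on disk. Run this *instead of*
# the launch() call above, since launch() blocks the process.
#
#   out = translate_speech("sample.wav", "fr")
#   print("Translated speech saved to:", out)  # -> translated_audio.mp3
# ---------------------------------------------------------------------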