import tempfile

import gradio as gr
import whisper
from gtts import gTTS
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the models and tokenizer once at startup
whisper_model = whisper.load_model("base")
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")


def translate_audio(input_file, to_lang):
    # Load the audio file and fit it to Whisper's 30-second input window
    audio = whisper.load_audio(input_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

    # Detect the spoken language using Whisper
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Convert audio to text; fp16=False avoids the FP16 warning on CPU-only machines
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate the text. Per the SMALL-100 model card, the tokenizer is
    # conditioned on the *target* language, so set tgt_lang rather than src_lang.
    tokenizer.tgt_lang = to_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Convert the translated text to speech; delete=False keeps the temporary
    # MP3 on disk so Gradio can serve it after this function returns
    tts = gTTS(text=translated_text, lang=to_lang)
    temp_output_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
    tts.save(temp_output_file)

    # Gradio's audio output component accepts a file path directly,
    # so there is no need for IPython.display.Audio here
    return temp_output_file


iface = gr.Interface(
    fn=translate_audio,
    inputs=[
        gr.Audio(type="filepath", label="Input audio"),
        gr.Textbox(label="Target language code (e.g. 'en', 'fr')"),
    ],
    outputs=gr.Audio(label="Translated audio"),
    title="Audio Translation",
    description="Upload an MP3 file and select the target language for translation.",
    examples=[
        ["audio_example.mp3", "en"],
        ["speech_sample.mp3", "fr"],
    ],
)

iface.launch(debug=True)
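# A minimal sketch of calling the pipeline directly, without the Gradio UI
# (the input file name below is hypothetical; any local audio file works):
#
#   translated_path = translate_audio("audio_example.mp3", "fr")
#   print("Translated audio saved to:", translated_path)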