import gradio as gr
import whisper
from gtts import gTTS
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Assumed dependencies: openai-whisper, gradio, transformers, sentencepiece, gTTS.


def translate_voice(file, target_lang):
    try:
        # Load Whisper in full-precision float32 (fp16 is unsupported on CPU).
        speech_model = whisper.load_model("base").float()

        # Load the audio, then pad or trim it to the 30-second window Whisper expects.
        audio = whisper.load_audio(file)
        audio = whisper.pad_or_trim(audio)

        # Convert the audio to a log-Mel spectrogram on the model's device,
        # again in float32.
        mel = whisper.log_mel_spectrogram(audio).to(speech_model.device).float()

        # Detect the spoken language, then transcribe.
        _, probs = speech_model.detect_language(mel)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(speech_model, mel, options)
        text = result.text
        lang = max(probs, key=probs.get)  # detected source language (not returned)

        # Translate with SMALL-100. Its convention is to prepend the *target*
        # language code to the source sentence, so src_lang is set to target_lang here.
        tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
        translation_model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
        tokenizer.src_lang = target_lang
        encoded = tokenizer(text, return_tensors="pt")
        generated_tokens = translation_model.generate(**encoded)
        translated_text = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]

        # Text-to-speech (TTS) on the translated text.
        tts = gTTS(text=translated_text, lang=target_lang)
        filename = "to_speech.mp3"
        tts.save(filename)

        return filename, text, translated_text, target_lang
    except Exception as e:
        # Return None for the audio slot so Gradio doesn't try to play an error string.
        return None, f"Error: {e}", "", ""


iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.File(label="Your Audio", type="filepath"),
        gr.Dropdown(choices=["en", "ru", "de", "fr"], label="Target Language"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)

iface.launch()
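
# For a quick smoke test without the UI, the pipeline function can also be
# called directly. A sketch only (iface.launch() above blocks, so run this in
# a separate session; "sample.wav" is a hypothetical local recording):
#
#   audio_path, original, translated, lang = translate_voice("sample.wav", "fr")
#   print(original, "->", translated)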