import tempfile

import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import sentencepiece  # noqa: F401 -- pulled in at runtime by the SMALL-100 tokenizer
import sounddevice as sd
import soundfile as sf


def translate_voice(audio, target_lang):
    # Gradio delivers microphone audio as a (sample_rate, numpy_array) tuple.
    sample_rate, data = audio

    # Persist the recording so Whisper can load (and resample) it from disk.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
        temp_filename = temp_audio.name
    sf.write(temp_filename, data, sample_rate)

    # Transcribe with Whisper in FP32 (fp16=False avoids half-precision warnings on CPU).
    model = whisper.load_model("base").float()
    audio = whisper.load_audio(temp_filename)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device).float()
    _, probs = model.detect_language(mel)
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    text = result.text
    detected_lang = max(probs, key=probs.get)  # detected source language (informational only)

    # Translate with SMALL-100. The model expects the *target* language code
    # prepended to the source text, which the M2M100-style tokenizer does via
    # src_lang -- hence src_lang is set to the target language here.
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
    translator = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
    tokenizer.src_lang = target_lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = translator.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize the translation to speech.
    tts = gTTS(text=translated_text, lang=target_lang)
    filename = "to_speech.mp3"
    tts.save(filename)

    return filename, text, translated_text, target_lang


def record_audio():
    """Standalone helper for recording from the local microphone.

    Not wired into the Gradio interface below, which records in the browser.
    """
    fs = 16000
    duration = 5  # seconds; adjust as needed
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1)
    sd.wait()
    return audio.flatten()


iface = gr.Interface(
    fn=translate_voice,
    inputs=[
        gr.Audio(sources=["microphone"], type="numpy", label="Speak"),
        gr.Dropdown(choices=["en", "ru", "de", "fr"], label="Target Language"),
    ],
    outputs=[
        gr.Audio(type="filepath", label="Translated Audio"),
        gr.Textbox(label="Original Text"),
        gr.Textbox(label="Translated Text"),
        gr.Textbox(label="Target Language"),
    ],
)

iface.launch()
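
# A minimal sketch of a headless sanity check for translate_voice, bypassing the
# browser UI. It assumes a recording exists at the hypothetical path sample.wav;
# run it instead of iface.launch() above:
#
#   data, sr = sf.read("sample.wav", dtype="int16")
#   audio_out, original, translated, lang = translate_voice((sr, data), "de")
#   print(original, "->", translated)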