"""Gradio app: detect the spoken language with Whisper, translate the
transcript with SMALL-100, and speak the result with gTTS."""

import gradio as gr
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from gtts import gTTS
import soundfile as sf


def translate_speech_to_speech(input_tuple):
    # Gradio delivers numpy audio as (sample_rate, data), not (data, sample_rate).
    sample_rate, input_audio = input_tuple

    # Save the recording to a temporary file so Whisper can load it.
    input_file = "input_audio.wav"
    sf.write(input_file, input_audio, sample_rate)

    # Detect the spoken language with Whisper (load_audio shells out to ffmpeg).
    whisper_model = whisper.load_model("base")
    audio = whisper.load_audio(input_file)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Transcribe the audio.
    options = whisper.DecodingOptions()
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate the transcript into the target language with SMALL-100.
    to_lang = "ru"
    tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
    translation_model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
    tokenizer.src_lang = lang
    tokenizer.tgt_lang = to_lang  # SMALL-100 conditions on the target-language token
    encoded = tokenizer(text, return_tensors="pt")
    generated_tokens = translation_model.generate(**encoded)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Synthesize the translation with gTTS. gTTS always writes MP3, so the file
    # is named accordingly and returned as a path instead of being re-read with
    # soundfile and returned as a bare array (which drops the sample rate).
    tts = gTTS(text=translated_text, lang=to_lang)
    output_file = "translated_speech.mp3"
    tts.save(output_file)
    return output_file


title = "Speech-to-Speech Translator"
# gr.inputs.Audio / gr.outputs.Audio were removed in Gradio 4; the components
# are now configured directly on gr.Audio.
input_audio = gr.Audio(sources=["microphone"], type="numpy")
output_audio = gr.Audio(type="filepath")

stt_demo = gr.Interface(
    fn=translate_speech_to_speech,
    inputs=input_audio,
    outputs=output_audio,
    title=title,
    description="Speak in any language, and the translator will convert it to speech in the target language.",
)

if __name__ == "__main__":
    stt_demo.launch()
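# Usage note (a sketch; exact package names are assumptions inferred from the
# imports above, not stated in the original):
#   pip install gradio openai-whisper transformers gtts soundfile sentencepiece torch
# Whisper's load_audio additionally needs the ffmpeg binary on PATH. Save the
# script under any name, run it with `python <script>.py`, and open the local
# URL that Gradio prints to try the microphone-to-translated-speech loop.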