import gradio as gr import whisper from translate import Translator import os os.environ["COQUI_TOS_AGREED"] = "1" # Loading the base model model = whisper.load_model("base") def speech_to_text(audio_file): """ :param audio_file: Input audio file :return: result["text"] - The transcribed text after processing the input """ result = model.transcribe(audio_file) print(result["text"]) return result["text"] # Only first tuple # Defining the Translate Function def translate(text, language): """ :param text: Input text to be translated :param language: Language choice :return: translated_text - The translated text """ translator = Translator(to_lang=language) translated_text = translator.translate(text) return translated_text # Initialize TTS model outside the function to avoid reinitialization on each call from TTS.api import TTS tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False) # Speech to Speech Function def s2s(audio, language): """ :param audio: :param language: :return: """ print(audio) # Load the audio file from the file path result_text = speech_to_text(audio) translated_text = translate(result_text, language) # Generate speech using the input audio as the speaker's voice tts_model.tts_to_file(text=translated_text, file_path="output.wav", speaker_wav=audio, language=language) return [result_text, translated_text, "output.wav"] # List of supported language codes language_names = ["Arabic", "Portuguese", "Chinese", "Czech", "Dutch", "English", "French", "German", "Italian", "Polish", "Russian", "Spanish", "Turkish", "Korean", "Hungarian", "Hindi"] language_options = ["ar", "pt", "zh-cn", "cs", "nl", "en", "fr", "de", "it", "pl", "ru", "es", "tr", "ko", "hu", "hi"] language_dropdown = gr.Dropdown(choices=zip(language_names, language_options), value="es", label="Target Language", ) translate_button = gr.Button(value="Synthesize and Translate my Voice!") transcribed_text = gr.Textbox(label="Transcribed Text") output_text = gr.Textbox(label="Translated Text") output_speech = gr.Audio(label="Synthesized Audio", type="filepath") # Gradio interface with the transcribe function as the main function demo = gr.Interface( fn=s2s, inputs=[gr.Audio(sources=["upload", "microphone"], type="filepath", format='wav', # value="Original Audio", show_download_button=True, waveform_options=gr.WaveformOptions( waveform_color="#01C6FF", waveform_progress_color="#FF69B4", skip_length=2, show_controls=False, ) ), language_dropdown], outputs=[transcribed_text, output_text, output_speech], theme=gr.themes.Soft(), title="Speech Translation Synthesis", description="This speech-to-speech translator uses the Whisper model for speech-to-text " "transcription, the translate library for translation, and the Coqui TTS model for text-to-speech " "synthesis.", allow_flagging="never" ) demo.launch(debug=True, share=True)