|
import gradio as gr |
|
import whisper |
|
from translate import Translator |
|
import os |
|
|
|
# Coqui TTS requires accepting its terms of service; pre-accepting via this
# env var keeps the XTTS model download from blocking on an interactive prompt.
os.environ["COQUI_TOS_AGREED"] = "1"


# Load Whisper's "base" checkpoint once at import time; reused by every call
# to speech_to_text() below.
model = whisper.load_model("base")
|
|
|
|
|
def speech_to_text(audio_file):
    """Transcribe an audio file with the module-level Whisper model.

    :param audio_file: Path to the input audio file.
    :return: The transcribed text.
    """
    transcription = model.transcribe(audio_file)["text"]
    # Echo the transcript to the console for quick debugging.
    print(transcription)
    return transcription
|
|
|
|
|
|
|
def translate(text, language):
    """Translate *text* into the requested target language.

    :param text: Input text to be translated.
    :param language: Target language code (e.g. "es").
    :return: The translated text.
    """
    # A fresh Translator per call keeps the function stateless.
    return Translator(to_lang=language).translate(text)
|
|
|
|
|
# NOTE(review): third-party import mid-file — conventionally this belongs at
# the top with the other imports. COQUI_TOS_AGREED is already set above, so
# the late placement does not appear load-bearing; verify before moving.
from TTS.api import TTS

# Instantiate Coqui XTTS v2 (multilingual, voice-cloning) on CPU once at
# import time; reused for every synthesis request.
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
|
|
|
|
|
|
def s2s(audio, language):
    """End-to-end speech translation: transcribe, translate, re-synthesize.

    :param audio: Path to the source audio file; also used as the voice
        sample for cloning during synthesis.
    :param language: Target language code (e.g. "es").
    :return: [transcribed text, translated text, path to synthesized wav]
    """
    # Log the incoming file path for debugging.
    print(audio)

    transcript = speech_to_text(audio)
    translation = translate(transcript, language)

    # Speak the translated text in the original speaker's cloned voice.
    tts_model.tts_to_file(
        text=translation,
        file_path="output.wav",
        speaker_wav=audio,
        language=language,
    )

    return [transcript, translation, "output.wav"]
|
|
|
|
|
|
|
# (display name, language code) pairs kept in one table so the two parallel
# lists below cannot drift out of sync.
_LANGUAGE_TABLE = [
    ("Arabic", "ar"), ("Portuguese", "pt"), ("Chinese", "zh-cn"),
    ("Czech", "cs"), ("Dutch", "nl"), ("English", "en"),
    ("French", "fr"), ("German", "de"), ("Italian", "it"),
    ("Polish", "pl"), ("Russian", "ru"), ("Spanish", "es"),
    ("Turkish", "tr"), ("Korean", "ko"), ("Hungarian", "hu"),
    ("Hindi", "hi"),
]

language_names = [name for name, _ in _LANGUAGE_TABLE]
language_options = [code for _, code in _LANGUAGE_TABLE]
|
|
|
# BUG FIX: gr.Dropdown expects a concrete sequence of (label, value) pairs.
# A bare zip() is a one-shot iterator that can be silently exhausted the
# first time Gradio traverses it, leaving the dropdown empty — materialize
# it into a list.
language_dropdown = gr.Dropdown(
    choices=list(zip(language_names, language_options)),
    value="es",
    label="Target Language",
)

# NOTE(review): this button is never wired into the gr.Interface below
# (Interface supplies its own submit button); kept for interface stability.
translate_button = gr.Button(value="Synthesize and Translate my Voice!")
# Output widgets: the transcript, its translation, and the cloned-voice audio.
transcribed_text = gr.Textbox(label="Transcribed Text")
output_text = gr.Textbox(label="Translated Text")
output_speech = gr.Audio(label="Synthesized Audio", type="filepath")
|
|
|
|
|
# Single-page Gradio app: audio + target-language dropdown in; transcript,
# translation, and synthesized speech out.
demo = gr.Interface(
    fn=s2s,
    inputs=[gr.Audio(sources=["upload", "microphone"],
                     type="filepath",  # s2s expects a file path, not raw samples
                     format='wav',
                     show_download_button=True,
                     waveform_options=gr.WaveformOptions(
                         waveform_color="#01C6FF",
                         waveform_progress_color="#FF69B4",
                         skip_length=2,
                         show_controls=False,
                     )
                     ),
            language_dropdown],
    outputs=[transcribed_text, output_text, output_speech],
    theme=gr.themes.Soft(),
    title="Speech Translation Synthesis",
    description="This speech-to-speech translator uses the Whisper model for speech-to-text "
                "transcription, the translate library for translation, and the Coqui TTS model for text-to-speech "
                "synthesis.",
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases in
    # favor of flagging_mode — confirm the installed version before changing.
    allow_flagging="never"
)

# debug=True surfaces server-side errors in the console/UI; share=True exposes
# a temporary public tunnel URL.
demo.launch(debug=True, share=True)
|
|