|
""" |
|
Speech Translation Demo with Automatic TTS, Restart Option, and About Tab

This demo performs the following:
1. Accepts up to 15 seconds of audio recording from the microphone.
2. Uses OpenAI’s Whisper model to transcribe the speech.
3. Takes the segments of the transcription and translates each segment in turn using Facebook’s M2M100 model.
4. Shows the cumulative translation to the user once every segment has been translated.
5. Automatically converts the final translated text to speech using gTTS.
6. Provides a "Restart Recording" button (located just below the recording section)
   to reset the audio input, translated text, and TTS output.

Note: True real-time translation (i.e. while speaking) requires a continuous streaming
solution which is not provided by the standard browser microphone input.
|
""" |
|
|
|
import gradio as gr |
|
import whisper |
|
import torch |
|
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer |
|
from gtts import gTTS |
|
import uuid |
|
|
|
|
|
|
|
|
|
# Speech-recognition model: the Whisper "base" checkpoint, loaded once when
# the module is imported and shared by every request.
whisper_model = whisper.load_model("base")

# Translation model and its tokenizer; both come from the same M2M100
# checkpoint, so the checkpoint name is spelled out in one place only.
_M2M100_CHECKPOINT = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(_M2M100_CHECKPOINT)
m2m100_model = M2M100ForConditionalGeneration.from_pretrained(_M2M100_CHECKPOINT)
|
|
|
|
|
|
|
|
|
# Maps the display names offered in the target-language dropdown to the
# language codes handed to the M2M100 tokenizer and to gTTS.
LANGUAGES = dict(
    English="en",
    Spanish="es",
    French="fr",
    German="de",
    Chinese="zh",
    Polish="pl",
)
|
|
|
|
|
|
|
|
|
def translate_audio(audio, target_language):
    """
    Transcribe recorded speech with Whisper and translate it with M2M100.

    Args:
        audio: Path of the recorded audio file, or None when there is no
            recording (for example after the audio input has been cleared).
        target_language: Display name of the target language. It is looked up
            in LANGUAGES; unknown names fall back to English ("en").

    Returns:
        The translations of all non-empty Whisper segments, joined by single
        spaces and stripped of surrounding whitespace. When the detected
        source language equals the target language the transcription is
        returned untranslated. Returns an empty string when no audio was
        provided.
    """
    if audio is None:
        # Return an empty string instead of a status message: this value is
        # written to the translated-text box and then passed on to
        # generate_tts, so a message here would be synthesized and played as
        # if it were a translation (and would undo a reset of the outputs).
        return ""

    result = whisper_model.transcribe(audio, fp16=False)
    source_lang = result.get("language", "en")
    target_lang_code = LANGUAGES.get(target_language, "en")

    # Keep only segments that still contain text after trimming whitespace.
    stripped_texts = (
        segment.get("text", "").strip() for segment in result.get("segments", [])
    )
    segment_texts = [text for text in stripped_texts if text]

    if not segment_texts:
        # Nothing to translate; also avoids touching the tokenizer needlessly.
        return ""

    if source_lang == target_lang_code:
        # Same language on both sides: the transcription is the result.
        return " ".join(segment_texts).strip()

    # The source language and the target language id are identical for every
    # segment, so configure/look them up once instead of once per segment.
    # NOTE(review): Whisper may report a language code that the M2M100
    # tokenizer does not know; presumably that raises here - confirm and
    # decide on a fallback if such input is expected.
    tokenizer.src_lang = source_lang
    target_lang_id = tokenizer.get_lang_id(target_lang_code)

    translated_segments = []
    for segment_text in segment_texts:
        encoded = tokenizer(segment_text, return_tensors="pt")
        generated_tokens = m2m100_model.generate(
            **encoded,
            forced_bos_token_id=target_lang_id,
        )
        translated_segments.append(
            tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        )

    return " ".join(translated_segments).strip()
|
|
|
|
|
|
|
|
|
def generate_tts(text, target_language):
    """
    Synthesize speech for the given text with gTTS and save it as an MP3.

    The voice language is the LANGUAGES code for `target_language`, with "en"
    used for unknown names. Each call writes a new, uniquely named
    "tts_<hex>.mp3" file (relative path) and returns that name. When `text`
    is empty or contains only whitespace, nothing is written and None is
    returned.
    """
    voice_code = LANGUAGES.get(target_language, "en")

    if text and text.strip():
        # A random hex id keeps files from different requests apart.
        unique_id = uuid.uuid4().hex
        output_name = f"tts_{unique_id}.mp3"
        gTTS(text=text, lang=voice_code).save(output_name)
        return output_name

    return None
|
|
|
|
|
|
|
|
|
def restart_recording():
    """
    Produce the reset values for the audio input, translated text, and TTS output.

    Returns a 3-tuple in that order: None for the audio input, an empty
    string for the translated text, and None for the TTS audio.
    """
    cleared_audio_input = None
    cleared_translation = ""
    cleared_tts_audio = None
    return cleared_audio_input, cleared_translation, cleared_tts_audio
|
|
|
|
|
|
|
|
|
# Build the two-tab interface: "Demo" holds the recorder, the language
# selector, the restart button and the outputs; "About" describes the app.
with gr.Blocks() as demo:
    with gr.Tabs():

        with gr.TabItem("Demo"):
            gr.Markdown("# Speech Translation Demo")
            gr.Markdown(
                "Speak into the microphone and your speech will be transcribed and translated "
                "segment-by-segment. (Recording is limited to 15 seconds.)\n\n"
                "**Note:** The translation and speech synthesis occur automatically after recording."
            )

            # Recording input and target-language selection.
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record your speech (max 15 seconds)",
                    # Enforce the 15-second limit promised by the label and
                    # the description above; without max_length no limit is
                    # applied to the recording.
                    max_length=15,
                    elem_id="audio_input",
                )
                target_lang_dropdown = gr.Dropdown(
                    choices=list(LANGUAGES),
                    value="English",
                    label="Select Target Language",
                )

            # Restart button, placed just below the recording section.
            with gr.Row():
                restart_button = gr.Button("Restart Recording")

            # Outputs: translated text and its synthesized speech.
            output_text = gr.Textbox(label="Translated Text", lines=10)
            tts_audio = gr.Audio(label="Translated Speech", type="filepath")

            # When the recording changes, translate it first and then turn
            # the translated text into speech.
            audio_input.change(
                fn=translate_audio,
                inputs=[audio_input, target_lang_dropdown],
                outputs=output_text,
            ).then(
                fn=generate_tts,
                inputs=[output_text, target_lang_dropdown],
                outputs=tts_audio,
            )

            # Reset the audio input, the translated text and the TTS output.
            restart_button.click(
                fn=restart_recording,
                inputs=[],
                outputs=[audio_input, output_text, tts_audio],
            )

        with gr.TabItem("About"):
            gr.Markdown(
                """
                **Speech Translation Demo with Automatic TTS and Restart Option**

                This demo performs the following:
                1. Accepts up to 15 seconds of audio recording from the microphone.
                2. Uses OpenAI’s Whisper model to transcribe the speech.
                3. Splits the transcription into segments and translates each segment on-the-fly using Facebook’s M2M100 model.
                4. Streams the cumulative translation output to the user.
                5. Automatically converts the final translated text to speech using gTTS.
                6. Provides a "Restart Recording" button (located just below the recording section) to reset the audio input, translated text, and TTS output.

                **Note:** True real-time translation (i.e. while speaking) requires a continuous streaming solution which is not provided by the standard browser microphone input.
                """
            )

# Start the Gradio server for the interface defined above.
demo.launch()
|
|
|
|