"""Gradio demo that turns text into speech with a pretrained TTS model.

The module loads the processor and model once at import time, exposes
``text_to_speech`` as the synthesis entry point, and builds a Blocks UI in
``create_interface``. The web server is only started when the file is run as
a script.
"""

import logging

import gradio as gr
import numpy as np
import torch
from transformers import AutoModel, AutoProcessor

logger = logging.getLogger(__name__)

# Initialize models and processors.
# NOTE(review): MMS-TTS checkpoints are usually published per language
# (e.g. "facebook/mms-tts-eng") and loaded through the VITS classes; confirm
# that this model id resolves with AutoProcessor/AutoModel and that the
# loaded model really accepts `speaker_id` / `speed_ratios` in `generate`.
model_id = "facebook/mms-tts"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

# UI label -> language code passed to the processor.
LANGUAGES = {
    "English": "eng",
    "French": "fra",
    "Spanish": "spa",
}

# UI label -> speaker index passed to the model.
SPEAKERS = {
    "Male": 0,
    "Female": 1,
}

# Markdown is kept flush-left at module level so headings and list items
# each start on their own line regardless of how the UI code is indented.
_HEADER_MARKDOWN = """\
# 🎙️ Multilingual Text-to-Speech
Convert text to natural-sounding speech in multiple languages.
"""

_FEATURES_MARKDOWN = """\
### Features:
- Support for English, French, and Spanish
- Male and Female voice options
- Adjustable speech speed
- High-quality, natural-sounding voices
"""


def text_to_speech(text: str, language: str, speaker_gender: str, speed: float):
    """Synthesize speech for ``text`` and return it in Gradio's audio format.

    Args:
        text: The text to speak. Must contain at least one non-blank character.
        language: A key of ``LANGUAGES`` (e.g. ``"English"``).
        speaker_gender: A key of ``SPEAKERS`` (``"Male"`` or ``"Female"``).
        speed: Speed ratio forwarded to the model as ``speed_ratios``.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``waveform`` is the first
        generated output converted to a NumPy array and ``sampling_rate`` is
        read from ``model.config.sampling_rate``.

    Raises:
        gr.Error: If the input is empty, a selection is unknown, or the model
            fails. Gradio displays the message to the user instead of leaving
            the audio output silently empty.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert to speech.")
    if language not in LANGUAGES:
        raise gr.Error(f"Unsupported language: {language!r}")
    if speaker_gender not in SPEAKERS:
        raise gr.Error(f"Unsupported speaker: {speaker_gender!r}")

    try:
        # Prepare inputs
        inputs = processor(
            text=text,
            language=LANGUAGES[language],
            return_tensors="pt",
        )

        # Generate speech; no gradients are needed for inference.
        with torch.no_grad():
            output = model.generate(
                **inputs,
                speaker_id=torch.tensor([SPEAKERS[speaker_gender]]),
                speed_ratios=torch.tensor([speed]),
            )

        # Convert the first generated item to a NumPy waveform.
        speech = output[0].cpu().numpy()
        sampling_rate = model.config.sampling_rate
    except Exception as exc:
        # This is the UI boundary: keep the traceback in the logs and surface
        # a readable message rather than swallowing the failure.
        logger.exception("Speech generation failed")
        raise gr.Error(f"Speech generation failed: {exc}") from exc

    return (sampling_rate, speech)


# Create Gradio interface
def create_interface() -> gr.Blocks:
    """Build the Blocks UI and wire the button to ``text_to_speech``.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="gray",
    )

    with gr.Blocks(theme=theme) as demo:
        gr.Markdown(_HEADER_MARKDOWN)

        with gr.Row():
            # Left column: inputs and the trigger button.
            with gr.Column():
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type your text here...",
                    lines=5,
                )
                language = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="English",
                    label="Language",
                )
                speaker = gr.Radio(
                    choices=list(SPEAKERS.keys()),
                    value="Male",
                    label="Speaker Gender",
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed",
                )
                submit_btn = gr.Button("Generate Speech", variant="primary")

            # Right column: generated audio.
            with gr.Column():
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                )

        submit_btn.click(
            fn=text_to_speech,
            inputs=[text_input, language, speaker, speed],
            outputs=audio_output,
        )

        gr.Markdown(_FEATURES_MARKDOWN)

    return demo


# `demo` stays a module-level name so tools that import this file can find
# the app; the server itself only starts when the file is executed directly.
demo = create_interface()

if __name__ == "__main__":
    demo.launch()