"""Gradio demo for the AI4Bharat Indic Parler-TTS text-to-speech model.

The app takes input text, a target language and a free-form voice
description, synthesizes speech with Indic Parler-TTS, and returns a WAV
file that can be played or downloaded from the browser.
"""

import logging
import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

logger = logging.getLogger(__name__)

MODEL_ID = "ai4bharat/indic-parler-tts"

# Set device (GPU if available, else CPU)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load Indic Parler-TTS model and tokenizers.
# The prompt (text to be spoken) and the voice description use DIFFERENT
# tokenizers: the prompt tokenizer ships with the model repo, while the
# description must be tokenized with the tokenizer of the model's text encoder.
model = ParlerTTSForConditionalGeneration.from_pretrained(MODEL_ID).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)

# Supported languages (Indic Parler-TTS officially supports these)
languages = [
    "Assamese", "Bengali", "Bodo", "Dogri", "English", "Gujarati", "Hindi",
    "Kannada", "Konkani", "Maithili", "Malayalam", "Manipuri", "Marathi",
    "Nepali", "Odia", "Sanskrit", "Santali", "Sindhi", "Tamil", "Telugu",
    "Urdu",
]


def generate_speech(
    text: str, language: str, voice_description: str
) -> Tuple[Optional[str], Optional[str]]:
    """Synthesize speech for *text* and save it as a WAV file.

    Args:
        text: The text to be spoken. Must contain non-whitespace characters.
        language: One of the entries in ``languages``.
        voice_description: Free-form description of the desired voice
            (gender, tone, pace, recording quality, ...). May be empty.

    Returns:
        A ``(audio_path, error_message)`` pair. On success ``audio_path`` is
        the path of the generated WAV file and ``error_message`` is ``None``;
        on failure ``audio_path`` is ``None`` and ``error_message`` describes
        the problem.
    """
    # Guard against None as well as blank input.
    if not text or not text.strip():
        return None, "Error: Text input cannot be empty."
    if language not in languages:
        return None, (
            f"Error: Language '{language}' is not supported. "
            f"Choose from: {', '.join(languages)}"
        )

    # Combine voice description with language context (optional, for better control)
    voice_description = (voice_description or "").strip()
    description = (
        f"A speaker delivering speech in {language}. {voice_description}"
    ).strip()

    # Tokenization lives inside the try so tokenizer failures are reported in
    # the status box instead of surfacing as an unhandled exception.
    try:
        description_inputs = description_tokenizer(
            description, return_tensors="pt"
        ).to(device)
        prompt_inputs = tokenizer(text, return_tensors="pt").to(device)

        # Pass the attention masks explicitly, as in the model card's usage example.
        generation = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )
        audio_arr = generation.cpu().numpy().squeeze()

        # Write to a unique temporary file per request. A fixed "output.wav"
        # would be overwritten when several users generate audio concurrently.
        # The file is left on disk so Gradio can read it after this function
        # returns; it lives in the system temp directory.
        fd, output_file = tempfile.mkstemp(prefix="indic_tts_", suffix=".wav")
        os.close(fd)
        sf.write(output_file, audio_arr, model.config.sampling_rate)
        return output_file, None
    except Exception as e:  # top-level UI boundary: report instead of crashing
        logger.exception("Audio generation failed")
        return None, f"Error generating audio: {str(e)}"


# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
    gr.Markdown(
        "Enter text, select a language, and describe the voice to generate audio. "
        "Download the audio output."
    )

    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to convert to speech...",
        )
        language_input = gr.Dropdown(
            label="Language", choices=languages, value="English"
        )

    voice_description = gr.Textbox(
        label="Voice Description",
        placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
        value="A neutral speaker with clear audio quality.",
    )

    generate_btn = gr.Button("Generate Audio")
    audio_output = gr.Audio(
        label="Generated Audio", type="filepath", interactive=False
    )
    error_output = gr.Textbox(
        label="Status/Error", visible=True, interactive=False
    )

    # Connect button to function
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, language_input, voice_description],
        outputs=[audio_output, error_output],
    )

if __name__ == "__main__":
    demo.launch()