# TextToSpeech / app.py
# Nimesh Naik
# model name change
# eb67530
import functools
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Indic Parler-TTS checkpoint and its tokenizer, loaded once when the module is imported.
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")

# Supported languages (Indic Parler-TTS officially supports these).
# Used both to populate the UI dropdown and to validate requests.
languages = [
    "Assamese",
    "Bengali",
    "Bodo",
    "Dogri",
    "English",
    "Gujarati",
    "Hindi",
    "Kannada",
    "Konkani",
    "Maithili",
    "Malayalam",
    "Manipuri",
    "Marathi",
    "Nepali",
    "Odia",
    "Sanskrit",
    "Santali",
    "Sindhi",
    "Tamil",
    "Telugu",
    "Urdu",
]
@functools.lru_cache(maxsize=1)
def _get_description_tokenizer():
    """Load (once) the tokenizer that belongs to the model's description encoder."""
    # The Indic Parler-TTS model card tokenizes the voice description with the
    # text encoder's own tokenizer, not with the prompt tokenizer.
    return AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)


def generate_speech(text, language, voice_description):
    """
    Generate speech from text, language, and voice description.

    Args:
        text: The text to be spoken. ``None`` or whitespace-only input is rejected.
        language: Must be one of the names in the module-level ``languages`` list.
        voice_description: Free-form description of the desired voice.
            ``None`` is treated as an empty description.

    Returns:
        An ``(audio_path, error_message)`` tuple. On success ``audio_path`` is
        the path of a newly written WAV file and ``error_message`` is ``None``.
        On failure ``audio_path`` is ``None`` and ``error_message`` says why.
    """
    # Guard against None as well as blank strings so the check itself cannot raise.
    if text is None or not text.strip():
        return None, "Error: Text input cannot be empty."
    if language not in languages:
        return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"

    # Combine voice description with language context (optional, for better control)
    description = f"A speaker delivering speech in {language}. {voice_description or ''}".strip()

    # This is the UI boundary: any failure in tokenization, generation or file
    # writing is reported through the status box instead of crashing the handler.
    try:
        description_inputs = _get_description_tokenizer()(description, return_tensors="pt").to(device)
        prompt_inputs = tokenizer(text, return_tensors="pt").to(device)

        generation = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )
        audio_arr = generation.cpu().numpy().squeeze()

        # Write each result to its own file: a single shared "output.wav" would
        # be overwritten when two requests are served at the same time.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_file = tmp.name
        sf.write(output_file, audio_arr, model.config.sampling_rate)
    except Exception as e:
        return None, f"Error generating audio: {str(e)}"

    return output_file, None
# Gradio interface: builds the page and wires the button to generate_speech.
with gr.Blocks() as demo:
    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
    gr.Markdown(
        "Enter text, select a language, and describe the voice to generate audio. Download the audio output."
    )

    # NOTE(review): the text box and language picker are assumed to share this
    # row, with the remaining components stacked below it -- confirm the layout.
    with gr.Row():
        text_input = gr.Textbox(
            placeholder="Enter text to convert to speech...",
            label="Input Text",
        )
        language_input = gr.Dropdown(
            choices=languages,
            value="English",
            label="Language",
        )

    voice_description = gr.Textbox(
        value="A neutral speaker with clear audio quality.",
        placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
        label="Voice Description",
    )
    generate_btn = gr.Button("Generate Audio")

    # Read-only outputs: the audio player receives a file path, the text box a status message.
    audio_output = gr.Audio(type="filepath", interactive=False, label="Generated Audio")
    error_output = gr.Textbox(interactive=False, visible=True, label="Status/Error")

    # Connect button to function
    generate_btn.click(
        generate_speech,
        [text_input, language_input, voice_description],
        [audio_output, error_output],
    )

# Start the web server only when this file is executed directly.
if __name__ == "__main__":
    demo.launch()