Spaces:
Running
Running
import gradio as gr | |
import torch | |
from parler_tts import ParlerTTSForConditionalGeneration | |
from transformers import AutoTokenizer | |
import soundfile as sf | |
import numpy as np | |
import os | |
# Run on the first CUDA GPU when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the Indic Parler-TTS checkpoint, move it to the chosen device, and
# load the tokenizer published in the same repository.
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")

# Languages offered in the UI dropdown and accepted by generate_speech.
languages = [
    "Assamese",
    "Bengali",
    "Bodo",
    "Dogri",
    "English",
    "Gujarati",
    "Hindi",
    "Kannada",
    "Konkani",
    "Maithili",
    "Malayalam",
    "Manipuri",
    "Marathi",
    "Nepali",
    "Odia",
    "Sanskrit",
    "Santali",
    "Sindhi",
    "Tamil",
    "Telugu",
    "Urdu",
]
# Tokenizer for the voice description; created on first use by
# _get_description_tokenizer so this block does not depend on extra setup code.
_description_tokenizer = None


def _get_description_tokenizer():
    """Return the tokenizer of the model's text (description) encoder, loading it once."""
    global _description_tokenizer
    if _description_tokenizer is None:
        # The Indic Parler-TTS model card tokenizes the voice description with
        # the text encoder's own tokenizer, not with the prompt tokenizer that
        # ships in the model repository.
        _description_tokenizer = AutoTokenizer.from_pretrained(
            model.config.text_encoder._name_or_path
        )
    return _description_tokenizer


def generate_speech(text, language, voice_description):
    """Synthesize speech for ``text`` and write it to a WAV file.

    Args:
        text: Text to be spoken. ``None``, empty, or whitespace-only input is
            rejected with an error message.
        language: Name of the target language; must be one of ``languages``.
        voice_description: Free-form description of the desired voice. May be
            empty or ``None``, in which case only the language hint is used.

    Returns:
        An ``(audio_path, error_message)`` tuple. On success ``audio_path`` is
        the path of the generated WAV file and ``error_message`` is ``None``;
        on failure ``audio_path`` is ``None`` and ``error_message`` describes
        the problem.
    """
    import tempfile  # local import keeps this block self-contained

    if not text or not text.strip():
        return None, "Error: Text input cannot be empty."
    if language not in languages:
        return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"

    # Prefix the description with a language hint; skip the user part when it
    # is missing so the literal text "None" never reaches the model.
    voice_description = (voice_description or "").strip()
    description = f"A speaker delivering speech in {language}."
    if voice_description:
        description = f"{description} {voice_description}"

    try:
        # Tokenization lives inside the try so its failures are reported in
        # the UI status box instead of escaping as an unhandled exception.
        description_inputs = _get_description_tokenizer()(
            description, return_tensors="pt"
        ).to(device)
        prompt_inputs = tokenizer(text, return_tensors="pt").to(device)

        generation = model.generate(
            input_ids=description_inputs.input_ids,
            attention_mask=description_inputs.attention_mask,
            prompt_input_ids=prompt_inputs.input_ids,
            prompt_attention_mask=prompt_inputs.attention_mask,
        )
        audio_arr = generation.cpu().numpy().squeeze()

        # Give every request its own file: a fixed "output.wav" would be
        # overwritten when several users generate audio at the same time.
        # The file is intentionally not deleted here because the caller reads
        # it after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_file = tmp.name
        sf.write(output_file, audio_arr, model.config.sampling_rate)
    except Exception as e:
        # UI boundary: surface any failure as a message rather than a crash.
        return None, f"Error generating audio: {e}"

    return output_file, None
# Gradio UI: collects text, language and a voice description, then shows the
# generated audio together with a status/error message.
with gr.Blocks() as demo:
    # Page heading and a short usage hint.
    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
    gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")

    # Text and language pickers share one row.
    with gr.Row():
        text_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to convert to speech...",
        )
        language_input = gr.Dropdown(
            label="Language",
            choices=languages,
            value="English",
        )

    # Free-form voice description, pre-filled with a neutral default.
    voice_description = gr.Textbox(
        label="Voice Description",
        placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
        value="A neutral speaker with clear audio quality.",
    )

    generate_btn = gr.Button("Generate Audio")

    # Read-only outputs: the audio file and any status/error text.
    audio_output = gr.Audio(
        label="Generated Audio",
        type="filepath",
        interactive=False,
    )
    error_output = gr.Textbox(
        label="Status/Error",
        visible=True,
        interactive=False,
    )

    # Clicking the button runs generate_speech and fills both outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, language_input, voice_description],
        outputs=[audio_output, error_output],
    )

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()