Spaces:

nimeshnaik
/

TextToSpeech

Running

TextToSpeech / app.py

Nimesh Naik

model name change

eb67530 2 months ago

3.17 kB

	import os
	import tempfile

	import gradio as gr
	import numpy as np
	import soundfile as sf
	import torch
	from parler_tts import ParlerTTSForConditionalGeneration
	from transformers import AutoTokenizer

	# Run on the first CUDA device when one is present; otherwise use the CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Fetch the Indic Parler-TTS checkpoint and its tokenizer, then move the
	# model onto the selected device.
	model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts")
	model = model.to(device)
	tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")

	# Languages offered in the UI dropdown and accepted by generate_speech
	# (the set Indic Parler-TTS officially supports).
	languages = [
	    "Assamese",
	    "Bengali",
	    "Bodo",
	    "Dogri",
	    "English",
	    "Gujarati",
	    "Hindi",
	    "Kannada",
	    "Konkani",
	    "Maithili",
	    "Malayalam",
	    "Manipuri",
	    "Marathi",
	    "Nepali",
	    "Odia",
	    "Sanskrit",
	    "Santali",
	    "Sindhi",
	    "Tamil",
	    "Telugu",
	    "Urdu",
	]

	# Cache for the description tokenizer; populated on first use.
	_description_tokenizer = None


	def _get_description_tokenizer():
	    """Return the tokenizer that belongs to the model's description text encoder.

	    The Indic Parler-TTS model card tokenizes the voice description with the
	    tokenizer of ``model.config.text_encoder`` rather than with the prompt
	    tokenizer, so the description must not go through ``tokenizer``. The
	    tokenizer is loaded once and reused on later calls.
	    """
	    global _description_tokenizer
	    if _description_tokenizer is None:
	        _description_tokenizer = AutoTokenizer.from_pretrained(
	            model.config.text_encoder._name_or_path
	        )
	    return _description_tokenizer


	def generate_speech(text, language, voice_description):
	    """Synthesize speech for ``text`` and save it as a WAV file.

	    Args:
	        text: The text to speak. ``None``, empty, or whitespace-only input is
	            rejected.
	        language: One of the entries in ``languages``.
	        voice_description: Free-form description of the desired voice. May be
	            empty or ``None``.

	    Returns:
	        A ``(audio_path, error_message)`` tuple. On success ``audio_path`` is
	        the path of the generated WAV file and ``error_message`` is ``None``;
	        on failure ``audio_path`` is ``None`` and ``error_message`` explains
	        what went wrong. The function reports errors through the tuple rather
	        than raising, so the UI can show them in the status box.
	    """
	    if text is None or not text.strip():
	        return None, "Error: Text input cannot be empty."
	    if language not in languages:
	        return None, f"Error: Language '{language}' is not supported. Choose from: {', '.join(languages)}"

	    # Combine voice description with language context (optional, for better control).
	    # A missing description is treated as empty instead of the literal "None".
	    description = f"A speaker delivering speech in {language}. {voice_description or ''}".strip()

	    try:
	        # The description and the prompt use different tokenizers; keep the
	        # attention masks so generate() does not have to guess them.
	        description_inputs = _get_description_tokenizer()(description, return_tensors="pt").to(device)
	        prompt_inputs = tokenizer(text, return_tensors="pt").to(device)

	        generation = model.generate(
	            input_ids=description_inputs.input_ids,
	            attention_mask=description_inputs.attention_mask,
	            prompt_input_ids=prompt_inputs.input_ids,
	            prompt_attention_mask=prompt_inputs.attention_mask,
	        )
	        audio_arr = generation.cpu().numpy().squeeze()

	        # Write to a unique temporary file so concurrent requests cannot
	        # overwrite each other's audio. The handle is closed before writing
	        # so soundfile can reopen the path on every platform.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            output_file = tmp.name
	        sf.write(output_file, audio_arr, model.config.sampling_rate)
	        return output_file, None
	    except Exception as e:
	        # Top-level UI boundary: surface any failure in the status box.
	        return None, f"Error generating audio: {str(e)}"

	# Build the web UI: three inputs in one row, a trigger button, and two
	# read-only outputs (the audio player and a status/error box).
	with gr.Blocks() as demo:
	    gr.Markdown("# Indic Parler-TTS: Text-to-Speech")
	    gr.Markdown("Enter text, select a language, and describe the voice to generate audio. Download the audio output.")

	    with gr.Row():
	        text_input = gr.Textbox(
	            label="Input Text",
	            placeholder="Enter text to convert to speech...",
	        )
	        language_input = gr.Dropdown(
	            label="Language",
	            choices=languages,
	            value="English",
	        )
	        voice_description = gr.Textbox(
	            label="Voice Description",
	            placeholder="E.g., A female speaker with a clear, cheerful tone and moderate pace.",
	            value="A neutral speaker with clear audio quality.",
	        )

	    generate_btn = gr.Button("Generate Audio")
	    audio_output = gr.Audio(
	        label="Generated Audio",
	        type="filepath",
	        interactive=False,
	    )
	    error_output = gr.Textbox(
	        label="Status/Error",
	        visible=True,
	        interactive=False,
	    )

	    # Clicking the button runs generate_speech on the three inputs; its
	    # (audio path, message) result fills the audio player and the status box.
	    generate_btn.click(
	        fn=generate_speech,
	        inputs=[text_input, language_input, voice_description],
	        outputs=[audio_output, error_output],
	    )

	# Start the app only when this file is executed directly.
	if __name__ == "__main__":
	    demo.launch()