import os import gradio as gr import spaces import time from tts_model import TTSModel from lib import format_audio_output # Set HF_HOME for faster restarts with cached models/voices os.environ["HF_HOME"] = "/data/.huggingface" # Create TTS model instance model = TTSModel() @spaces.GPU(duration=10) # Quick initialization def initialize_model(): """Initialize model and get voices""" if model.model is None: if not model.initialize(): raise gr.Error("Failed to initialize model") return model.list_voices() # Get initial voice list voice_list = initialize_model() @spaces.GPU(duration=120) # Allow 5 minutes for processing def generate_speech_from_ui(text, voice_name, speed, progress=gr.Progress(track_tqdm=False)): """Handle text-to-speech generation from the Gradio UI""" try: start_time = time.time() gpu_timeout = 120 # seconds # Create progress state progress_state = { "progress": 0.0, "tokens_per_sec": 0.0, "gpu_time_left": gpu_timeout } def update_progress(chunk_num, total_chunks, tokens_per_sec, rtf): progress_state["progress"] = chunk_num / total_chunks progress_state["tokens_per_sec"] = tokens_per_sec # Update GPU time remaining elapsed = time.time() - start_time gpu_time_left = max(0, gpu_timeout - elapsed) progress_state["gpu_time_left"] = gpu_time_left # Only update progress display during processing progress(progress_state["progress"], desc=f"Processing chunk {chunk_num}/{total_chunks} | GPU Time Left: {int(gpu_time_left)}s") # Generate speech with progress tracking audio_array, duration = model.generate_speech( text, voice_name, speed, progress_callback=update_progress ) # Format output for Gradio audio_output, duration_text = format_audio_output(audio_array) # Calculate final metrics total_time = time.time() - start_time total_duration = len(audio_array) / 24000 # audio duration in seconds final_rtf = total_time / total_duration if total_duration > 0 else 0 # Prepare final metrics display metrics_text = ( f"Tokens/sec: {progress_state['tokens_per_sec']:.1f}\n" + f"Real-time factor: {final_rtf:.2f}x (Processing Time / Audio Duration)\n" + f"GPU Time Used: {int(total_time)}s of {gpu_timeout}s" ) return ( audio_output, metrics_text, duration_text ) except Exception as e: raise gr.Error(f"Generation failed: {str(e)}") # Create Gradio interface with gr.Blocks(title="Kokoro TTS Demo") as demo: gr.HTML( """
Convert text to natural-sounding speech using various voices.