# Kokoro TTS Gradio app: synthesizes speech from text and emits matching SRT subtitles.
import re
import traceback

import gradio as gr
import numpy as np
import soundfile as sf
import torch

from kokoro import KPipeline
# Helper: Format seconds into an SRT timestamp (hh:mm:ss,mmm)
def format_time(seconds):
    """Convert a duration in seconds to an SRT timestamp string.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        Timestamp formatted as ``HH:MM:SS,mmm`` with comma-separated
        milliseconds, as required by the SRT subtitle format.
    """
    # Work in integer milliseconds so float rounding can never yield an
    # invalid component: the previous f"{secs:06.3f}" formatting produced
    # "00:00:60,000" for inputs like 59.9996.
    total_ms = round(seconds * 1000)
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Run Kokoro TTS over *text* and build audio plus SRT subtitles.

    Args:
        text: Input text to synthesize.
        voice: Kokoro voice identifier (e.g. "af_heart").
        speed: Playback speed multiplier passed to the pipeline.
        lang_code: Kokoro language code ('a' American English, 'b' British, ...).
        split_pattern: Regex used by the pipeline to segment the text.
        debug: When True, per-segment details are appended to the debug log.

    Returns:
        On success: ((sample_rate, audio_ndarray), srt_text, debug_log).
        On failure or empty output: (None, "", debug_log).
    """
    # Kokoro emits audio at 24 kHz; keep the constant in one place.
    SAMPLE_RATE = 24000

    debug_logs = []
    debug_logs.append("Starting Kokoro TTS generation...")
    try:
        debug_logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        # Initialize the pipeline; by default, it will run on CPU if no GPU is available.
        pipeline = KPipeline(lang_code=lang_code)
        debug_logs.append("Pipeline initialized successfully.")
    except Exception as e:
        error_msg = f"Error initializing pipeline: {str(e)}"
        debug_logs.append(error_msg)
        return None, "", "\n".join(debug_logs)

    # Accumulators for audio segments, SRT entries, and per-segment debug info.
    audio_segments = []
    srt_entries = []
    current_time = 0.0  # cumulative time for SRT timestamps
    segment_index = 1
    segment_debug_info = []
    try:
        debug_logs.append("Generating audio segments from input text...")
        # The split_pattern regex controls how the pipeline segments the text.
        generator = pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )
        for gs, ps, audio in generator:
            # Kokoro typically yields torch tensors; normalize to ndarray so
            # len() and np.concatenate behave consistently across versions.
            # NOTE(review): assumes segments are 1-D CPU tensors/arrays — confirm.
            audio = np.asarray(audio)
            duration = len(audio) / SAMPLE_RATE
            start_timestamp = current_time
            end_timestamp = current_time + duration
            # Create an SRT entry for the segment (graphemes serve as the caption).
            srt_entry = f"{segment_index}\n{format_time(start_timestamp)} --> {format_time(end_timestamp)}\n{gs}\n"
            srt_entries.append(srt_entry)
            current_time = end_timestamp
            # Record segment details for debugging.
            segment_debug_info.append(f"Segment {segment_index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}")
            audio_segments.append(audio)
            segment_index += 1
        debug_logs.append("Audio segments generated successfully.")
    except Exception as e:
        error_msg = f"Error during audio generation: {str(e)}\n{traceback.format_exc()}"
        debug_logs.append(error_msg)
        return None, "", "\n".join(debug_logs)

    # Concatenate all generated segments into a single audio array.
    if audio_segments:
        full_audio = np.concatenate(audio_segments)
    else:
        debug_logs.append("No audio segments were generated.")
        return None, "", "\n".join(debug_logs)

    # Joining entries that already end in "\n" yields the blank line SRT
    # requires between consecutive cues.
    srt_content = "\n".join(srt_entries)

    # Combine all debug logs (with optional segment details).
    if debug:
        debug_info = "\n".join(debug_logs + segment_debug_info)
    else:
        debug_info = "\n".join(debug_logs)

    # Return a tuple: audio (with sample rate), the SRT text, and the debug log.
    return (SAMPLE_RATE, full_audio), srt_content, debug_info
# Assemble the Gradio interface: input widgets mirror generate_audio's
# parameters in order; outputs carry the audio, the SRT text, and the log.
_input_widgets = [
    gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
    gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
    gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
    gr.Textbox(
        label="Language Code",
        value="a",
        placeholder="Enter language code ('a' for American English, 'b' for British, etc.)",
    ),
    gr.Textbox(
        label="Split Pattern (Regex)",
        value=r'\n+',
        placeholder="Regex to split the input text (e.g., '\\n+')",
    ),
    gr.Checkbox(label="Enable Debug Mode", value=True),
]
_output_widgets = [
    gr.Audio(label="Generated Audio", type="numpy"),
    gr.Textbox(label="Generated SRT"),
    gr.Textbox(label="Debug Information", lines=15),
]
iface = gr.Interface(
    fn=generate_audio,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=(
        "This app uses the Kokoro TTS model to generate audio from text. "
        "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
        "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed."
    ),
)
# Launch the web UI only when executed as a script (not when imported).
if __name__ == "__main__":
    iface.launch()