# kokoro-test / app.py — Kokoro TTS Gradio demo
# (Hugging Face Space file; commit b7ec89c, 4.76 kB)
import gradio as gr
import numpy as np
import torch
import soundfile as sf
from kokoro import KPipeline
import re
import traceback
# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    """Format a duration in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    Works in integer milliseconds so that rounding can never produce an
    invalid seconds field: the previous float formatting turned e.g.
    59.9996 into "00:00:60,000"; here the carry propagates correctly to
    "00:01:00,000".
    """
    total_ms = round(seconds * 1000)
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, ms = divmod(rem, 1000)
    # SRT uses a comma (not a dot) before the milliseconds field.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{ms:03d}"
def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize speech from *text* with the Kokoro TTS pipeline.

    Parameters
    ----------
    text : str
        Text to synthesize.
    voice : str
        Kokoro voice identifier (e.g. ``"af_heart"``).
    speed : float
        Speech-rate multiplier, forwarded to the pipeline.
    lang_code : str
        Kokoro language code (``'a'`` American English, ``'b'`` British, ...).
    split_pattern : str
        Regex the pipeline uses to split *text* into segments.
    debug : bool
        When True, per-segment grapheme/phoneme details are appended
        to the returned debug log.

    Returns
    -------
    tuple
        ``((sample_rate, waveform), srt_text, debug_log)`` on success,
        or ``(None, "", debug_log)`` on failure.
    """
    sample_rate = 24000  # Kokoro emits 24 kHz audio; also returned to gr.Audio
    debug_logs = []
    debug_logs.append("Starting Kokoro TTS generation...")

    # Cache pipelines across calls, keyed by language code: constructing
    # KPipeline loads the model, which is far too expensive to repeat on
    # every request. Stored on the function to avoid extra module state.
    cache = getattr(generate_audio, "_pipeline_cache", None)
    if cache is None:
        cache = generate_audio._pipeline_cache = {}
    try:
        debug_logs.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
        # Initialize the pipeline; by default, it will run on CPU if no GPU is available.
        if lang_code not in cache:
            cache[lang_code] = KPipeline(lang_code=lang_code)
        pipeline = cache[lang_code]
        debug_logs.append("Pipeline initialized successfully.")
    except Exception as e:
        error_msg = f"Error initializing pipeline: {str(e)}"
        debug_logs.append(error_msg)
        return None, "", "\n".join(debug_logs)

    # Prepare lists for audio segments, SRT entries, and segment-level debug info.
    audio_segments = []   # raw waveform arrays, one per text segment
    srt_entries = []      # formatted SRT cues
    current_time = 0.0    # cumulative time for SRT timestamps
    segment_index = 1
    segment_debug_info = []

    try:
        debug_logs.append("Generating audio segments from input text...")
        # The pipeline yields (graphemes, phonemes, audio) per segment;
        # split_pattern (a regex) defines how the text is segmented.
        generator = pipeline(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern,
        )
        for gs, ps, audio in generator:
            duration = len(audio) / sample_rate
            start_timestamp = current_time
            end_timestamp = current_time + duration
            # Create an SRT cue for the segment: index, time range, text.
            srt_entry = f"{segment_index}\n{format_time(start_timestamp)} --> {format_time(end_timestamp)}\n{gs}\n"
            srt_entries.append(srt_entry)
            current_time = end_timestamp
            # Record segment details for debugging.
            segment_debug_info.append(f"Segment {segment_index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}")
            audio_segments.append(audio)
            segment_index += 1
        debug_logs.append("Audio segments generated successfully.")
    except Exception as e:
        error_msg = f"Error during audio generation: {str(e)}\n{traceback.format_exc()}"
        debug_logs.append(error_msg)
        return None, "", "\n".join(debug_logs)

    if not audio_segments:
        debug_logs.append("No audio segments were generated.")
        return None, "", "\n".join(debug_logs)

    # Concatenate all generated segments into a single waveform.
    full_audio = np.concatenate(audio_segments)

    # Each cue already ends in '\n'; joining with '\n' yields the blank
    # line the SRT format requires between cues.
    srt_content = "\n".join(srt_entries)

    # Combine all debug logs (with optional segment details).
    if debug:
        debug_info = "\n".join(debug_logs + segment_debug_info)
    else:
        debug_info = "\n".join(debug_logs)

    # Return a tuple: audio (with sample rate), the SRT text, and the debug log.
    return (sample_rate, full_audio), srt_content, debug_info
# Build the Gradio interface.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        # Positional order must match generate_audio's signature:
        # (text, voice, speed, lang_code, split_pattern, debug).
        gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
        gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
        gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
        gr.Textbox(label="Language Code", value="a",
                   placeholder="Enter language code ('a' for American English, 'b' for British, etc.)"),
        gr.Textbox(label="Split Pattern (Regex)", value=r'\n+',
                   placeholder="Regex to split the input text (e.g., '\\n+')"),
        gr.Checkbox(label="Enable Debug Mode", value=True)
    ],
    outputs=[
        # Mirrors generate_audio's return tuple:
        # (sample_rate, waveform) for gr.Audio, then SRT text, then debug log.
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Textbox(label="Generated SRT"),
        gr.Textbox(label="Debug Information", lines=15)
    ],
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=("This app uses the Kokoro TTS model to generate audio from text. "
                 "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
                 "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed.")
)

# Start the web server only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()