# Kokoro TTS Gradio app: synthesizes speech from text and emits matching SRT subtitles.
import gradio as gr
import numpy as np
import torch
import soundfile as sf
from kokoro import KPipeline
import re
import traceback
# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``hh:mm:ss,mmm``.

    Works in integer milliseconds so float rounding can never yield an
    invalid seconds field: the previous ``seconds % 60`` + ``{:06.3f}``
    approach printed ``00:00:60,000`` for inputs like 59.9999; here the
    overflow carries correctly into the minutes field.
    """
    total_ms = int(round(seconds * 1000))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    # SRT uses a comma (not a dot) before the milliseconds.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize speech with Kokoro and build matching SRT subtitles.

    Returns a 3-tuple: ``((24000, samples) or None, srt_text, debug_log)``.
    On any failure the audio slot is ``None`` and the log explains why.
    """
    log = ["Starting Kokoro TTS generation..."]

    # --- Stage 1: build the TTS pipeline (runs on CPU when no GPU exists) ---
    log.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
    try:
        pipeline = KPipeline(lang_code=lang_code)
    except Exception as e:
        log.append(f"Error initializing pipeline: {str(e)}")
        return None, "", "\n".join(log)
    log.append("Pipeline initialized successfully.")

    # --- Stage 2: stream segments, collecting audio + subtitle entries ------
    chunks = []           # per-segment audio arrays
    subtitle_blocks = []  # per-segment SRT entry strings
    segment_notes = []    # per-segment debug details (shown only when debug=True)
    cursor = 0.0          # running time offset for subtitle timestamps

    try:
        log.append("Generating audio segments from input text...")
        # split_pattern is a regex controlling how the input text is segmented.
        stream = pipeline(text, voice=voice, speed=speed, split_pattern=split_pattern)
        for index, (gs, ps, audio) in enumerate(stream, start=1):
            duration = len(audio) / 24000.0  # pipeline emits 24 kHz samples
            start, end = cursor, cursor + duration
            subtitle_blocks.append(
                f"{index}\n{format_time(start)} --> {format_time(end)}\n{gs}\n"
            )
            segment_notes.append(
                f"Segment {index}: Duration = {duration:.3f}s, Graphemes = {gs}, Phonemes = {ps}"
            )
            chunks.append(audio)
            cursor = end
        log.append("Audio segments generated successfully.")
    except Exception as e:
        log.append(f"Error during audio generation: {str(e)}\n{traceback.format_exc()}")
        return None, "", "\n".join(log)

    # --- Stage 3: assemble final outputs ------------------------------------
    if not chunks:
        log.append("No audio segments were generated.")
        return None, "", "\n".join(log)
    full_audio = np.concatenate(chunks)

    srt_content = "\n".join(subtitle_blocks)
    debug_info = "\n".join((log + segment_notes) if debug else log)
    return (24000, full_audio), srt_content, debug_info
# Build the Gradio interface. The six inputs map positionally onto
# generate_audio(text, voice, speed, lang_code, split_pattern, debug),
# and the three outputs mirror its return tuple.
iface = gr.Interface(
    fn=generate_audio,
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=(
        "This app uses the Kokoro TTS model to generate audio from text. "
        "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
        "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed."
    ),
    inputs=[
        gr.Textbox(
            lines=10,
            label="Input Text",
            placeholder="Enter the text to be synthesized here...",
        ),
        gr.Textbox(value="af_heart", label="Voice (e.g., af_heart)"),
        gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.1, label="Speed"),
        gr.Textbox(
            value="a",
            label="Language Code",
            placeholder="Enter language code ('a' for American English, 'b' for British, etc.)",
        ),
        gr.Textbox(
            value=r'\n+',
            label="Split Pattern (Regex)",
            placeholder="Regex to split the input text (e.g., '\\n+')",
        ),
        gr.Checkbox(value=True, label="Enable Debug Mode"),
    ],
    outputs=[
        gr.Audio(type="numpy", label="Generated Audio"),
        gr.Textbox(label="Generated SRT"),
        gr.Textbox(lines=15, label="Debug Information"),
    ],
)
# Launch the web UI only when this file is run directly (not on import).
if __name__ == "__main__":
    iface.launch()