File size: 4,755 Bytes
b7ec89c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import numpy as np
import torch
import soundfile as sf
from kokoro import KPipeline
import re
import traceback

# Helper: Format seconds into SRT timestamp (hh:mm:ss,ms)
def format_time(seconds):
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = seconds % 60
    # Ensure milliseconds are comma separated
    return f"{hours:02d}:{minutes:02d}:{secs:06.3f}".replace('.', ',')

def generate_audio(text, voice, speed, lang_code, split_pattern, debug):
    """Synthesize *text* with Kokoro TTS and build a matching SRT track.

    Args:
        text: Input text to synthesize.
        voice: Kokoro voice identifier (e.g. "af_heart").
        speed: Playback speed multiplier passed to the pipeline.
        lang_code: Kokoro language code ('a' = American English, etc.).
        split_pattern: Regex used by the pipeline to split the text
            into segments.
        debug: When True, per-segment details are appended to the log.

    Returns:
        A 3-tuple ``((sample_rate, waveform), srt_text, debug_log)``.
        On any failure the first element is ``None`` and the SRT text
        is empty; the debug log always explains what happened.
    """
    log_lines = ["Starting Kokoro TTS generation..."]

    log_lines.append(f"Initializing Kokoro pipeline with lang_code: '{lang_code}' (CPU mode assumed)")
    # Initialize the pipeline; by default, it will run on CPU if no GPU is available.
    try:
        tts = KPipeline(lang_code=lang_code)
    except Exception as exc:
        log_lines.append(f"Error initializing pipeline: {str(exc)}")
        return None, "", "\n".join(log_lines)
    log_lines.append("Pipeline initialized successfully.")

    chunks = []      # raw waveforms, one per text segment
    captions = []    # formatted SRT entries, one per segment
    seg_notes = []   # per-segment debug details (only shown when debug=True)
    elapsed = 0.0    # cumulative end time used for SRT timestamps

    log_lines.append("Generating audio segments from input text...")
    try:
        # The split_pattern regex controls how the input text is segmented.
        stream = tts(
            text,
            voice=voice,
            speed=speed,
            split_pattern=split_pattern
        )

        for idx, (gs, ps, audio) in enumerate(stream, start=1):
            seg_len = len(audio) / 24000.0  # assuming a sample rate of 24000 Hz
            start, end = elapsed, elapsed + seg_len
            captions.append(
                f"{idx}\n{format_time(start)} --> {format_time(end)}\n{gs}\n"
            )
            seg_notes.append(
                f"Segment {idx}: Duration = {seg_len:.3f}s, Graphemes = {gs}, Phonemes = {ps}"
            )
            chunks.append(audio)
            elapsed = end
    except Exception as exc:
        log_lines.append(f"Error during audio generation: {str(exc)}\n{traceback.format_exc()}")
        return None, "", "\n".join(log_lines)
    log_lines.append("Audio segments generated successfully.")

    # Nothing produced (e.g. empty input after splitting) is not an error,
    # but there is no audio to return either.
    if not chunks:
        log_lines.append("No audio segments were generated.")
        return None, "", "\n".join(log_lines)

    # Stitch all segments into one continuous waveform.
    full_audio = np.concatenate(chunks)
    srt_content = "\n".join(captions)

    report = log_lines + seg_notes if debug else log_lines
    return (24000, full_audio), srt_content, "\n".join(report)

# Build the Gradio interface.
# Build the Gradio interface.
# Inputs map positionally onto generate_audio(text, voice, speed,
# lang_code, split_pattern, debug); outputs map onto its returned
# (audio, srt, debug_log) tuple.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Input Text", lines=10, placeholder="Enter the text to be synthesized here..."),
        gr.Textbox(label="Voice (e.g., af_heart)", value="af_heart"),
        gr.Slider(label="Speed", minimum=0.5, maximum=2.0, step=0.1, value=1.0),
        gr.Textbox(label="Language Code", value="a", 
                   placeholder="Enter language code ('a' for American English, 'b' for British, etc.)"),
        gr.Textbox(label="Split Pattern (Regex)", value=r'\n+',
                   placeholder="Regex to split the input text (e.g., '\\n+')"),
        gr.Checkbox(label="Enable Debug Mode", value=True)
    ],
    outputs=[
        # type="numpy" matches the (sample_rate, ndarray) tuple the
        # function returns for its audio output.
        gr.Audio(label="Generated Audio", type="numpy"),
        gr.Textbox(label="Generated SRT"),
        gr.Textbox(label="Debug Information", lines=15)
    ],
    title="Kokoro TTS Gradio App (CPU Mode)",
    description=("This app uses the Kokoro TTS model to generate audio from text. "
                 "You can tweak parameters such as voice, speed, language code, and the text split pattern. "
                 "When debug mode is enabled, detailed processing steps (including grapheme and phoneme outputs) are displayed.")
)

# Launch the web UI only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()