File size: 3,231 Bytes
f7e1683
ec75c79
 
1689d75
eceecf3
7c14ea7
eceecf3
1689d75
eceecf3
7d70e82
 
7c14ea7
7d70e82
eceecf3
 
ec75c79
 
3cccab6
ec75c79
 
eceecf3
3cccab6
ec75c79
 
3cccab6
eceecf3
ec75c79
 
 
1689d75
ec75c79
3cccab6
ec75c79
7c14ea7
ec75c79
1689d75
3cccab6
 
eceecf3
3cccab6
eceecf3
 
3cccab6
eceecf3
3cccab6
eceecf3
3cccab6
 
eceecf3
 
3cccab6
ec75c79
1689d75
 
 
 
ec75c79
eceecf3
1689d75
3cccab6
1689d75
 
 
7c14ea7
ec75c79
7c14ea7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import gradio as gr
import edge_tts
import asyncio
import tempfile
import nltk
import os
import srt
from pydub import AudioSegment, silence
import datetime
import nest_asyncio

# Sentence-tokenizer data for nltk.tokenize.sent_tokenize. Older NLTK releases
# load the "punkt" resource while newer ones look for "punkt_tab", so fetch
# both to keep tokenization working regardless of the installed version.
nltk.download("punkt")
nltk.download("punkt_tab")
# NOTE(review): presumably applied so the asyncio.run() calls in this module
# also work when an event loop is already running - confirm it is still needed.
nest_asyncio.apply()


async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to an MP3 file and build a matching subtitle file.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only input is
            rejected with a status message instead of raising.
        voice: Voice label; the part before the first ``" - "`` is passed to
            edge-tts as the voice name.
        rate: Speech-rate offset in percent. Rounded to a whole number.
        pitch: Pitch offset in Hz. Rounded to a whole number.

    Returns:
        A ``(audio_path, srt_path, message)`` tuple. When validation fails
        both paths are ``None`` and ``message`` says what is missing; on
        success ``message`` is the empty string.

    Raises:
        Whatever edge-tts or ``generate_srt`` raise. The temporary MP3 is
        removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, None, "Please enter some text."
    if not voice:
        return None, None, "Please select a voice."

    voice_short = voice.split(" - ")[0]
    # The "d" format code rejects floats, which UI sliders may deliver, so
    # coerce to whole numbers before formatting.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short, rate=rate_str, pitch=pitch_str)

    # Only the unique file name is needed; close our handle before edge-tts
    # opens the path itself (an open handle blocks a second open on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
        srt_path = generate_srt(tmp_path, text)
    except BaseException:
        # Covers cancellation too: never leave an orphaned temp file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    return tmp_path, srt_path, ""


def generate_srt(audio_path, text, fallback_duration=2.5):
    """Write an SRT file that lines up the sentences of *text* with pauses in the audio.

    Sentence ``i`` is shown from the end of the previous detected pause (or
    0 for the first sentence) until the start of pause ``i``. Sentences for
    which no pause was detected each get ``fallback_duration`` seconds and
    are placed back to back, so they never share the same time span.

    Args:
        audio_path: Path of the audio file to analyse.
        text: The text that was spoken; it is split into sentences with NLTK.
        fallback_duration: Seconds given to each sentence that has no
            matching pause.

    Returns:
        Path of the generated ``.srt`` temporary file. The file is created
        with ``delete=False``, so the caller is responsible for removing it.
    """
    audio = AudioSegment.from_file(audio_path)
    # Look for stretches of at least 400 units (treated as milliseconds
    # below) that are 16 dB quieter than the clip's overall dBFS level.
    detected = silence.detect_silence(audio, min_silence_len=400, silence_thresh=audio.dBFS - 16)
    pauses = [(begin / 1000.0, finish / 1000.0) for begin, finish in detected]

    sentences = nltk.tokenize.sent_tokenize(text)
    subtitles = []
    cursor = 0.0  # time at which the next sentence starts

    for position, sentence in enumerate(sentences):
        start = cursor
        if position < len(pauses):
            pause_start, pause_end = pauses[position]
            end = pause_start
            cursor = pause_end
        else:
            end = start + fallback_duration
            # Advance the cursor so later fallback subtitles do not overlap.
            cursor = end
        subtitles.append(srt.Subtitle(
            index=position + 1,
            start=datetime.timedelta(seconds=start),
            end=datetime.timedelta(seconds=end),
            content=sentence,
        ))

    srt_data = srt.compose(subtitles)
    # Explicit UTF-8 so non-ASCII text is written correctly on every platform.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".srt", mode="w", encoding="utf-8"
    ) as srt_file:
        srt_file.write(srt_data)
    return srt_file.name


async def tts_interface(text, voice, rate, pitch):
    """Forward the UI inputs to ``text_to_speech`` and return its result unchanged."""
    outcome = await text_to_speech(text, voice, rate, pitch)
    return outcome


# Build the UI at import time; coroutines are driven to completion with
# asyncio.run().
# Fetch the voice catalogue once and map each dropdown label to the voice's
# ShortName.
voices = asyncio.run(edge_tts.list_voices())
voice_dict = {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}

with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ Text-to-Speech + Subtitle Generator")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Input Text", lines=5)
            # The leading empty choice stands for "no voice selected".
            voice_dropdown = gr.Dropdown(choices=[""] + list(voice_dict), label="Select Voice")
            # step=1 keeps both sliders on whole numbers: the handler formats
            # these values with the integer-only "+d" format code.
            rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, step=1, label="Speech Rate (%)")
            pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, step=1, label="Pitch (Hz)")
            generate_btn = gr.Button("🎧 Generate Audio + SRT")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="filepath")
            srt_output = gr.File(label="Download Subtitle (.srt)")
            message_output = gr.Textbox(label="Status", interactive=False)

    # The handler is a plain function that runs the coroutine to completion,
    # returning the (audio path, srt path, status message) triple.
    generate_btn.click(
        fn=lambda text, voice, rate, pitch: asyncio.run(tts_interface(text, voice, rate, pitch)),
        inputs=[text_input, voice_dropdown, rate_slider, pitch_slider],
        outputs=[audio_output, srt_output, message_output],
    )