import os
import time

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
# Checkpoint loaded once at import time and shared by every request.
# NOTE(review): AutoModelForSpeechSeq2Seq is the Transformers auto class for
# speech-to-text seq2seq models; confirm this repo id is a model checkpoint
# that loads through it and actually produces audio.
model_name = "hexgrad/Kokoro-TTS"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

# Labels offered in the speaker dropdown; the selected label is passed to the
# processor unchanged.
# NOTE(review): presumably these must match speaker ids the processor
# understands -- verify against the checkpoint.
speakers = ["Speaker 1", "Speaker 2", "Speaker 3"]
def generate_tts(text, speaker):
    """Synthesize ``text`` with the module-level model and save it as a WAV file.

    Args:
        text: Text to synthesize.
        speaker: Speaker label, forwarded to the processor as ``speaker=``.

    Returns:
        The relative path of the written ``output_<timestamp>.wav`` file on
        success, otherwise a human-readable error message. Callers tell the
        two apart by the ``.wav`` suffix.
    """
    # Reject blank input up front instead of sending it through the model.
    if not text or not text.strip():
        return "Please enter some text to synthesize."

    try:
        inputs = processor(text, return_tensors="pt", speaker=speaker)

        with torch.no_grad():
            speech = model.generate(**inputs)

        # soundfile treats 2-D data as (frames, channels), so a leading batch
        # dimension of 1 must be dropped; .cpu() keeps this working if the
        # model is ever moved to an accelerator.
        # NOTE(review): assumes generate() returns a waveform tensor -- confirm.
        waveform = speech.detach().squeeze().cpu().numpy()

        # Nanosecond resolution so two requests arriving within the same
        # second do not overwrite each other's file.
        output_file = f"output_{time.time_ns()}.wav"

        # NOTE(review): 22050 Hz is hard-coded; confirm it matches the rate
        # the checkpoint actually produces.
        sf.write(output_file, waveform, samplerate=22050)

        return output_file
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the
        # request. Some exceptions carry no message, so fall back to the
        # exception type name to avoid an empty status.
        return str(e) or type(e).__name__
def tts_app(text, speaker):
    """Gradio callback: run TTS and map the result onto the two UI outputs.

    Args:
        text: Text entered by the user.
        speaker: Speaker label chosen in the dropdown.

    Returns:
        ``(audio_path, status_message)``. ``audio_path`` is ``None`` when
        generation failed, in which case ``status_message`` is the error text.
    """
    result = generate_tts(text, speaker)

    # generate_tts reports failures as plain text, so the suffix alone is not
    # proof of success (an error message may end in ".wav"); also require the
    # file to exist.
    if result.endswith(".wav") and os.path.isfile(result):
        return result, f"Generated: {result}"

    return None, result
def get_download_name(directory="."):
    """Return the path of the most recently generated TTS file, if any.

    The result feeds a ``gr.File`` output, so it has to name a file that
    really exists; an invented file name cannot be served. Generated files
    are named ``output_<digits>.wav``, and the one with the largest numeric
    stamp is treated as the newest.

    Args:
        directory: Folder to scan. Defaults to the current working directory,
            which is where the generated files are written.

    Returns:
        The path of the newest matching file, or ``None`` when there is none
        (or the directory cannot be read).

    Note:
        The choice is global to the process: with several concurrent users
        this can return another user's latest file.
    """
    prefix, suffix = "output_", ".wav"

    try:
        entries = os.listdir(directory)
    except OSError:
        return None

    latest_name, latest_stamp = None, -1
    for entry in entries:
        if not (entry.startswith(prefix) and entry.endswith(suffix)):
            continue
        stamp = entry[len(prefix):-len(suffix)]
        # Compare stamps numerically: as text, "output_5" sorts after "output_12".
        if not stamp.isdecimal():
            continue
        value = int(stamp)
        if value > latest_stamp and os.path.isfile(os.path.join(directory, entry)):
            latest_name, latest_stamp = entry, value

    if latest_name is None:
        return None
    # normpath turns "./output_1.wav" back into the plain relative name.
    return os.path.normpath(os.path.join(directory, latest_name))
# UI definition: text and speaker inputs, a generate button, the audio/status
# outputs, and a separate download button with its file output.
with gr.Blocks() as demo:
    gr.Markdown("# Kokoro-TTS v1.9: Long Input TTS Generation")

    with gr.Row():
        text_input = gr.Textbox(label="Input Text", placeholder="Enter your text here...", lines=10)
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=speakers, value=speakers[0])

    generate_button = gr.Button("Generate TTS")

    with gr.Row():
        audio_output = gr.Audio(label="Generated Audio")
        status_output = gr.Textbox(label="Status", placeholder="Generation status will appear here...")

    download_button = gr.Button("Download Audio")
    download_output = gr.File(label="Download Generated Audio")

    # Event listeners are registered inside the Blocks context.
    generate_button.click(
        fn=tts_app,
        inputs=[text_input, speaker_dropdown],
        outputs=[audio_output, status_output],
    )

    download_button.click(
        fn=get_download_name,
        outputs=download_output,
    )

# Start the web server when the module is executed.
demo.launch()