# orb-audio / app.py
# hivecorp's picture
# Update app.py
# 81b3ec7 verified
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
import soundfile as sf
import os
import time
# Load the Kokoro-TTS model and processor.
# Both are created once, at import time, as module-level globals so that every
# request handled by the app reuses the same objects.
# NOTE(review): confirm that this checkpoint id exists and can be loaded through
# AutoModelForSpeechSeq2Seq / AutoProcessor — not verifiable from this file.
model_name = "hexgrad/Kokoro-TTS"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)
# Speaker labels offered in the UI dropdown and forwarded to the processor as
# its `speaker` argument. These are placeholders: update them to the speaker
# identifiers the model actually supports.
speakers = ["Speaker 1", "Speaker 2", "Speaker 3"] # Replace with actual speaker names
# Function to generate TTS
def generate_tts(text, speaker, sample_rate=22050):
    """Synthesize ``text`` with ``speaker`` and save the result as a WAV file.

    Args:
        text: Text to convert to speech. ``None``, empty, or whitespace-only
            input is rejected without invoking the model.
        speaker: Speaker label forwarded to the processor as ``speaker``.
        sample_rate: Sample rate in Hz written to the WAV file. Defaults to
            22050. NOTE(review): confirm this matches the model's output rate.

    Returns:
        The path of the generated ``.wav`` file on success. On failure, an
        error message string; the message is guaranteed not to end in
        ``.wav`` so callers that distinguish the two cases by suffix cannot
        mistake an error for a path.
    """
    import tempfile

    if text is None or not str(text).strip():
        return "Please enter some text to synthesize."

    output_file = None
    try:
        # Preprocess input text
        inputs = processor(text, return_tensors="pt", speaker=speaker)

        # Generate speech without tracking gradients (inference only)
        with torch.no_grad():
            speech = model.generate(**inputs)

        # assumes model.generate returns a torch tensor — TODO confirm.
        # detach()/cpu() make .numpy() valid regardless of device or grad
        # state; squeeze() drops singleton dimensions such as a leading batch
        # axis, which would otherwise be written as audio channels.
        waveform = speech.detach().cpu().squeeze().numpy()

        # Reserve a unique file name. A bare timestamp is not unique when two
        # requests arrive within the same second, so let tempfile add a random
        # component and place the file in the system temp directory.
        timestamp = int(time.time())
        with tempfile.NamedTemporaryFile(
            prefix=f"output_{timestamp}_", suffix=".wav", delete=False
        ) as handle:
            output_file = handle.name

        sf.write(output_file, waveform, samplerate=sample_rate)
        return output_file
    except Exception as exc:
        # Do not leave an empty or partial file behind after a failure.
        if output_file is not None and os.path.exists(output_file):
            try:
                os.remove(output_file)
            except OSError:
                pass
        message = f"{type(exc).__name__}: {exc}"
        # Keep the error distinguishable from a successful ".wav" path.
        if message.endswith(".wav"):
            message += "."
        return message
# Gradio interface
def tts_app(text, speaker):
    """Run TTS for the UI and return ``(audio_path, status_message)``.

    ``generate_tts`` returns either the path of the written ``.wav`` file or
    an error message. The result is treated as audio only when it both ends in
    ``.wav`` and names an existing file, so an error message that happens to
    end in ``.wav`` is shown as status text instead of being handed to the
    audio widget as a path.

    Args:
        text: Text entered by the user.
        speaker: Speaker label selected by the user.

    Returns:
        ``(path, "Generated: <path>")`` on success, or ``(None, message)``
        when generation failed.
    """
    result = generate_tts(text, speaker)
    if result.endswith(".wav") and os.path.isfile(result):
        return result, f"Generated: {result}"
    return None, result
# Auto-naming system for downloads
def get_download_name():
    """Return a download file name of the form ``tts_output_<epoch-seconds>.wav``."""
    epoch_seconds = int(time.time())
    return "tts_output_" + str(epoch_seconds) + ".wav"
# Create the Gradio app
def _prepare_download(audio_path):
    """Copy the generated audio to a freshly named file for the download widget.

    Args:
        audio_path: Path of the audio currently held by the audio component,
            or ``None``/empty when nothing has been generated yet.

    Returns:
        Path of a copy named by ``get_download_name()``, or ``None`` when
        there is no audio to download.
    """
    import shutil
    import tempfile

    if not audio_path:
        return None
    # A bare file name is not enough: the File component needs a path to a
    # file that actually exists, so copy the audio under the download name.
    target = os.path.join(tempfile.mkdtemp(), get_download_name())
    shutil.copyfile(audio_path, target)
    return target


# NOTE(review): the original indentation was lost; which components sit inside
# each Row below is reconstructed and should be confirmed.
with gr.Blocks() as demo:
    gr.Markdown("# Kokoro-TTS v1.9: Long Input TTS Generation")
    with gr.Row():
        text_input = gr.Textbox(label="Input Text", placeholder="Enter your text here...", lines=10)
        speaker_dropdown = gr.Dropdown(label="Select Speaker", choices=speakers, value=speakers[0])
    generate_button = gr.Button("Generate TTS")
    with gr.Row():
        # type="filepath" makes the component hand its value to event
        # handlers as a file path, which the download handler needs.
        audio_output = gr.Audio(label="Generated Audio", type="filepath")
        status_output = gr.Textbox(label="Status", placeholder="Generation status will appear here...")
    download_button = gr.Button("Download Audio")
    download_output = gr.File(label="Download Generated Audio")

    # Link functions to interface
    generate_button.click(
        fn=tts_app,
        inputs=[text_input, speaker_dropdown],
        outputs=[audio_output, status_output],
    )
    # Previously this returned only a generated file name that was never
    # written to disk; it now serves the audio that was actually generated.
    download_button.click(
        fn=_prepare_download,
        inputs=audio_output,
        outputs=download_output,
    )

# Launch the app
demo.launch()