# Zonos / app.py — Hugging Face Space demo.
# (Scrape residue removed: originally HF file-viewer chrome — author
# "Steveeeeeeen", commit b1f1246 verified, file size 1.69 kB.)
import torch
import torchaudio
import gradio as gr
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
# Load the hybrid Zonos v0.1 checkpoint onto the GPU once at import time
# (this module is a HF Space; the Space provides a CUDA device).
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")
# Cast all model weights to bfloat16 to roughly halve GPU memory use;
# inputs fed to the model (e.g. the speaker embedding) must match this dtype.
model.bfloat16()
def tts(text, reference_audio):
    """Synthesize `text` in the voice of `reference_audio`.

    Parameters
    ----------
    text : str
        Text to synthesize (conditioned as "en-us").
    reference_audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` payload: ``(sample_rate, samples)``.
        Samples may be integer PCM (Gradio's default for uploads) or
        float, mono ``(n,)`` or multi-channel ``(n, channels)``.

    Returns
    -------
    tuple[int, numpy.ndarray] | None
        ``(sample_rate, audio)`` for Gradio playback, or ``None`` when
        no reference audio was supplied.
    """
    if reference_audio is None:
        return None
    # Gradio returns (sample_rate, audio_data) for type="numpy".
    sr, wav_np = reference_audio
    wav_torch = torch.from_numpy(wav_np)
    if not wav_torch.is_floating_point():
        # Uploaded files arrive as integer PCM (typically int16); scale
        # to [-1, 1] so the speaker encoder sees normalized audio.
        # (Original fed raw +/-32767 magnitudes straight through.)
        wav_torch = wav_torch.float() / torch.iinfo(wav_np_dtype_info(wav_np)).max if False else wav_torch.float() / torch.iinfo(torch.from_numpy(wav_np).dtype).max
    else:
        wav_torch = wav_torch.float()
    # Normalize layout to (channels, samples) BEFORE adding any leading
    # dim. (The original unsqueezed first, so a stereo (n, ch) input
    # became 3-D and its transpose check never fired.)
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)          # mono -> (1, samples)
    elif wav_torch.dim() == 2 and wav_torch.shape[0] > wav_torch.shape[1]:
        wav_torch = wav_torch.T                     # (samples, ch) -> (ch, samples)
    # Create speaker embedding from the reference clip.
    spk_embedding = model.embed_spk_audio(wav_torch, sr)
    # Prepare conditioning; embedding cast to bfloat16 to match weights.
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)
    # Generate codes & decode. Fixed seed keeps output reproducible.
    with torch.no_grad():
        torch.manual_seed(421)
        codes = model.generate(conditioning)
        wavs = model.autoencoder.decode(codes).cpu()
    out_audio = wavs[0].numpy()
    # Return a tuple of (sample_rate, audio_data) for playback.
    return (model.autoencoder.sampling_rate, out_audio)
# Gradio UI wiring: one text box plus one reference-audio upload in,
# synthesized audio out, all handled by tts().
_text_box = gr.Textbox(label="Text to Synthesize")
_speaker_clip = gr.Audio(type="numpy", label="Reference Audio (Speaker)")
_output_audio = gr.Audio(label="Generated Audio")

demo = gr.Interface(
    fn=tts,
    inputs=[_text_box, _speaker_clip],
    outputs=_output_audio,
    title="Zonos TTS Demo (Hybrid)",
    description="Upload a reference audio for speaker embedding, enter text, and generate speech!",
)
# Launch the Gradio server when run as a script; debug=True surfaces
# tracebacks and verbose logs in the Space console.
if __name__ == "__main__":
    demo.launch(debug=True)