# Hugging Face Space — status: Running on Zero
import torch | |
import torchaudio | |
import gradio as gr | |
from zonos.model import Zonos | |
from zonos.conditioning import make_cond_dict | |
# Checkpoint identifier and target device for the single, module-wide model
# instance that `tts` uses on every request.
_CHECKPOINT = "Zyphra/Zonos-v0.1-hybrid"
_DEVICE = "cuda"

model = Zonos.from_pretrained(_CHECKPOINT, device=_DEVICE)
# Switch the model to bfloat16; `tts` casts the speaker embedding to match.
model.bfloat16()
def tts(text, reference_audio):
    """Synthesize `text` in the voice of the speaker in `reference_audio`.

    Args:
        text: The text to speak.
        reference_audio: ``(sample_rate, samples)`` tuple as delivered by a
            ``gr.Audio(type="numpy")`` input, or ``None`` when no clip was
            provided. ``samples`` is expected to be ``(n,)`` for mono or
            ``(n, channels)`` for multi-channel audio.

    Returns:
        ``(sample_rate, samples)`` suitable for a ``gr.Audio`` output, or
        ``None`` when the reference clip or the text is missing/blank.
    """
    # Nothing to synthesize without both a voice sample and some text.
    if reference_audio is None or not text or not text.strip():
        return None

    # Gradio returns (sample_rate, audio_data) for type="numpy".
    sr, wav_np = reference_audio
    wav_torch = torch.from_numpy(wav_np)

    # Integer PCM (Gradio's usual int16) must be scaled to [-1, 1]; a bare
    # .float() cast would keep the raw integer magnitudes.
    if wav_torch.is_floating_point():
        wav_torch = wav_torch.float()
    else:
        pcm_peak = torch.iinfo(wav_torch.dtype).max
        wav_torch = wav_torch.float() / pcm_peak

    # Bring the clip to channel-first 2-D layout. The layout must be decided
    # *before* adding an axis: unsqueezing first turns stereo (n, channels)
    # into a 3-D tensor that a later 2-D transpose check can never catch.
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)  # (n,) -> (1, n)
    else:
        wav_torch = wav_torch.T  # (n, channels) -> (channels, n)

    # Create speaker embedding.
    spk_embedding = model.embed_spk_audio(wav_torch, sr)

    # Prepare conditioning; the embedding is cast to bfloat16 to match the
    # dtype the model was switched to at load time.
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate codes & decode. The fixed seed keeps the output reproducible
    # for identical inputs.
    with torch.no_grad():
        torch.manual_seed(421)
        codes = model.generate(conditioning)
        wavs = model.autoencoder.decode(codes).cpu()

    # NumPy has no bfloat16, so force float32 before converting.
    out_audio = wavs[0].float()
    # NOTE(review): assumes decode() yields (batch, channels, samples), as the
    # wavs[0] indexing suggests — confirm. Gradio wants (samples,) or
    # (samples, channels), so flip to sample-first and drop a lone channel axis.
    if out_audio.dim() == 2:
        out_audio = out_audio.T.squeeze(-1)

    # Return a tuple of (sample_rate, audio_data) for playback.
    return (model.autoencoder.sampling_rate, out_audio.numpy())
# UI components, created in the order they appear in the interface: the text
# prompt, the speaker reference clip (handed to `tts` as a
# (sample_rate, ndarray) tuple because of type="numpy"), and the audio player
# for the result.
_text_input = gr.Textbox(label="Text to Synthesize")
_speaker_input = gr.Audio(type="numpy", label="Reference Audio (Speaker)")
_speech_output = gr.Audio(label="Generated Audio")

# Wire the components to `tts`: inputs map positionally onto its
# (text, reference_audio) parameters.
demo = gr.Interface(
    fn=tts,
    inputs=[_text_input, _speaker_input],
    outputs=_speech_output,
    title="Zonos TTS Demo (Hybrid)",
    description="Upload a reference audio for speaker embedding, enter text, and generate speech!",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)