"""Gradio demo: Zonos TTS with speaker cloning from a reference clip."""

import torch
import torchaudio  # noqa: F401  (kept: may be required for torchaudio backend side effects)
import gradio as gr

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

# Load the model once at import time; it runs in bfloat16 on the GPU.
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")
model.bfloat16()


def tts(text, reference_audio):
    """Synthesize ``text`` in the voice of ``reference_audio``.

    Parameters
    ----------
    text : str
        The text to synthesize.
    reference_audio : tuple[int, numpy.ndarray] | None
        ``(sample_rate, samples)`` as produced by ``gr.Audio(type="numpy")``.
        ``samples`` is 1-D for mono or ``(num_samples, channels)`` for
        multi-channel audio.

    Returns
    -------
    tuple[int, numpy.ndarray] | None
        ``(sample_rate, waveform)`` for playback, or ``None`` when no
        reference audio was supplied.
    """
    if reference_audio is None:
        return None

    # Gradio returns (sample_rate, audio_data) for type="numpy".
    sr, wav_np = reference_audio

    # NOTE(review): Gradio commonly delivers int16 samples; ``.float()``
    # keeps the raw integer range. Confirm whether embed_spk_audio expects
    # [-1, 1]-normalized audio — if so, divide by 32768 here.
    wav_torch = torch.from_numpy(wav_np).float()

    if wav_torch.dim() == 1:
        # Mono 1-D samples -> (1, num_samples).
        wav_torch = wav_torch.unsqueeze(0)
    elif wav_torch.dim() == 2 and wav_torch.shape[0] > wav_torch.shape[1]:
        # Gradio gives (num_samples, channels); transpose to the
        # (channels, num_samples) layout torchaudio-style APIs expect.
        # (The original code unsqueezed first, producing a 3-D tensor for
        # stereo input and making this transpose unreachable.)
        wav_torch = wav_torch.T

    # Create the speaker embedding from the reference clip.
    spk_embedding = model.embed_spk_audio(wav_torch, sr)

    # Prepare conditioning for generation.
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate codes and decode them back to a waveform.
    with torch.no_grad():
        torch.manual_seed(421)  # fixed seed for reproducible output
        codes = model.generate(conditioning)
        wavs = model.autoencoder.decode(codes).cpu()

    # Return (sample_rate, audio_data) for Gradio playback.
    return (model.autoencoder.sampling_rate, wavs[0].numpy())


demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to Synthesize"),
        gr.Audio(type="numpy", label="Reference Audio (Speaker)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Zonos TTS Demo (Hybrid)",
    description="Upload a reference audio for speaker embedding, enter text, and generate speech!",
)

if __name__ == "__main__":
    demo.launch(debug=True)