Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,414 Bytes
748ecaa d743fc1 748ecaa d743fc1 748ecaa d743fc1 748ecaa d743fc1 748ecaa d743fc1 748ecaa d743fc1 748ecaa d743fc1 c28b0ef d743fc1 748ecaa d743fc1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import torch
import torchaudio
import gradio as gr
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
# Load the hybrid Zonos model checkpoint directly onto the GPU.
# NOTE(review): this runs at import time, so merely importing this module
# downloads/loads the model and requires a CUDA device — confirm intended.
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")
model.bfloat16() # Switch model weights to bfloat16 precision (optional, but recommended for GPU)
# Main inference function for Gradio
def tts(text, reference_audio):
    """Synthesize `text` in the voice of `reference_audio` with Zonos.

    Parameters
    ----------
    text : str
        Text to synthesize.
    reference_audio : tuple[int, numpy.ndarray] | None
        Gradio ``type="numpy"`` audio input: ``(sample_rate, data)`` where
        ``data`` is int16 or float32 of shape ``(num_samples,)`` or
        ``(num_samples, num_channels)``.

    Returns
    -------
    tuple[int, numpy.ndarray]
        ``(sample_rate, waveform)`` for the ``gr.Audio`` output component.

    Raises
    ------
    gr.Error
        If no reference audio was provided.
    """
    if reference_audio is None:
        # BUG FIX: returning a plain string to a gr.Audio output component
        # fails downstream; gr.Error surfaces the message in the UI instead.
        raise gr.Error("No reference audio provided.")

    # BUG FIX: gr.Audio (numpy type) passes (sample_rate, data) — the same
    # order this function returns below — not (data, sample_rate) as the
    # original unpacking assumed.
    sr, wav_np = reference_audio

    wav_torch = torch.from_numpy(wav_np).float()
    # BUG FIX: Gradio delivers int16 PCM by default; rescale integer input
    # to [-1, 1] so the speaker encoder sees normalized audio.
    if wav_np.dtype.kind in "iu":
        wav_torch = wav_torch / float(2 ** (8 * wav_np.dtype.itemsize - 1))

    # Arrange as (channels, num_samples).
    # BUG FIX: the original unsqueezed *before* its dim()==2 check, so 2-D
    # (num_samples, channels) input became 3-D and the transpose branch was
    # unreachable.
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)  # (num_samples,) -> (1, num_samples)
    elif wav_torch.shape[0] > wav_torch.shape[1]:
        wav_torch = wav_torch.T  # (num_samples, channels) -> (channels, num_samples)

    # Create speaker embedding from the reference clip.
    spk_embedding = model.embed_spk_audio(wav_torch, sr)

    # Prepare conditioning (speaker cast to bf16 to match the model weights).
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate codes.
    with torch.no_grad():
        torch.manual_seed(421)  # Seeding for reproducible results
        codes = model.generate(conditioning)

    # Decode the codes into a waveform.
    wavs = model.autoencoder.decode(codes).cpu()
    out_audio = wavs[0].numpy()  # shape: (num_samples,)

    # Return as (sample_rate, audio_ndarray) for Gradio's "audio" output.
    return (model.autoencoder.sampling_rate, out_audio)
# Assemble the Gradio UI: a text prompt plus a speaker-reference clip in,
# synthesized speech out.
_DESCRIPTION = (
    "Provide a reference audio snippet for speaker embedding, "
    "enter text, and generate speech with Zonos TTS."
)

demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to Synthesize"),
        gr.Audio(label="Reference Audio (for speaker embedding)"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Zonos TTS Demo (Hybrid)",
    description=_DESCRIPTION,
)
if __name__ == "__main__":
    # Start the Gradio server; debug=True prints tracebacks to the console.
    demo.launch(debug=True)
|