import numpy as np
import torch
import gradio as gr

from zonos.model import Zonos
from zonos.conditioning import make_cond_dict
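
# Requires the zonos package from https://github.com/Zyphra/Zonos; note that
# Zonos also depends on the system espeak-ng binary for text normalization.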

# Load the hybrid model
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-hybrid", device="cuda")
model.bfloat16()  # Switch model weights to bfloat16 precision (optional, but recommended for GPU)
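# device="cuda" assumes a CUDA-capable GPU; from_pretrained should also accept
# device="cpu", though generation will be far slower.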

# Main inference function for Gradio
def tts(text, reference_audio):
    """
    text: str
    reference_audio: (int, numpy.ndarray) -> (sample_rate, data), as delivered
        by gr.Audio with its default type="numpy"
    """
    if reference_audio is None:
        raise gr.Error("No reference audio provided.")

    # Gradio passes (sample_rate, data); data has shape (num_samples,) for mono
    # or (num_samples, num_channels) otherwise, typically as int16 PCM
    sr, wav_np = reference_audio

    # Normalize integer PCM to float32 in [-1.0, 1.0]
    if np.issubdtype(wav_np.dtype, np.integer):
        wav_np = wav_np.astype(np.float32) / np.iinfo(wav_np.dtype).max

    # Convert to a Torch tensor of shape (1, num_samples), downmixing to mono
    wav_torch = torch.from_numpy(wav_np).float()
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)                  # (samples,) -> (1, samples)
    else:
        wav_torch = wav_torch.mean(dim=1, keepdim=True).T   # (samples, ch) -> (1, samples)
    
    # Create the speaker embedding from the reference clip
    spk_embedding = model.make_speaker_embedding(wav_torch, sr)
    
    # Prepare conditioning
    cond_dict = make_cond_dict(
        text=text,
        speaker=spk_embedding.to(torch.bfloat16),
        language="en-us",
    )
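    # language is an IETF-style tag; besides English, Zonos v0.1 also lists
    # Japanese, Chinese, French, and German among its supported languages.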
    conditioning = model.prepare_conditioning(cond_dict)
    
    # Generate the discrete audio codes
    with torch.no_grad():
        torch.manual_seed(421)  # Fixed seed for reproducible results
        codes = model.generate(conditioning)

    # Decode the codes back into a waveform; decode() returns a batch of
    # (channels, samples) tensors
    wavs = model.autoencoder.decode(codes).cpu()
    out_audio = wavs[0].squeeze(0).numpy()  # (1, num_samples) -> (num_samples,)

    # Return as (sample_rate, audio_ndarray) for Gradio's "audio" output
    return (model.autoencoder.sampling_rate, out_audio)


# Define the Gradio interface
# - text input for the prompt
# - audio input for the speaker reference
# - audio output with the generated speech
demo = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(label="Text to Synthesize"),
        gr.Audio(label="Reference Audio (for speaker embedding)", type="numpy"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Zonos TTS Demo (Hybrid)",
    description=(
        "Provide a reference audio snippet for speaker embedding, "
        "enter text, and generate speech with Zonos TTS."
    ),
)

if __name__ == "__main__":
    demo.launch(debug=True)
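
# Standalone usage sketch (file names are hypothetical): tts() can also be
# called directly with a (sample_rate, data) tuple, e.g.:
#
#   from scipy.io import wavfile
#   sr_in, ref = wavfile.read("reference.wav")
#   sr_out, audio = tts("Hello from Zonos!", (sr_in, ref))
#   wavfile.write("output.wav", sr_out, audio)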