import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import tempfile
import spaces
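
# Maya1 TTS demo for Hugging Face Spaces (ZeroGPU): the causal LM emits
# interleaved SNAC codec tokens, which are de-interleaved below and decoded
# into a 24 kHz waveform by the SNAC decoder.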

# --- global handles (lazy-loaded) ---
model = None
tokenizer = None
snac_model = None

def load_models(device: str):
    """Load Maya1 and SNAC once, with device-aware dtype."""
    global model, tokenizer, snac_model

    if tokenizer is None or model is None:
        dtype = torch.bfloat16 if device == "cuda" else torch.float32
        print(f"[load_models] loading Maya1 (dtype={dtype}, device={device})")

        # device_map only on CUDA; on CPU keep None to avoid accelerate errors
        device_map = "auto" if device == "cuda" else None

        model = AutoModelForCausalLM.from_pretrained(
            "maya-research/maya1",
            torch_dtype=dtype,
            device_map=device_map,
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            "maya-research/maya1",
            trust_remote_code=True,
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

    if snac_model is None:
        print("[load_models] loading SNAC 24kHz decoder")
        # keep on CPU here; moved to the active device inside the handler,
        # after the ZeroGPU allocation has actually happened
        snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()

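# ZeroGPU attaches a GPU only while this function runs (here up to 180 s),
# so all CUDA work, including moving models to the device, happens inside it.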
@spaces.GPU(duration=180)
def generate_speech(text, voice_description, temperature, top_p, max_tokens):
    if not text.strip():
        raise gr.Error("Enter some text.")
    if not voice_description.strip():
        voice_description = "Realistic voice with neutral tone and conversational pacing."

    # ZeroGPU gives us CUDA during this call
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # load / ensure models exist (no-op after the first call)
    load_models(device)

    # move models to the active device (ZeroGPU alloc happened)
    if device == "cuda":
        model.to(device)
        snac_model.to(device)

    # prompt exactly like the model card
    prompt = f'<description="{voice_description}"> {text}'
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
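            # eos_token_id=None disables EOS stopping, so the model keeps
            # emitting audio tokens until max_new_tokens is reached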
            eos_token_id=None,
            repetition_penalty=1.1,
        )

    # SNAC token extraction (7-token frames) — as per model card
    generated_ids = outputs[0, inputs["input_ids"].shape[1]:]
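    # Maya1's audio tokens occupy IDs 128266..156937 (offset 128266 plus
    # 7 codebook slots x 4096 codes per slot); everything else is text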
    snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
    frames = len(snac_tokens) // 7
    if frames == 0:
        raise gr.Error("No SNAC tokens generated. Try longer text and max_tokens=1200–1500.")

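    # De-interleave into SNAC's three codebook levels. Each 7-token frame is
    # laid out [L0, L1, L2, L2, L1, L2, L2]: one coarse code, two mid codes,
    # and four fine codes per frame (temporal rates 1:2:4).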
    codes = [[], [], []]
    for i in range(frames):
        s = snac_tokens[i*7:(i+1)*7]
        codes[0].append((s[0]-128266) % 4096)
        codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
        codes[2].extend([
            (s[2]-128266) % 4096,
            (s[3]-128266) % 4096,
            (s[5]-128266) % 4096,
            (s[6]-128266) % 4096,
        ])

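    # SNAC's decoder expects one [1, T] LongTensor per codebook level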
    codes_tensor = [torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0) for c in codes]
    with torch.inference_mode():
        audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()

    # write to wav; return filepath for gr.Audio(type="filepath")
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        sf.write(f.name, audio, 24000)
        return f.name

# ------------------- UI -------------------
voice_presets = {
    "Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
    "Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
    "Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
    "Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
    "Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
    "Custom": ""
}

def update_voice_description(preset):
    return voice_presets.get(preset, "")

with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
    gr.HTML("""
        <div style="text-align:center;padding:16px">
            <h1>🎙️ Maya1 Text-to-Speech</h1>
            <p style="color:#666">Generate emotional, realistic speech with natural-language voice design</p>
            <p style="font-size:12px;color:#28a745">⚡ ZeroGPU inference</p>
        </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
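            # inline tags like <laugh> are emotion cues that Maya1 renders as audio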
            text_input = gr.Textbox(
                label="Text to Speak",
                value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
                lines=5,
            )
            voice_preset = gr.Dropdown(choices=list(voice_presets.keys()),
                                       value="Male - American",
                                       label="Voice Preset")
            voice_description = gr.Textbox(
                label="Voice Description",
                value=voice_presets["Male - American"],
                lines=3,
            )
            with gr.Accordion("Advanced", open=False):
                temperature = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
                max_tokens = gr.Slider(500, 2000, value=1000, step=100, label="Max tokens")

            generate_btn = gr.Button("🎤 Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Speech", type="filepath", interactive=False)

    voice_preset.change(fn=update_voice_description, inputs=[voice_preset], outputs=[voice_description])
    generate_btn.click(fn=generate_speech,
                       inputs=[text_input, voice_description, temperature, top_p, max_tokens],
                       outputs=[audio_output])

# Register an explicit API endpoint so Spaces never shows "No API found".
# gr.api must be called inside a Blocks context, hence the re-entered demo.
with demo:
    gr.api(generate_speech, api_name="generate_speech")
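
# Example client call (a sketch; "user/maya1-tts" is a hypothetical Space ID):
#
#   from gradio_client import Client
#   client = Client("user/maya1-tts")
#   wav_path = client.predict(
#       "Hello there!",                       # text
#       "Deep male voice with calm pacing.",  # voice_description
#       0.7, 0.9, 1000,                       # temperature, top_p, max_tokens
#       api_name="/generate_speech",
#   )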

if __name__ == "__main__":
    demo.queue()
    demo.launch(show_error=True)