import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC
import soundfile as sf
import tempfile
import spaces
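# `spaces` provides the @spaces.GPU decorator used on the handler below, so ZeroGPU
# only allocates a GPU for the duration of a generation call.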
# --- global handles (lazy-loaded) ---
model = None
tokenizer = None
snac_model = None
def load_models(device: str):
"""Load Maya1 and SNAC once, with device-aware dtype."""
global model, tokenizer, snac_model
if tokenizer is None or model is None:
dtype = torch.bfloat16 if device == "cuda" else torch.float32
print(f"[load_models] loading Maya1 (dtype={dtype}, device={device})")
# device_map only on CUDA; on CPU keep None to avoid accelerate errors
device_map = "auto" if device == "cuda" else None
model = AutoModelForCausalLM.from_pretrained(
"maya-research/maya1",
torch_dtype=dtype,
device_map=device_map,
trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
"maya-research/maya1",
trust_remote_code=True,
)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
if snac_model is None:
print("[load_models] loading SNAC 24kHz decoder")
snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
# move later inside handler (after ZeroGPU alloc)
return snac
return None
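# Note: load_models returns the freshly created SNAC decoder instead of assigning the
# global here, so the ZeroGPU handler can assign it and move it onto the GPU it has
# just been granted (see generate_speech below).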
@spaces.GPU(duration=180)
def generate_speech(text: str, voice_description: str, temperature: float, top_p: float, max_tokens: int) -> str:
if not text.strip():
raise gr.Error("Enter some text.")
if not voice_description.strip():
voice_description = "Realistic voice with neutral tone and conversational pacing."
# ZeroGPU gives us CUDA during this call
device = "cuda" if torch.cuda.is_available() else "cpu"
# load / ensure models exist
snac_fresh = load_models(device) # returns SNAC if created
global snac_model
if snac_fresh is not None:
snac_model = snac_fresh
# move models to the active device (ZeroGPU alloc happened)
if device == "cuda":
model.to(device)
snac_model.to(device)
# prompt exactly like the model card
prompt = f'<description="{voice_description}"> {text}'
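    # e.g. '<description="Realistic voice with neutral tone and conversational pacing."> Hello!'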
inputs = tokenizer(prompt, return_tensors="pt").to(device)
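    # Sampling is driven by the Temperature / Top-p sliders; eos_token_id=None is passed
    # below so max_new_tokens (the Max tokens slider) is the effective length limit.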
with torch.inference_mode():
outputs = model.generate(
input_ids=inputs["input_ids"],
attention_mask=inputs.get("attention_mask"),
max_new_tokens=int(max_tokens),
temperature=float(temperature),
top_p=float(top_p),
do_sample=True,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=None,
repetition_penalty=1.1,
)
# SNAC token extraction (7-token frames) — as per model card
generated_ids = outputs[0, inputs["input_ids"].shape[1]:]
snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
frames = len(snac_tokens) // 7
if frames == 0:
raise gr.Error("No SNAC tokens generated. Try longer text and max_tokens=1200–1500.")
codes = [[], [], []]
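    # Each 7-token frame unpacks into SNAC's three codebook levels: 1 coarse, 2 medium
    # and 4 fine codes per frame (token-id offset 128266, codebook size 4096).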
for i in range(frames):
s = snac_tokens[i*7:(i+1)*7]
codes[0].append((s[0]-128266) % 4096)
codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
codes[2].extend([
(s[2]-128266) % 4096,
(s[3]-128266) % 4096,
(s[5]-128266) % 4096,
(s[6]-128266) % 4096,
])
codes_tensor = [torch.tensor(c, dtype=torch.long, device=device).unsqueeze(0) for c in codes]
with torch.inference_mode():
audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
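    # audio is a mono float waveform at 24 kHz (SNAC 24kHz decoder), matching the
    # sample rate written below.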
# write to wav; return filepath for gr.Audio(type="filepath")
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
sf.write(f.name, audio, 24000)
return f.name
# ------------------- UI -------------------
voice_presets = {
"Male - American": "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing.",
"Female - British": "Clear female voice in the 20s age with British accent. Pleasant tone, articulate delivery, moderate pacing.",
"Male - Deep": "Deep male voice with authoritative tone. Low pitch, resonant timbre, steady pacing.",
"Female - Energetic": "Energetic female voice with enthusiastic tone. Higher pitch, bright timbre, upbeat pacing.",
"Neutral - Professional": "Professional neutral voice with clear articulation. Balanced pitch, crisp tone, measured pacing.",
"Custom": ""
}
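# "Custom" maps to an empty description so users can type their own prompt; the
# dropdown callback below simply swaps the preset text into the Voice Description box.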
def update_voice_description(preset):
    return voice_presets.get(preset, "")
with gr.Blocks(theme=gr.themes.Soft(), title="Maya1 Text-to-Speech") as demo:
gr.HTML("""
<div style="text-align:center;padding:16px">
<h1>🎙️ Maya1 Text-to-Speech</h1>
<p style="color:#666">Generate emotional, realistic speech with natural-language voice design</p>
<p style="font-size:12px;color:#28a745">⚡ ZeroGPU inference</p>
</div>
""")
with gr.Row():
with gr.Column(scale=1):
text_input = gr.Textbox(
label="Text to Speak",
value="Hello! This is Maya1 <laugh> the best open source voice AI model with emotions.",
lines=5,
)
voice_preset = gr.Dropdown(choices=list(voice_presets.keys()),
value="Male - American",
label="Voice Preset")
voice_description = gr.Textbox(
label="Voice Description",
value=voice_presets["Male - American"],
lines=3,
)
with gr.Accordion("Advanced", open=False):
temperature = gr.Slider(0.1, 1.0, value=0.7, step=0.1, label="Temperature")
top_p = gr.Slider(0.5, 1.0, value=0.9, step=0.05, label="Top-p")
max_tokens = gr.Slider(500, 2000, value=1000, step=100, label="Max tokens")
generate_btn = gr.Button("🎤 Generate Speech", variant="primary")
with gr.Column(scale=1):
audio_output = gr.Audio(label="Generated Speech", type="filepath", interactive=False)
voice_preset.change(fn=update_voice_description, inputs=[voice_preset], outputs=[voice_description])
generate_btn.click(fn=generate_speech,
inputs=[text_input, voice_description, temperature, top_p, max_tokens],
outputs=[audio_output])
    # Register an explicit API endpoint so Spaces never shows “No API found”
    gr.api(fn=generate_speech, api_name="generate_speech")
if __name__ == "__main__":
demo.queue()
demo.launch(show_error=True)
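
# Example client usage (a sketch: the Space id below is a placeholder, and the endpoint
# name assumes the gr.api registration above):
#
#   from gradio_client import Client
#
#   client = Client("<user>/<space-name>")
#   wav_path = client.predict(
#       "Hello from the API!",                       # text
#       "Deep male voice with authoritative tone.",  # voice_description
#       0.7,                                         # temperature
#       0.9,                                         # top_p
#       1000,                                        # max_tokens
#       api_name="/generate_speech",
#   )
#   print(wav_path)  # local path to the generated .wav file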