shreyask's picture
Initial deploy: built app at root + source under _source/
24b9788 verified
import { useState } from "react";
import { useModel } from "./hooks/useModel";
import Waveform from "./components/Waveform";
import PulseBars from "./components/PulseBars";
/**
 * Starter prompts offered as one-click presets.
 *
 * Each entry has a display `name` and `emoji`, a `duration` (shown with an
 * "s" suffix in the UI), a free-text `caption` describing the music, and
 * `lyrics` using bracketed section tags. The literal "[instrumental]" marks a
 * preset without vocals.
 *
 * Lyrics are written one line per array element and joined with "\n"; an
 * empty element produces the blank line between sections.
 */
const PRESETS = [
  {
    name: "Pop Ballad",
    emoji: "💗",
    duration: 60,
    caption: "A gentle pop ballad with piano and soft vocals, key of C major, 80 BPM, emotional and dreamy",
    lyrics: [
      "[verse]",
      "Underneath the stars tonight",
      "We dance beneath the pale moonlight",
      "Every moment feels so right",
      "Holding you so close and tight",
      "",
      "[chorus]",
      "This is where I want to be",
      "Right here with you next to me",
      "Let the world just fade away",
      "In your arms I want to stay",
    ].join("\n"),
  },
  {
    name: "Rock Anthem",
    emoji: "🎸",
    duration: 60,
    caption: "An energetic rock anthem with electric guitars and powerful drums, key of E minor, 140 BPM, aggressive and intense",
    lyrics: [
      "[verse]",
      "Fire burning in my veins",
      "Breaking free from all these chains",
      "Nothing left to hold me back",
      "Riding down the beaten track",
      "",
      "[chorus]",
      "We are the ones who rise",
      "With thunder in our eyes",
      "We'll never be denied",
      "We're burning up the sky",
    ].join("\n"),
  },
  {
    name: "Lo-fi Chill",
    emoji: "☕",
    duration: 20,
    caption: "A relaxing lo-fi hip hop beat with jazz piano samples and vinyl crackle, key of F major, 75 BPM, mellow and nostalgic",
    lyrics: "[instrumental]",
  },
];
function WebGPUGate({ children }) {
const supported = typeof navigator !== "undefined" && !!navigator.gpu;
if (supported) return children;
return (
<div className="fixed inset-0 flex items-center justify-center z-50" style={{ background: "var(--bg)" }}>
<div className="text-center max-w-md px-6">
<div className="text-5xl mb-4">🎹</div>
<h1 className="text-2xl font-semibold mb-3" style={{ color: "var(--text)" }}>
WebGPU not available
</h1>
<p style={{ color: "var(--text-muted)" }}>
This demo needs WebGPU to run ACE-Step in your browser. Try Chrome 113+, Edge 113+, or Safari 26+ on desktop.
</p>
</div>
</div>
);
}
function ProgressBar({ progress }) {
if (!progress) return null;
const pct = Math.max(0, Math.min(100, progress.percent || 0));
return (
<div className="w-full">
<div className="flex justify-between text-[11px] mb-1.5" style={{ color: "var(--text-muted)" }}>
<span>{progress.label}</span>
<span className="font-mono">
{progress.total > 1 && `${(progress.loaded / 1e6).toFixed(0)} / ${(progress.total / 1e6).toFixed(0)} MB · `}
{pct.toFixed(0)}%
</span>
</div>
<div className="h-1 rounded-full overflow-hidden" style={{ background: "var(--border)" }}>
<div
className="h-full rounded-full transition-all duration-300"
style={{ width: `${pct}%`, background: "var(--accent)" }}
/>
</div>
</div>
);
}
function LoadGate({ onLoad, status, message, progress, error }) {
const loading = status === "loading";
return (
<div
className="rounded-2xl p-8 fade-in"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
>
<div className="flex flex-col items-center text-center">
<div className="text-4xl mb-3">🎹</div>
<h2 className="text-xl font-semibold mb-2" style={{ color: "var(--text)" }}>
Load models
</h2>
<p className="text-sm max-w-sm mb-5" style={{ color: "var(--text-muted)" }}>
Loads ~8 GB of ONNX models. Everything runs in your browser — your prompts never leave this device.
Built with{" "}
<a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
🤗 Transformers.js
</a>
{" + "}
<a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
ONNX Runtime Web
</a>.
</p>
{error ? (
<div className="w-full text-sm mb-4 p-3 rounded-lg text-left" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
{error}
</div>
) : loading ? (
<div className="w-full space-y-3">
{message && (
<p className="text-xs" style={{ color: "var(--text-muted)" }}>
{message}
</p>
)}
{progress && <ProgressBar progress={progress} />}
</div>
) : (
<button
onClick={onLoad}
disabled={loading}
className="px-8 py-2.5 rounded-full font-medium transition hover:scale-[1.02] cursor-pointer"
style={{
background: "var(--accent)",
color: "var(--bg)",
letterSpacing: "-0.01em",
}}
>
Load models
</button>
)}
</div>
</div>
);
}
function PresetCard({ preset, active, onClick }) {
return (
<button
onClick={onClick}
className="flex-1 min-w-0 p-3 rounded-xl text-left transition-all cursor-pointer hover:scale-[1.02]"
style={{
background: active ? "var(--accent-soft)" : "var(--bg-elev)",
border: `1px solid ${active ? "var(--accent)" : "var(--border)"}`,
}}
>
<div className="text-xl mb-1">{preset.emoji}</div>
<div className="text-sm font-medium truncate" style={{ color: "var(--text)" }}>
{preset.name}
</div>
<div className="text-[10px] uppercase tracking-wider mt-0.5" style={{ color: "var(--text-dim)" }}>
{preset.duration}s · {preset.lyrics === "[instrumental]" ? "instrumental" : "vocal"}
</div>
</button>
);
}
function GenerationStatus({ status, message }) {
if (status !== "generating") return null;
return (
<div
className="rounded-2xl p-5 fade-in"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
>
<PulseBars count={60} />
<div className="mt-3 flex items-center justify-between text-xs">
<span style={{ color: "var(--text)" }}>{message || "Generating…"}</span>
<span className="font-mono" style={{ color: "var(--text-muted)" }}>
this takes 1–4 min
</span>
</div>
</div>
);
}
function OutputCard({ audioUrl, audioInfo }) {
if (!audioUrl) return null;
return (
<div
className="rounded-2xl p-5 fade-in space-y-3"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
>
<Waveform src={audioUrl} duration={audioInfo?.duration} />
<div className="flex items-center justify-between text-xs pt-2" style={{ borderTop: "1px solid var(--border)" }}>
<div className="font-mono" style={{ color: "var(--text-muted)" }}>
48 kHz · stereo
{audioInfo?.totalTime && ` · ${audioInfo.totalTime}s gen`}
</div>
<a
href={audioUrl}
download={audioInfo?.filename || "ace-step.wav"}
className="px-3 py-1.5 rounded-md text-xs font-medium transition hover:opacity-80 cursor-pointer"
style={{ background: "var(--surface)", color: "var(--text)" }}
>
⬇ Download WAV
</a>
</div>
</div>
);
}
export default function App() {
const { status, message, progress, audioUrl, audioInfo, error, isLoaded, loadModel, generate } = useModel();
const [activeIdx, setActiveIdx] = useState(0);
const [caption, setCaption] = useState(PRESETS[0].caption);
const [lyrics, setLyrics] = useState(PRESETS[0].lyrics);
const [duration, setDuration] = useState(PRESETS[0].duration);
const [shift, setShift] = useState(3.0);
const [numSteps, setNumSteps] = useState(8);
const isWorking = status === "loading" || status === "generating";
const applyPreset = (i) => {
setActiveIdx(i);
setCaption(PRESETS[i].caption);
setLyrics(PRESETS[i].lyrics);
setDuration(PRESETS[i].duration);
};
return (
<WebGPUGate>
<div className="min-h-screen flex flex-col items-center px-4 py-10" style={{ background: "var(--bg)" }}>
{/* Hero */}
<header className="mb-10 w-full max-w-2xl fade-in">
<h1 className="leading-none mb-2 flex items-baseline gap-3 flex-wrap" style={{
fontSize: "clamp(2.5rem, 5vw, 3.5rem)",
color: "var(--text)",
}}>
<span style={{ fontFamily: "'Dancing Script', cursive", fontWeight: 600 }}>
ACE-Step
</span>
<span style={{ fontWeight: 600, letterSpacing: "-0.03em" }}>
WebGPU
</span>
</h1>
<p className="text-lg" style={{ color: "var(--text-muted)" }}>
Describe any song. AI writes &amp; produces it.
</p>
</header>
<main className="w-full max-w-2xl space-y-4">
{!isLoaded ? (
<LoadGate onLoad={loadModel} status={status} message={message} progress={progress} error={error} />
) : (
<>
{/* Presets */}
<div className="flex gap-2">
{PRESETS.map((p, i) => (
<PresetCard key={p.name} preset={p} active={i === activeIdx} onClick={() => applyPreset(i)} />
))}
</div>
{/* Caption */}
<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
Description
</label>
<textarea
value={caption}
onChange={(e) => setCaption(e.target.value)}
onInput={() => setActiveIdx(-1)}
rows={2}
className="w-full bg-transparent text-sm resize-none outline-none"
style={{ color: "var(--text)" }}
placeholder="Describe the music — style, instruments, key, BPM, mood…"
/>
</div>
{/* Lyrics */}
<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
Lyrics (use [verse] / [chorus] tags, or [instrumental])
</label>
<textarea
value={lyrics}
onChange={(e) => setLyrics(e.target.value)}
onInput={() => setActiveIdx(-1)}
rows={6}
className="w-full bg-transparent text-sm resize-none outline-none font-mono"
style={{ color: "var(--text)" }}
/>
</div>
{/* Controls — pill row */}
<div className="flex items-center gap-3 flex-wrap">
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
<span style={{ color: "var(--text-muted)" }}>Duration</span>
<input
type="range"
min={10}
max={90}
step={10}
value={duration}
onChange={(e) => setDuration(Number(e.target.value))}
className="w-24"
/>
<span className="font-mono w-8 text-right" style={{ color: "var(--text)" }}>{duration}s</span>
</div>
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
<span style={{ color: "var(--text-muted)" }}>Steps</span>
<select
value={numSteps}
onChange={(e) => setNumSteps(Number(e.target.value))}
className="bg-transparent outline-none cursor-pointer"
style={{ color: "var(--text)" }}
>
<option value={8}>8 (turbo)</option>
</select>
</div>
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
<span style={{ color: "var(--text-muted)" }}>Shift</span>
<select
value={shift}
onChange={(e) => setShift(Number(e.target.value))}
className="bg-transparent outline-none cursor-pointer"
style={{ color: "var(--text)" }}
>
<option value={1.0}>1.0</option>
<option value={2.0}>2.0</option>
<option value={3.0}>3.0</option>
</select>
</div>
</div>
{/* Generate */}
<button
onClick={() => generate({ caption, lyrics, duration, shift, numSteps })}
disabled={isWorking}
className="w-full py-3.5 rounded-full font-medium text-base transition disabled:opacity-50 disabled:cursor-not-allowed hover:scale-[1.01] cursor-pointer"
style={{
background: "var(--accent)",
color: "var(--bg)",
letterSpacing: "-0.01em",
boxShadow: "0 0 40px oklch(0.72 0.17 305 / 0.25)",
}}
>
{status === "generating" ? "Generating music…" : "Generate"}
</button>
<GenerationStatus status={status} message={message} />
<OutputCard audioUrl={audioUrl} audioInfo={audioInfo} />
{error && (
<div className="rounded-lg p-3 text-sm" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
{error}
</div>
)}
</>
)}
</main>
{/* About / methodology */}
<section className="w-full max-w-2xl mt-12 text-sm" style={{ color: "var(--text-muted)" }}>
<details className="rounded-xl px-4 py-3"
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
<summary className="cursor-pointer font-medium select-none" style={{ color: "var(--text)" }}>
How it works &amp; known limitations
</summary>
<div className="mt-4 space-y-4 leading-relaxed">
<div>
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Pipeline</h3>
<ol className="list-decimal list-inside space-y-1">
<li><span style={{ color: "var(--text)" }}>Text encoder</span> (Qwen3-Embedding-0.6B, fp16) turns the caption into conditioning hidden states; the same model provides token embeddings for the lyric path.</li>
<li><span style={{ color: "var(--text)" }}>5&nbsp;Hz LM</span> (ACE-Step acestep-5Hz-lm-0.6B, 4-bit MatMulNBits) writes a short chain-of-thought, then emits ~50 audio codes per 10&nbsp;s of output.</li>
<li><span style={{ color: "var(--text)" }}>FSQ → detokenizer</span> expands the codes into 25&nbsp;Hz acoustic features used as cross-attention hints.</li>
<li><span style={{ color: "var(--text)" }}>DiT decoder</span> (2B parameters, fp16) runs 8 Euler flow-matching steps (shift=3.0) over a random latent conditioned on text, lyrics, and hints.</li>
<li><span style={{ color: "var(--text)" }}>Oobleck VAE</span> (fp16) decodes the 25&nbsp;Hz latent into stereo 48&nbsp;kHz audio.</li>
</ol>
</div>
<div>
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Why it runs in the browser</h3>
<p>
Everything executes on-device via <code className="font-mono text-xs">onnxruntime-web</code> with the WebGPU execution provider. Two Web Workers keep the LM and the diffusion+VAE graphs in separate WASM heaps so neither hits the 4&nbsp;GB single-heap limit. Total download is ~2&nbsp;GB (cached in the browser after the first load).
</p>
</div>
<div>
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Methodology notes</h3>
<ul className="list-disc list-inside space-y-1">
<li>Compared stage-by-stage against the PyTorch fp32 reference: every tensor agrees to within 0.2% relative L2, and the generated waveforms sound identical.</li>
<li>FP16 DiT is exported natively (<code className="font-mono text-xs">model.half()</code> + dynamo). An earlier fp32→fp16 conversion with post-hoc Cast insertion produced a 25&nbsp;Hz helicopter artifact, now resolved.</li>
<li>4-bit quantization is MatMulNBits with <code className="font-mono text-xs">block_size=64</code>, asymmetric, <code className="font-mono text-xs">accuracy_level=1</code> (fp32 accumulate).</li>
</ul>
</div>
<div>
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Known limitations</h3>
<ul className="list-disc list-inside space-y-1">
<li><span style={{ color: "var(--text)" }}>First load is slow.</span> ~2&nbsp;GB of weights must be fetched and cached; subsequent runs start fast.</li>
<li><span style={{ color: "var(--text)" }}>Vocals need ≥60&nbsp;s.</span> The 0.6B LM often refuses to emit lyric-aligned audio codes for short durations — instrumentals work at any length.</li>
<li><span style={{ color: "var(--text)" }}>Turbo quality ceiling.</span> We run 8 diffusion steps (shift=3.0). More steps nudge quality up but aren't supported by the turbo weights we ship.</li>
<li><span style={{ color: "var(--text)" }}>Condition-encoder drift.</span> The ONNX condition_encoder has a small drift (~0.4 max_diff) vs PyTorch on real inputs — inaudible today but a known residual we haven&rsquo;t closed.</li>
<li><span style={{ color: "var(--text)" }}>WebGPU only.</span> No fallback path; the demo gates on WebGPU support (Chrome/Edge 113+, Safari 26+ desktop).</li>
<li><span style={{ color: "var(--text)" }}>Memory.</span> Two workers each hold ~1–2&nbsp;GB; low-RAM devices may hit <code className="font-mono text-xs">std::bad_alloc</code> during model creation.</li>
<li><span style={{ color: "var(--text)" }}>No seed control.</span> Each generation uses a fresh RNG, so re-runs with the same prompt will differ.</li>
</ul>
</div>
</div>
</details>
</section>
{/* Footer */}
<footer className="mt-12 mb-6 text-center text-xs space-y-2" style={{ color: "var(--text-dim)" }}>
<div>
<a href="https://huggingface.co/shreyask/ACE-Step-v1.5-ONNX" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
shreyask/ACE-Step-v1.5-ONNX
</a>
<span className="mx-2">·</span>
<a href="https://huggingface.co/ACE-Step/Ace-Step1.5" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
ACE-Step 1.5
</a>
<span className="mx-2">·</span>
<span>Apache 2.0</span>
</div>
<div>
Made with <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--text-muted)" }}>🤗 Transformers.js</a>
</div>
</footer>
</div>
</WebGPUGate>
);
}