Spaces:

shreyask
/

ace-step-webgpu

Running

App Files Files Community

ace-step-webgpu / _source /src /App.jsx

shreyask

Initial deploy: built app at root + source under _source/

24b9788 verified 3 days ago

raw

history blame contribute delete

20.3 kB

	import { useState } from "react";
	import { useModel } from "./hooks/useModel";
	import Waveform from "./components/Waveform";
	import PulseBars from "./components/PulseBars";

	/**
	 * Built-in starting points offered as selectable cards.
	 *
	 * Each entry carries:
	 *  - name / emoji: what the card displays,
	 *  - duration:     value copied into the duration slider (the UI renders it with an "s" suffix),
	 *  - caption:      text copied into the description field,
	 *  - lyrics:       text copied into the lyrics field; the exact string "[instrumental]"
	 *                  makes the card label itself "instrumental" instead of "vocal".
	 *
	 * The first entry seeds the form's initial state.
	 */
	const PRESETS = [
	  // Vocal preset
	  {
	    name: "Pop Ballad",
	    emoji: "💗",
	    duration: 60,
	    caption: "A gentle pop ballad with piano and soft vocals, key of C major, 80 BPM, emotional and dreamy",
	    lyrics: "[verse]\nUnderneath the stars tonight\nWe dance beneath the pale moonlight\nEvery moment feels so right\nHolding you so close and tight\n\n[chorus]\nThis is where I want to be\nRight here with you next to me\nLet the world just fade away\nIn your arms I want to stay",
	  },
	  // Vocal preset
	  {
	    name: "Rock Anthem",
	    emoji: "🎸",
	    duration: 60,
	    caption: "An energetic rock anthem with electric guitars and powerful drums, key of E minor, 140 BPM, aggressive and intense",
	    lyrics: "[verse]\nFire burning in my veins\nBreaking free from all these chains\nNothing left to hold me back\nRiding down the beaten track\n\n[chorus]\nWe are the ones who rise\nWith thunder in our eyes\nWe'll never be denied\nWe're burning up the sky",
	  },
	  // Instrumental preset (no sung lyrics)
	  {
	    name: "Lo-fi Chill",
	    emoji: "☕",
	    duration: 20,
	    caption: "A relaxing lo-fi hip hop beat with jazz piano samples and vinyl crackle, key of F major, 75 BPM, mellow and nostalgic",
	    lyrics: "[instrumental]",
	  },
	];

	function WebGPUGate({ children }) {
	const supported = typeof navigator !== "undefined" && !!navigator.gpu;
	if (supported) return children;
	return (
	<div className="fixed inset-0 flex items-center justify-center z-50" style={{ background: "var(--bg)" }}>
	<div className="text-center max-w-md px-6">
	<div className="text-5xl mb-4">🎹</div>
	<h1 className="text-2xl font-semibold mb-3" style={{ color: "var(--text)" }}>
	WebGPU not available
	</h1>
	<p style={{ color: "var(--text-muted)" }}>
	This demo needs WebGPU to run ACE-Step in your browser. Try Chrome 113+, Edge 113+, or Safari 26+ on desktop.
	</p>
	</div>
	</div>
	);
	}

	function ProgressBar({ progress }) {
	if (!progress) return null;
	const pct = Math.max(0, Math.min(100, progress.percent \|\| 0));
	return (
	<div className="w-full">
	<div className="flex justify-between text-[11px] mb-1.5" style={{ color: "var(--text-muted)" }}>
	<span>{progress.label}</span>
	<span className="font-mono">
	{progress.total > 1 && `${(progress.loaded / 1e6).toFixed(0)} / ${(progress.total / 1e6).toFixed(0)} MB · `}
	{pct.toFixed(0)}%
	</span>
	</div>
	<div className="h-1 rounded-full overflow-hidden" style={{ background: "var(--border)" }}>
	<div
	className="h-full rounded-full transition-all duration-300"
	style={{ width: `${pct}%`, background: "var(--accent)" }}
	/>
	</div>
	</div>
	);
	}

	function LoadGate({ onLoad, status, message, progress, error }) {
	const loading = status === "loading";
	return (
	<div
	className="rounded-2xl p-8 fade-in"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
	>
	<div className="flex flex-col items-center text-center">
	<div className="text-4xl mb-3">🎹</div>
	<h2 className="text-xl font-semibold mb-2" style={{ color: "var(--text)" }}>
	Load models
	</h2>
	<p className="text-sm max-w-sm mb-5" style={{ color: "var(--text-muted)" }}>
	Loads ~8 GB of ONNX models. Everything runs in your browser — your prompts never leave this device.
	Built with{" "}
	<a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
	🤗 Transformers.js
	</a>
	{" + "}
	<a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
	ONNX Runtime Web
	</a>.
	</p>

	{error ? (
	<div className="w-full text-sm mb-4 p-3 rounded-lg text-left" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
	{error}
	</div>
	) : loading ? (
	<div className="w-full space-y-3">
	{message && (
	<p className="text-xs" style={{ color: "var(--text-muted)" }}>
	{message}
	</p>
	)}
	{progress && <ProgressBar progress={progress} />}
	</div>
	) : (
	<button
	onClick={onLoad}
	disabled={loading}
	className="px-8 py-2.5 rounded-full font-medium transition hover:scale-[1.02] cursor-pointer"
	style={{
	background: "var(--accent)",
	color: "var(--bg)",
	letterSpacing: "-0.01em",
	}}
	>
	Load models
	</button>
	)}
	</div>
	</div>
	);
	}

	function PresetCard({ preset, active, onClick }) {
	return (
	<button
	onClick={onClick}
	className="flex-1 min-w-0 p-3 rounded-xl text-left transition-all cursor-pointer hover:scale-[1.02]"
	style={{
	background: active ? "var(--accent-soft)" : "var(--bg-elev)",
	border: `1px solid ${active ? "var(--accent)" : "var(--border)"}`,
	}}
	>
	<div className="text-xl mb-1">{preset.emoji}</div>
	<div className="text-sm font-medium truncate" style={{ color: "var(--text)" }}>
	{preset.name}
	</div>
	<div className="text-[10px] uppercase tracking-wider mt-0.5" style={{ color: "var(--text-dim)" }}>
	{preset.duration}s · {preset.lyrics === "[instrumental]" ? "instrumental" : "vocal"}
	</div>
	</button>
	);
	}

	function GenerationStatus({ status, message }) {
	if (status !== "generating") return null;
	return (
	<div
	className="rounded-2xl p-5 fade-in"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
	>
	<PulseBars count={60} />
	<div className="mt-3 flex items-center justify-between text-xs">
	<span style={{ color: "var(--text)" }}>{message \|\| "Generating…"}</span>
	<span className="font-mono" style={{ color: "var(--text-muted)" }}>
	this takes 1–4 min
	</span>
	</div>
	</div>
	);
	}

	function OutputCard({ audioUrl, audioInfo }) {
	if (!audioUrl) return null;
	return (
	<div
	className="rounded-2xl p-5 fade-in space-y-3"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
	>
	<Waveform src={audioUrl} duration={audioInfo?.duration} />
	<div className="flex items-center justify-between text-xs pt-2" style={{ borderTop: "1px solid var(--border)" }}>
	<div className="font-mono" style={{ color: "var(--text-muted)" }}>
	48 kHz · stereo
	{audioInfo?.totalTime && ` · ${audioInfo.totalTime}s gen`}
	</div>
	<a
	href={audioUrl}
	download={audioInfo?.filename \|\| "ace-step.wav"}
	className="px-3 py-1.5 rounded-md text-xs font-medium transition hover:opacity-80 cursor-pointer"
	style={{ background: "var(--surface)", color: "var(--text)" }}
	>
	⬇ Download WAV
	</a>
	</div>
	</div>
	);
	}

	export default function App() {
	const { status, message, progress, audioUrl, audioInfo, error, isLoaded, loadModel, generate } = useModel();
	const [activeIdx, setActiveIdx] = useState(0);
	const [caption, setCaption] = useState(PRESETS[0].caption);
	const [lyrics, setLyrics] = useState(PRESETS[0].lyrics);
	const [duration, setDuration] = useState(PRESETS[0].duration);
	const [shift, setShift] = useState(3.0);
	const [numSteps, setNumSteps] = useState(8);

	const isWorking = status === "loading" \|\| status === "generating";

	const applyPreset = (i) => {
	setActiveIdx(i);
	setCaption(PRESETS[i].caption);
	setLyrics(PRESETS[i].lyrics);
	setDuration(PRESETS[i].duration);
	};

	return (
	<WebGPUGate>
	<div className="min-h-screen flex flex-col items-center px-4 py-10" style={{ background: "var(--bg)" }}>
	{/* Hero */}
	<header className="mb-10 w-full max-w-2xl fade-in">
	<h1 className="leading-none mb-2 flex items-baseline gap-3 flex-wrap" style={{
	fontSize: "clamp(2.5rem, 5vw, 3.5rem)",
	color: "var(--text)",
	}}>
	<span style={{ fontFamily: "'Dancing Script', cursive", fontWeight: 600 }}>
	ACE-Step
	</span>
	<span style={{ fontWeight: 600, letterSpacing: "-0.03em" }}>
	WebGPU
	</span>
	</h1>
	<p className="text-lg" style={{ color: "var(--text-muted)" }}>
	Describe any song. AI writes & produces it.
	</p>
	</header>

	<main className="w-full max-w-2xl space-y-4">
	{!isLoaded ? (
	<LoadGate onLoad={loadModel} status={status} message={message} progress={progress} error={error} />
	) : (
	<>
	{/* Presets */}
	<div className="flex gap-2">
	{PRESETS.map((p, i) => (
	<PresetCard key={p.name} preset={p} active={i === activeIdx} onClick={() => applyPreset(i)} />
	))}
	</div>

	{/* Caption */}
	<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
	<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
	Description
	</label>
	<textarea
	value={caption}
	onChange={(e) => setCaption(e.target.value)}
	onInput={() => setActiveIdx(-1)}
	rows={2}
	className="w-full bg-transparent text-sm resize-none outline-none"
	style={{ color: "var(--text)" }}
	placeholder="Describe the music — style, instruments, key, BPM, mood…"
	/>
	</div>

	{/* Lyrics */}
	<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
	<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
	Lyrics (use [verse] / [chorus] tags, or [instrumental])
	</label>
	<textarea
	value={lyrics}
	onChange={(e) => setLyrics(e.target.value)}
	onInput={() => setActiveIdx(-1)}
	rows={6}
	className="w-full bg-transparent text-sm resize-none outline-none font-mono"
	style={{ color: "var(--text)" }}
	/>
	</div>

	{/* Controls — pill row */}
	<div className="flex items-center gap-3 flex-wrap">
	<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
	<span style={{ color: "var(--text-muted)" }}>Duration</span>
	<input
	type="range"
	min={10}
	max={90}
	step={10}
	value={duration}
	onChange={(e) => setDuration(Number(e.target.value))}
	className="w-24"
	/>
	<span className="font-mono w-8 text-right" style={{ color: "var(--text)" }}>{duration}s</span>
	</div>

	<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
	<span style={{ color: "var(--text-muted)" }}>Steps</span>
	<select
	value={numSteps}
	onChange={(e) => setNumSteps(Number(e.target.value))}
	className="bg-transparent outline-none cursor-pointer"
	style={{ color: "var(--text)" }}
	>
	<option value={8}>8 (turbo)</option>
	</select>
	</div>

	<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
	<span style={{ color: "var(--text-muted)" }}>Shift</span>
	<select
	value={shift}
	onChange={(e) => setShift(Number(e.target.value))}
	className="bg-transparent outline-none cursor-pointer"
	style={{ color: "var(--text)" }}
	>
	<option value={1.0}>1.0</option>
	<option value={2.0}>2.0</option>
	<option value={3.0}>3.0</option>
	</select>
	</div>
	</div>

	{/* Generate */}
	<button
	onClick={() => generate({ caption, lyrics, duration, shift, numSteps })}
	disabled={isWorking}
	className="w-full py-3.5 rounded-full font-medium text-base transition disabled:opacity-50 disabled:cursor-not-allowed hover:scale-[1.01] cursor-pointer"
	style={{
	background: "var(--accent)",
	color: "var(--bg)",
	letterSpacing: "-0.01em",
	boxShadow: "0 0 40px oklch(0.72 0.17 305 / 0.25)",
	}}
	>
	{status === "generating" ? "Generating music…" : "Generate"}
	</button>

	<GenerationStatus status={status} message={message} />
	<OutputCard audioUrl={audioUrl} audioInfo={audioInfo} />

	{error && (
	<div className="rounded-lg p-3 text-sm" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
	{error}
	</div>
	)}
	</>
	)}
	</main>

	{/* About / methodology */}
	<section className="w-full max-w-2xl mt-12 text-sm" style={{ color: "var(--text-muted)" }}>
	<details className="rounded-xl px-4 py-3"
	style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
	<summary className="cursor-pointer font-medium select-none" style={{ color: "var(--text)" }}>
	How it works & known limitations
	</summary>
	<div className="mt-4 space-y-4 leading-relaxed">
	<div>
	<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Pipeline</h3>
	<ol className="list-decimal list-inside space-y-1">
	<li><span style={{ color: "var(--text)" }}>Text encoder</span> (Qwen3-Embedding-0.6B, fp16) turns the caption into conditioning hidden states; the same model provides token embeddings for the lyric path.</li>
	<li><span style={{ color: "var(--text)" }}>5 Hz LM</span> (ACE-Step acestep-5Hz-lm-0.6B, 4-bit MatMulNBits) writes a short chain-of-thought, then emits ~50 audio codes per 10 s of output.</li>
	<li><span style={{ color: "var(--text)" }}>FSQ → detokenizer</span> expands the codes into 25 Hz acoustic features used as cross-attention hints.</li>
	<li><span style={{ color: "var(--text)" }}>DiT decoder</span> (2B parameters, fp16) runs 8 Euler flow-matching steps (shift=3.0) over a random latent conditioned on text, lyrics, and hints.</li>
	<li><span style={{ color: "var(--text)" }}>Oobleck VAE</span> (fp16) decodes the 25 Hz latent into stereo 48 kHz audio.</li>
	</ol>
	</div>

	<div>
	<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Why it runs in the browser</h3>
	<p>
	Everything executes on-device via <code className="font-mono text-xs">onnxruntime-web</code> with the WebGPU execution provider. Two Web Workers keep the LM and the diffusion+VAE graphs in separate WASM heaps so neither hits the 4 GB single-heap limit. Total download is ~2 GB (cached in the browser after the first load).
	</p>
	</div>

	<div>
	<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Methodology notes</h3>
	<ul className="list-disc list-inside space-y-1">
	<li>Compared stage-by-stage against the PyTorch fp32 reference: every tensor agrees to within 0.2% relative L2, and the generated waveforms sound identical.</li>
	<li>FP16 DiT is exported natively (<code className="font-mono text-xs">model.half()</code> + dynamo). An earlier fp32→fp16 conversion with post-hoc Cast insertion produced a 25 Hz helicopter artifact, now resolved.</li>
	<li>4-bit quantization is MatMulNBits with <code className="font-mono text-xs">block_size=64</code>, asymmetric, <code className="font-mono text-xs">accuracy_level=1</code> (fp32 accumulate).</li>
	</ul>
	</div>

	<div>
	<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Known limitations</h3>
	<ul className="list-disc list-inside space-y-1">
	<li><span style={{ color: "var(--text)" }}>First load is slow.</span> ~2 GB of weights must be fetched and cached; subsequent runs start fast.</li>
	<li><span style={{ color: "var(--text)" }}>Vocals need ≥60 s.</span> The 0.6B LM often refuses to emit lyric-aligned audio codes for short durations — instrumentals work at any length.</li>
	<li><span style={{ color: "var(--text)" }}>Turbo quality ceiling.</span> We run 8 diffusion steps (shift=3.0). More steps nudge quality up but aren't supported by the turbo weights we ship.</li>
	<li><span style={{ color: "var(--text)" }}>Condition-encoder drift.</span> The ONNX condition_encoder has a small drift (~0.4 max_diff) vs PyTorch on real inputs — inaudible today but a known residual we haven’t closed.</li>
	<li><span style={{ color: "var(--text)" }}>WebGPU only.</span> No fallback path; the demo gates on WebGPU support (Chrome/Edge 113+, Safari 26+ desktop).</li>
	<li><span style={{ color: "var(--text)" }}>Memory.</span> Two workers each hold ~1–2 GB; low-RAM devices may hit <code className="font-mono text-xs">std::bad_alloc</code> during model creation.</li>
	<li><span style={{ color: "var(--text)" }}>No seed control.</span> Each generation uses a fresh RNG, so re-runs with the same prompt will differ.</li>
	</ul>
	</div>
	</div>
	</details>
	</section>

	{/* Footer */}
	<footer className="mt-12 mb-6 text-center text-xs space-y-2" style={{ color: "var(--text-dim)" }}>
	<div>
	<a href="https://huggingface.co/shreyask/ACE-Step-v1.5-ONNX" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
	shreyask/ACE-Step-v1.5-ONNX
	</a>
	<span className="mx-2">·</span>
	<a href="https://huggingface.co/ACE-Step/Ace-Step1.5" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
	ACE-Step 1.5
	</a>
	<span className="mx-2">·</span>
	<span>Apache 2.0</span>
	</div>
	<div>
	Made with <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--text-muted)" }}>🤗 Transformers.js</a>
	</div>
	</footer>
	</div>
	</WebGPUGate>
	);
	}