Spaces:
Running
Running
Initial deploy: built app at root + source under _source/
Browse files- README.md +44 -5
- _source/README.md +16 -0
- _source/eslint.config.js +29 -0
- _source/index.html +17 -0
- _source/package-lock.json +0 -0
- _source/package.json +31 -0
- _source/public/favicon.svg +1 -0
- _source/public/icons.svg +24 -0
- _source/public/silence_latent.bin +3 -0
- _source/public/silence_latent_meta.json +1 -0
- _source/public/silence_roundtripped.bin +3 -0
- _source/public/silence_roundtripped_meta.json +1 -0
- _source/src/App.jsx +417 -0
- _source/src/assets/hero.png +0 -0
- _source/src/assets/react.svg +1 -0
- _source/src/assets/vite.svg +1 -0
- _source/src/components/PulseBars.jsx +17 -0
- _source/src/components/Waveform.jsx +134 -0
- _source/src/hooks/useModel.js +111 -0
- _source/src/index.css +93 -0
- _source/src/lm-worker.js +271 -0
- _source/src/main.jsx +10 -0
- _source/src/worker.js +665 -0
- _source/vite.config.js +13 -0
- assets/index-C7vMACvi.js +0 -0
- assets/index-CccuoAYh.css +2 -0
- assets/lm-worker-CMbQRLr6.js +0 -0
- assets/ort-wasm-simd-threaded.asyncify-9GUf3Unn.wasm +3 -0
- assets/ort-wasm-simd-threaded.asyncify-CtKKja6V.wasm +3 -0
- assets/worker-retwKpvq.js +0 -0
- favicon.svg +1 -0
- icons.svg +24 -0
- index.html +16 -17
- silence_latent.bin +3 -0
- silence_latent_meta.json +1 -0
- silence_roundtripped.bin +3 -0
- silence_roundtripped_meta.json +1 -0
README.md
CHANGED
|
@@ -1,10 +1,49 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ACE-Step WebGPU
|
| 3 |
+
emoji: 🎵
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
short_description: Text-to-music in your browser via WebGPU.
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# ACE-Step WebGPU
|
| 13 |
+
|
| 14 |
+
Describe any song. AI writes & produces it — right in your browser.
|
| 15 |
+
|
| 16 |
+
The pipeline (5 Hz Qwen3 LM → FSQ → DiT decoder → Oobleck VAE) runs end-to-end
|
| 17 |
+
via [onnxruntime-web](https://onnxruntime.ai/) with the WebGPU execution
|
| 18 |
+
provider. Two Web Workers keep the LM and diffusion+VAE graphs in separate
|
| 19 |
+
WASM heaps so neither hits the 4 GB single-heap limit.
|
| 20 |
+
|
| 21 |
+
## Models
|
| 22 |
+
|
| 23 |
+
- DiT decoder (2B, fp16) and Oobleck VAE (fp16) from
|
| 24 |
+
[shreyask/ACE-Step-v1.5-ONNX](https://huggingface.co/shreyask/ACE-Step-v1.5-ONNX)
|
| 25 |
+
- 5 Hz LM (0.6B, 4-bit MatMulNBits) from
|
| 26 |
+
[ACE-Step/acestep-5Hz-lm-0.6B](https://huggingface.co/ACE-Step/acestep-5Hz-lm-0.6B)
|
| 27 |
+
- Text encoder: [Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)
|
| 28 |
+
|
| 29 |
+
Weights are fetched on demand and cached in the browser's Cache Storage after
|
| 30 |
+
the first load (~2 GB total).
|
| 31 |
+
|
| 32 |
+
## Requirements
|
| 33 |
+
|
| 34 |
+
- WebGPU-capable browser: Chrome/Edge 113+, Safari 26+ desktop
|
| 35 |
+
- ~4 GB free RAM recommended
|
| 36 |
+
|
| 37 |
+
## Source
|
| 38 |
+
|
| 39 |
+
The `_source/` directory in this Space's Files tab contains the full Vite/React
|
| 40 |
+
project (`src/`, `public/`, configs). Build it locally with:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
cd _source
|
| 44 |
+
npm install
|
| 45 |
+
npm run build
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
Upstream: [ACE-Step/Ace-Step1.5](https://huggingface.co/ACE-Step/Ace-Step1.5)
|
| 49 |
+
(Apache 2.0).
|
_source/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# React + Vite
|
| 2 |
+
|
| 3 |
+
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
|
| 4 |
+
|
| 5 |
+
Currently, two official plugins are available:
|
| 6 |
+
|
| 7 |
+
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
|
| 8 |
+
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)
|
| 9 |
+
|
| 10 |
+
## React Compiler
|
| 11 |
+
|
| 12 |
+
The React Compiler is not enabled on this template because of its impact on dev & build performances. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
|
| 13 |
+
|
| 14 |
+
## Expanding the ESLint configuration
|
| 15 |
+
|
| 16 |
+
If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
|
_source/eslint.config.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import js from '@eslint/js'
|
| 2 |
+
import globals from 'globals'
|
| 3 |
+
import reactHooks from 'eslint-plugin-react-hooks'
|
| 4 |
+
import reactRefresh from 'eslint-plugin-react-refresh'
|
| 5 |
+
import { defineConfig, globalIgnores } from 'eslint/config'
|
| 6 |
+
|
| 7 |
+
export default defineConfig([
|
| 8 |
+
globalIgnores(['dist']),
|
| 9 |
+
{
|
| 10 |
+
files: ['**/*.{js,jsx}'],
|
| 11 |
+
extends: [
|
| 12 |
+
js.configs.recommended,
|
| 13 |
+
reactHooks.configs.flat.recommended,
|
| 14 |
+
reactRefresh.configs.vite,
|
| 15 |
+
],
|
| 16 |
+
languageOptions: {
|
| 17 |
+
ecmaVersion: 2020,
|
| 18 |
+
globals: globals.browser,
|
| 19 |
+
parserOptions: {
|
| 20 |
+
ecmaVersion: 'latest',
|
| 21 |
+
ecmaFeatures: { jsx: true },
|
| 22 |
+
sourceType: 'module',
|
| 23 |
+
},
|
| 24 |
+
},
|
| 25 |
+
rules: {
|
| 26 |
+
'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],
|
| 27 |
+
},
|
| 28 |
+
},
|
| 29 |
+
])
|
_source/index.html
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<link rel="icon" type="image/svg+xml" href="/favicon.svg" />
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
+
<meta name="description" content="ACE-Step 1.5 text-to-music generation running entirely in your browser via WebGPU" />
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Hanken+Grotesk:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Dancing+Script:wght@500;600;700&display=swap" rel="stylesheet" />
|
| 11 |
+
<title>ACE-Step WebGPU — Text to Music</title>
|
| 12 |
+
</head>
|
| 13 |
+
<body>
|
| 14 |
+
<div id="root"></div>
|
| 15 |
+
<script type="module" src="/src/main.jsx"></script>
|
| 16 |
+
</body>
|
| 17 |
+
</html>
|
_source/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
_source/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "demo",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "0.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "vite build",
|
| 9 |
+
"lint": "eslint .",
|
| 10 |
+
"preview": "vite preview"
|
| 11 |
+
},
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"@huggingface/transformers": "^4.1.0",
|
| 14 |
+
"onnxruntime-web": "^1.24.3",
|
| 15 |
+
"react": "^19.2.4",
|
| 16 |
+
"react-dom": "^19.2.4"
|
| 17 |
+
},
|
| 18 |
+
"devDependencies": {
|
| 19 |
+
"@eslint/js": "^9.39.4",
|
| 20 |
+
"@tailwindcss/vite": "^4.2.2",
|
| 21 |
+
"@types/react": "^19.2.14",
|
| 22 |
+
"@types/react-dom": "^19.2.3",
|
| 23 |
+
"@vitejs/plugin-react": "^6.0.1",
|
| 24 |
+
"eslint": "^9.39.4",
|
| 25 |
+
"eslint-plugin-react-hooks": "^7.0.1",
|
| 26 |
+
"eslint-plugin-react-refresh": "^0.5.2",
|
| 27 |
+
"globals": "^17.4.0",
|
| 28 |
+
"tailwindcss": "^4.2.2",
|
| 29 |
+
"vite": "^8.0.4"
|
| 30 |
+
}
|
| 31 |
+
}
|
_source/public/favicon.svg
ADDED
|
|
_source/public/icons.svg
ADDED
|
|
_source/public/silence_latent.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7ee13d8902f0c02def49249f05a3e5dd99550ae8aed263299be43329b330e23
|
| 3 |
+
size 3840000
|
_source/public/silence_latent_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 15000, 64], "dtype": "float32"}
|
_source/public/silence_roundtripped.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e22b8c9e8a687c7ebfe57dc3bee42b5c330d35ca350f04575a79bca6045dfcd
|
| 3 |
+
size 192000
|
_source/public/silence_roundtripped_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 750, 64], "dtype": "float32"}
|
_source/src/App.jsx
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { useModel } from "./hooks/useModel";
|
| 3 |
+
import Waveform from "./components/Waveform";
|
| 4 |
+
import PulseBars from "./components/PulseBars";
|
| 5 |
+
|
| 6 |
+
const PRESETS = [
|
| 7 |
+
{
|
| 8 |
+
name: "Pop Ballad",
|
| 9 |
+
emoji: "💗",
|
| 10 |
+
duration: 60,
|
| 11 |
+
caption: "A gentle pop ballad with piano and soft vocals, key of C major, 80 BPM, emotional and dreamy",
|
| 12 |
+
lyrics: "[verse]\nUnderneath the stars tonight\nWe dance beneath the pale moonlight\nEvery moment feels so right\nHolding you so close and tight\n\n[chorus]\nThis is where I want to be\nRight here with you next to me\nLet the world just fade away\nIn your arms I want to stay",
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
name: "Rock Anthem",
|
| 16 |
+
emoji: "🎸",
|
| 17 |
+
duration: 60,
|
| 18 |
+
caption: "An energetic rock anthem with electric guitars and powerful drums, key of E minor, 140 BPM, aggressive and intense",
|
| 19 |
+
lyrics: "[verse]\nFire burning in my veins\nBreaking free from all these chains\nNothing left to hold me back\nRiding down the beaten track\n\n[chorus]\nWe are the ones who rise\nWith thunder in our eyes\nWe'll never be denied\nWe're burning up the sky",
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
name: "Lo-fi Chill",
|
| 23 |
+
emoji: "☕",
|
| 24 |
+
duration: 20,
|
| 25 |
+
caption: "A relaxing lo-fi hip hop beat with jazz piano samples and vinyl crackle, key of F major, 75 BPM, mellow and nostalgic",
|
| 26 |
+
lyrics: "[instrumental]",
|
| 27 |
+
},
|
| 28 |
+
];
|
| 29 |
+
|
| 30 |
+
function WebGPUGate({ children }) {
|
| 31 |
+
const supported = typeof navigator !== "undefined" && !!navigator.gpu;
|
| 32 |
+
if (supported) return children;
|
| 33 |
+
return (
|
| 34 |
+
<div className="fixed inset-0 flex items-center justify-center z-50" style={{ background: "var(--bg)" }}>
|
| 35 |
+
<div className="text-center max-w-md px-6">
|
| 36 |
+
<div className="text-5xl mb-4">🎹</div>
|
| 37 |
+
<h1 className="text-2xl font-semibold mb-3" style={{ color: "var(--text)" }}>
|
| 38 |
+
WebGPU not available
|
| 39 |
+
</h1>
|
| 40 |
+
<p style={{ color: "var(--text-muted)" }}>
|
| 41 |
+
This demo needs WebGPU to run ACE-Step in your browser. Try Chrome 113+, Edge 113+, or Safari 26+ on desktop.
|
| 42 |
+
</p>
|
| 43 |
+
</div>
|
| 44 |
+
</div>
|
| 45 |
+
);
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
function ProgressBar({ progress }) {
|
| 49 |
+
if (!progress) return null;
|
| 50 |
+
const pct = Math.max(0, Math.min(100, progress.percent || 0));
|
| 51 |
+
return (
|
| 52 |
+
<div className="w-full">
|
| 53 |
+
<div className="flex justify-between text-[11px] mb-1.5" style={{ color: "var(--text-muted)" }}>
|
| 54 |
+
<span>{progress.label}</span>
|
| 55 |
+
<span className="font-mono">
|
| 56 |
+
{progress.total > 1 && `${(progress.loaded / 1e6).toFixed(0)} / ${(progress.total / 1e6).toFixed(0)} MB · `}
|
| 57 |
+
{pct.toFixed(0)}%
|
| 58 |
+
</span>
|
| 59 |
+
</div>
|
| 60 |
+
<div className="h-1 rounded-full overflow-hidden" style={{ background: "var(--border)" }}>
|
| 61 |
+
<div
|
| 62 |
+
className="h-full rounded-full transition-all duration-300"
|
| 63 |
+
style={{ width: `${pct}%`, background: "var(--accent)" }}
|
| 64 |
+
/>
|
| 65 |
+
</div>
|
| 66 |
+
</div>
|
| 67 |
+
);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
function LoadGate({ onLoad, status, message, progress, error }) {
|
| 71 |
+
const loading = status === "loading";
|
| 72 |
+
return (
|
| 73 |
+
<div
|
| 74 |
+
className="rounded-2xl p-8 fade-in"
|
| 75 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
|
| 76 |
+
>
|
| 77 |
+
<div className="flex flex-col items-center text-center">
|
| 78 |
+
<div className="text-4xl mb-3">🎹</div>
|
| 79 |
+
<h2 className="text-xl font-semibold mb-2" style={{ color: "var(--text)" }}>
|
| 80 |
+
Load models
|
| 81 |
+
</h2>
|
| 82 |
+
<p className="text-sm max-w-sm mb-5" style={{ color: "var(--text-muted)" }}>
|
| 83 |
+
Loads ~8 GB of ONNX models. Everything runs in your browser — your prompts never leave this device.
|
| 84 |
+
Built with{" "}
|
| 85 |
+
<a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
|
| 86 |
+
🤗 Transformers.js
|
| 87 |
+
</a>
|
| 88 |
+
{" + "}
|
| 89 |
+
<a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
|
| 90 |
+
ONNX Runtime Web
|
| 91 |
+
</a>.
|
| 92 |
+
</p>
|
| 93 |
+
|
| 94 |
+
{error ? (
|
| 95 |
+
<div className="w-full text-sm mb-4 p-3 rounded-lg text-left" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
|
| 96 |
+
{error}
|
| 97 |
+
</div>
|
| 98 |
+
) : loading ? (
|
| 99 |
+
<div className="w-full space-y-3">
|
| 100 |
+
{message && (
|
| 101 |
+
<p className="text-xs" style={{ color: "var(--text-muted)" }}>
|
| 102 |
+
{message}
|
| 103 |
+
</p>
|
| 104 |
+
)}
|
| 105 |
+
{progress && <ProgressBar progress={progress} />}
|
| 106 |
+
</div>
|
| 107 |
+
) : (
|
| 108 |
+
<button
|
| 109 |
+
onClick={onLoad}
|
| 110 |
+
disabled={loading}
|
| 111 |
+
className="px-8 py-2.5 rounded-full font-medium transition hover:scale-[1.02] cursor-pointer"
|
| 112 |
+
style={{
|
| 113 |
+
background: "var(--accent)",
|
| 114 |
+
color: "var(--bg)",
|
| 115 |
+
letterSpacing: "-0.01em",
|
| 116 |
+
}}
|
| 117 |
+
>
|
| 118 |
+
Load models
|
| 119 |
+
</button>
|
| 120 |
+
)}
|
| 121 |
+
</div>
|
| 122 |
+
</div>
|
| 123 |
+
);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
function PresetCard({ preset, active, onClick }) {
|
| 127 |
+
return (
|
| 128 |
+
<button
|
| 129 |
+
onClick={onClick}
|
| 130 |
+
className="flex-1 min-w-0 p-3 rounded-xl text-left transition-all cursor-pointer hover:scale-[1.02]"
|
| 131 |
+
style={{
|
| 132 |
+
background: active ? "var(--accent-soft)" : "var(--bg-elev)",
|
| 133 |
+
border: `1px solid ${active ? "var(--accent)" : "var(--border)"}`,
|
| 134 |
+
}}
|
| 135 |
+
>
|
| 136 |
+
<div className="text-xl mb-1">{preset.emoji}</div>
|
| 137 |
+
<div className="text-sm font-medium truncate" style={{ color: "var(--text)" }}>
|
| 138 |
+
{preset.name}
|
| 139 |
+
</div>
|
| 140 |
+
<div className="text-[10px] uppercase tracking-wider mt-0.5" style={{ color: "var(--text-dim)" }}>
|
| 141 |
+
{preset.duration}s · {preset.lyrics === "[instrumental]" ? "instrumental" : "vocal"}
|
| 142 |
+
</div>
|
| 143 |
+
</button>
|
| 144 |
+
);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
function GenerationStatus({ status, message }) {
|
| 148 |
+
if (status !== "generating") return null;
|
| 149 |
+
return (
|
| 150 |
+
<div
|
| 151 |
+
className="rounded-2xl p-5 fade-in"
|
| 152 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
|
| 153 |
+
>
|
| 154 |
+
<PulseBars count={60} />
|
| 155 |
+
<div className="mt-3 flex items-center justify-between text-xs">
|
| 156 |
+
<span style={{ color: "var(--text)" }}>{message || "Generating…"}</span>
|
| 157 |
+
<span className="font-mono" style={{ color: "var(--text-muted)" }}>
|
| 158 |
+
this takes 1–4 min
|
| 159 |
+
</span>
|
| 160 |
+
</div>
|
| 161 |
+
</div>
|
| 162 |
+
);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
function OutputCard({ audioUrl, audioInfo }) {
|
| 166 |
+
if (!audioUrl) return null;
|
| 167 |
+
return (
|
| 168 |
+
<div
|
| 169 |
+
className="rounded-2xl p-5 fade-in space-y-3"
|
| 170 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
|
| 171 |
+
>
|
| 172 |
+
<Waveform src={audioUrl} duration={audioInfo?.duration} />
|
| 173 |
+
<div className="flex items-center justify-between text-xs pt-2" style={{ borderTop: "1px solid var(--border)" }}>
|
| 174 |
+
<div className="font-mono" style={{ color: "var(--text-muted)" }}>
|
| 175 |
+
48 kHz · stereo
|
| 176 |
+
{audioInfo?.totalTime && ` · ${audioInfo.totalTime}s gen`}
|
| 177 |
+
</div>
|
| 178 |
+
<a
|
| 179 |
+
href={audioUrl}
|
| 180 |
+
download={audioInfo?.filename || "ace-step.wav"}
|
| 181 |
+
className="px-3 py-1.5 rounded-md text-xs font-medium transition hover:opacity-80 cursor-pointer"
|
| 182 |
+
style={{ background: "var(--surface)", color: "var(--text)" }}
|
| 183 |
+
>
|
| 184 |
+
⬇ Download WAV
|
| 185 |
+
</a>
|
| 186 |
+
</div>
|
| 187 |
+
</div>
|
| 188 |
+
);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
export default function App() {
|
| 192 |
+
const { status, message, progress, audioUrl, audioInfo, error, isLoaded, loadModel, generate } = useModel();
|
| 193 |
+
const [activeIdx, setActiveIdx] = useState(0);
|
| 194 |
+
const [caption, setCaption] = useState(PRESETS[0].caption);
|
| 195 |
+
const [lyrics, setLyrics] = useState(PRESETS[0].lyrics);
|
| 196 |
+
const [duration, setDuration] = useState(PRESETS[0].duration);
|
| 197 |
+
const [shift, setShift] = useState(3.0);
|
| 198 |
+
const [numSteps, setNumSteps] = useState(8);
|
| 199 |
+
|
| 200 |
+
const isWorking = status === "loading" || status === "generating";
|
| 201 |
+
|
| 202 |
+
const applyPreset = (i) => {
|
| 203 |
+
setActiveIdx(i);
|
| 204 |
+
setCaption(PRESETS[i].caption);
|
| 205 |
+
setLyrics(PRESETS[i].lyrics);
|
| 206 |
+
setDuration(PRESETS[i].duration);
|
| 207 |
+
};
|
| 208 |
+
|
| 209 |
+
return (
|
| 210 |
+
<WebGPUGate>
|
| 211 |
+
<div className="min-h-screen flex flex-col items-center px-4 py-10" style={{ background: "var(--bg)" }}>
|
| 212 |
+
{/* Hero */}
|
| 213 |
+
<header className="mb-10 w-full max-w-2xl fade-in">
|
| 214 |
+
<h1 className="leading-none mb-2 flex items-baseline gap-3 flex-wrap" style={{
|
| 215 |
+
fontSize: "clamp(2.5rem, 5vw, 3.5rem)",
|
| 216 |
+
color: "var(--text)",
|
| 217 |
+
}}>
|
| 218 |
+
<span style={{ fontFamily: "'Dancing Script', cursive", fontWeight: 600 }}>
|
| 219 |
+
ACE-Step
|
| 220 |
+
</span>
|
| 221 |
+
<span style={{ fontWeight: 600, letterSpacing: "-0.03em" }}>
|
| 222 |
+
WebGPU
|
| 223 |
+
</span>
|
| 224 |
+
</h1>
|
| 225 |
+
<p className="text-lg" style={{ color: "var(--text-muted)" }}>
|
| 226 |
+
Describe any song. AI writes & produces it.
|
| 227 |
+
</p>
|
| 228 |
+
</header>
|
| 229 |
+
|
| 230 |
+
<main className="w-full max-w-2xl space-y-4">
|
| 231 |
+
{!isLoaded ? (
|
| 232 |
+
<LoadGate onLoad={loadModel} status={status} message={message} progress={progress} error={error} />
|
| 233 |
+
) : (
|
| 234 |
+
<>
|
| 235 |
+
{/* Presets */}
|
| 236 |
+
<div className="flex gap-2">
|
| 237 |
+
{PRESETS.map((p, i) => (
|
| 238 |
+
<PresetCard key={p.name} preset={p} active={i === activeIdx} onClick={() => applyPreset(i)} />
|
| 239 |
+
))}
|
| 240 |
+
</div>
|
| 241 |
+
|
| 242 |
+
{/* Caption */}
|
| 243 |
+
<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 244 |
+
<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
|
| 245 |
+
Description
|
| 246 |
+
</label>
|
| 247 |
+
<textarea
|
| 248 |
+
value={caption}
|
| 249 |
+
onChange={(e) => setCaption(e.target.value)}
|
| 250 |
+
onInput={() => setActiveIdx(-1)}
|
| 251 |
+
rows={2}
|
| 252 |
+
className="w-full bg-transparent text-sm resize-none outline-none"
|
| 253 |
+
style={{ color: "var(--text)" }}
|
| 254 |
+
placeholder="Describe the music — style, instruments, key, BPM, mood…"
|
| 255 |
+
/>
|
| 256 |
+
</div>
|
| 257 |
+
|
| 258 |
+
{/* Lyrics */}
|
| 259 |
+
<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 260 |
+
<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
|
| 261 |
+
Lyrics (use [verse] / [chorus] tags, or [instrumental])
|
| 262 |
+
</label>
|
| 263 |
+
<textarea
|
| 264 |
+
value={lyrics}
|
| 265 |
+
onChange={(e) => setLyrics(e.target.value)}
|
| 266 |
+
onInput={() => setActiveIdx(-1)}
|
| 267 |
+
rows={6}
|
| 268 |
+
className="w-full bg-transparent text-sm resize-none outline-none font-mono"
|
| 269 |
+
style={{ color: "var(--text)" }}
|
| 270 |
+
/>
|
| 271 |
+
</div>
|
| 272 |
+
|
| 273 |
+
{/* Controls — pill row */}
|
| 274 |
+
<div className="flex items-center gap-3 flex-wrap">
|
| 275 |
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
|
| 276 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 277 |
+
<span style={{ color: "var(--text-muted)" }}>Duration</span>
|
| 278 |
+
<input
|
| 279 |
+
type="range"
|
| 280 |
+
min={10}
|
| 281 |
+
max={90}
|
| 282 |
+
step={10}
|
| 283 |
+
value={duration}
|
| 284 |
+
onChange={(e) => setDuration(Number(e.target.value))}
|
| 285 |
+
className="w-24"
|
| 286 |
+
/>
|
| 287 |
+
<span className="font-mono w-8 text-right" style={{ color: "var(--text)" }}>{duration}s</span>
|
| 288 |
+
</div>
|
| 289 |
+
|
| 290 |
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
|
| 291 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 292 |
+
<span style={{ color: "var(--text-muted)" }}>Steps</span>
|
| 293 |
+
<select
|
| 294 |
+
value={numSteps}
|
| 295 |
+
onChange={(e) => setNumSteps(Number(e.target.value))}
|
| 296 |
+
className="bg-transparent outline-none cursor-pointer"
|
| 297 |
+
style={{ color: "var(--text)" }}
|
| 298 |
+
>
|
| 299 |
+
<option value={8}>8 (turbo)</option>
|
| 300 |
+
</select>
|
| 301 |
+
</div>
|
| 302 |
+
|
| 303 |
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
|
| 304 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 305 |
+
<span style={{ color: "var(--text-muted)" }}>Shift</span>
|
| 306 |
+
<select
|
| 307 |
+
value={shift}
|
| 308 |
+
onChange={(e) => setShift(Number(e.target.value))}
|
| 309 |
+
className="bg-transparent outline-none cursor-pointer"
|
| 310 |
+
style={{ color: "var(--text)" }}
|
| 311 |
+
>
|
| 312 |
+
<option value={1.0}>1.0</option>
|
| 313 |
+
<option value={2.0}>2.0</option>
|
| 314 |
+
<option value={3.0}>3.0</option>
|
| 315 |
+
</select>
|
| 316 |
+
</div>
|
| 317 |
+
</div>
|
| 318 |
+
|
| 319 |
+
{/* Generate */}
|
| 320 |
+
<button
|
| 321 |
+
onClick={() => generate({ caption, lyrics, duration, shift, numSteps })}
|
| 322 |
+
disabled={isWorking}
|
| 323 |
+
className="w-full py-3.5 rounded-full font-medium text-base transition disabled:opacity-50 disabled:cursor-not-allowed hover:scale-[1.01] cursor-pointer"
|
| 324 |
+
style={{
|
| 325 |
+
background: "var(--accent)",
|
| 326 |
+
color: "var(--bg)",
|
| 327 |
+
letterSpacing: "-0.01em",
|
| 328 |
+
boxShadow: "0 0 40px oklch(0.72 0.17 305 / 0.25)",
|
| 329 |
+
}}
|
| 330 |
+
>
|
| 331 |
+
{status === "generating" ? "Generating music…" : "Generate"}
|
| 332 |
+
</button>
|
| 333 |
+
|
| 334 |
+
<GenerationStatus status={status} message={message} />
|
| 335 |
+
<OutputCard audioUrl={audioUrl} audioInfo={audioInfo} />
|
| 336 |
+
|
| 337 |
+
{error && (
|
| 338 |
+
<div className="rounded-lg p-3 text-sm" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
|
| 339 |
+
{error}
|
| 340 |
+
</div>
|
| 341 |
+
)}
|
| 342 |
+
</>
|
| 343 |
+
)}
|
| 344 |
+
</main>
|
| 345 |
+
|
| 346 |
+
{/* About / methodology */}
|
| 347 |
+
<section className="w-full max-w-2xl mt-12 text-sm" style={{ color: "var(--text-muted)" }}>
|
| 348 |
+
<details className="rounded-xl px-4 py-3"
|
| 349 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 350 |
+
<summary className="cursor-pointer font-medium select-none" style={{ color: "var(--text)" }}>
|
| 351 |
+
How it works & known limitations
|
| 352 |
+
</summary>
|
| 353 |
+
<div className="mt-4 space-y-4 leading-relaxed">
|
| 354 |
+
<div>
|
| 355 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Pipeline</h3>
|
| 356 |
+
<ol className="list-decimal list-inside space-y-1">
|
| 357 |
+
<li><span style={{ color: "var(--text)" }}>Text encoder</span> (Qwen3-Embedding-0.6B, fp16) turns the caption into conditioning hidden states; the same model provides token embeddings for the lyric path.</li>
|
| 358 |
+
<li><span style={{ color: "var(--text)" }}>5 Hz LM</span> (ACE-Step acestep-5Hz-lm-0.6B, 4-bit MatMulNBits) writes a short chain-of-thought, then emits ~50 audio codes per 10 s of output.</li>
|
| 359 |
+
<li><span style={{ color: "var(--text)" }}>FSQ → detokenizer</span> expands the codes into 25 Hz acoustic features used as cross-attention hints.</li>
|
| 360 |
+
<li><span style={{ color: "var(--text)" }}>DiT decoder</span> (2B parameters, fp16) runs 8 Euler flow-matching steps (shift=3.0) over a random latent conditioned on text, lyrics, and hints.</li>
|
| 361 |
+
<li><span style={{ color: "var(--text)" }}>Oobleck VAE</span> (fp16) decodes the 25 Hz latent into stereo 48 kHz audio.</li>
|
| 362 |
+
</ol>
|
| 363 |
+
</div>
|
| 364 |
+
|
| 365 |
+
<div>
|
| 366 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Why it runs in the browser</h3>
|
| 367 |
+
<p>
|
| 368 |
+
Everything executes on-device via <code className="font-mono text-xs">onnxruntime-web</code> with the WebGPU execution provider. Two Web Workers keep the LM and the diffusion+VAE graphs in separate WASM heaps so neither hits the 4 GB single-heap limit. Total download is ~2 GB (cached in the browser after the first load).
|
| 369 |
+
</p>
|
| 370 |
+
</div>
|
| 371 |
+
|
| 372 |
+
<div>
|
| 373 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Methodology notes</h3>
|
| 374 |
+
<ul className="list-disc list-inside space-y-1">
|
| 375 |
+
<li>Compared stage-by-stage against the PyTorch fp32 reference: every tensor agrees to within 0.2% relative L2, and the generated waveforms sound identical.</li>
|
| 376 |
+
<li>FP16 DiT is exported natively (<code className="font-mono text-xs">model.half()</code> + dynamo). An earlier fp32→fp16 conversion with post-hoc Cast insertion produced a 25 Hz helicopter artifact, now resolved.</li>
|
| 377 |
+
<li>4-bit quantization is MatMulNBits with <code className="font-mono text-xs">block_size=64</code>, asymmetric, <code className="font-mono text-xs">accuracy_level=1</code> (fp32 accumulate).</li>
|
| 378 |
+
</ul>
|
| 379 |
+
</div>
|
| 380 |
+
|
| 381 |
+
<div>
|
| 382 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Known limitations</h3>
|
| 383 |
+
<ul className="list-disc list-inside space-y-1">
|
| 384 |
+
<li><span style={{ color: "var(--text)" }}>First load is slow.</span> ~2 GB of weights must be fetched and cached; subsequent runs start fast.</li>
|
| 385 |
+
<li><span style={{ color: "var(--text)" }}>Vocals need ≥60 s.</span> The 0.6B LM often refuses to emit lyric-aligned audio codes for short durations — instrumentals work at any length.</li>
|
| 386 |
+
<li><span style={{ color: "var(--text)" }}>Turbo quality ceiling.</span> We run 8 diffusion steps (shift=3.0). More steps nudge quality up but aren't supported by the turbo weights we ship.</li>
|
| 387 |
+
<li><span style={{ color: "var(--text)" }}>Condition-encoder drift.</span> The ONNX condition_encoder has a small drift (~0.4 max_diff) vs PyTorch on real inputs — inaudible today but a known residual we haven’t closed.</li>
|
| 388 |
+
<li><span style={{ color: "var(--text)" }}>WebGPU only.</span> No fallback path; the demo gates on WebGPU support (Chrome/Edge 113+, Safari 26+ desktop).</li>
|
| 389 |
+
<li><span style={{ color: "var(--text)" }}>Memory.</span> Two workers each hold ~1–2 GB; low-RAM devices may hit <code className="font-mono text-xs">std::bad_alloc</code> during model creation.</li>
|
| 390 |
+
<li><span style={{ color: "var(--text)" }}>No seed control.</span> Each generation uses a fresh RNG, so re-runs with the same prompt will differ.</li>
|
| 391 |
+
</ul>
|
| 392 |
+
</div>
|
| 393 |
+
</div>
|
| 394 |
+
</details>
|
| 395 |
+
</section>
|
| 396 |
+
|
| 397 |
+
{/* Footer */}
|
| 398 |
+
<footer className="mt-12 mb-6 text-center text-xs space-y-2" style={{ color: "var(--text-dim)" }}>
|
| 399 |
+
<div>
|
| 400 |
+
<a href="https://huggingface.co/shreyask/ACE-Step-v1.5-ONNX" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
|
| 401 |
+
shreyask/ACE-Step-v1.5-ONNX
|
| 402 |
+
</a>
|
| 403 |
+
<span className="mx-2">·</span>
|
| 404 |
+
<a href="https://huggingface.co/ACE-Step/Ace-Step1.5" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
|
| 405 |
+
ACE-Step 1.5
|
| 406 |
+
</a>
|
| 407 |
+
<span className="mx-2">·</span>
|
| 408 |
+
<span>Apache 2.0</span>
|
| 409 |
+
</div>
|
| 410 |
+
<div>
|
| 411 |
+
Made with <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--text-muted)" }}>🤗 Transformers.js</a>
|
| 412 |
+
</div>
|
| 413 |
+
</footer>
|
| 414 |
+
</div>
|
| 415 |
+
</WebGPUGate>
|
| 416 |
+
);
|
| 417 |
+
}
|
_source/src/assets/hero.png
ADDED
|
_source/src/assets/react.svg
ADDED
|
|
_source/src/assets/vite.svg
ADDED
|
|
_source/src/components/PulseBars.jsx
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Animated placeholder — shown while generating. Matches ace-step-jam fake-waveform.
|
| 2 |
+
export default function PulseBars({ count = 60 }) {
|
| 3 |
+
return (
|
| 4 |
+
<div className="flex items-end gap-[2px] h-14 w-full select-none">
|
| 5 |
+
{Array.from({ length: count }).map((_, i) => (
|
| 6 |
+
<div
|
| 7 |
+
key={i}
|
| 8 |
+
className="flex-1 rounded-[2px] pulse-bar"
|
| 9 |
+
style={{
|
| 10 |
+
background: "var(--accent)",
|
| 11 |
+
animationDelay: `${(i * 40) % 1200}ms`,
|
| 12 |
+
}}
|
| 13 |
+
/>
|
| 14 |
+
))}
|
| 15 |
+
</div>
|
| 16 |
+
);
|
| 17 |
+
}
|
_source/src/components/Waveform.jsx
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useEffect, useRef, useState } from "react";
|
| 2 |
+
|
| 3 |
+
// Custom audio player with bar-waveform viz and click-to-seek.
|
| 4 |
+
// Pattern from victor/ace-step-jam; we render N bars pulled from decoded audio buffer peaks.
|
| 5 |
+
const NUM_BARS = 80;
|
| 6 |
+
|
| 7 |
+
export default function Waveform({ src, duration }) {
|
| 8 |
+
const audioRef = useRef(null);
|
| 9 |
+
const [peaks, setPeaks] = useState(null);
|
| 10 |
+
const [playing, setPlaying] = useState(false);
|
| 11 |
+
const [progress, setProgress] = useState(0);
|
| 12 |
+
|
| 13 |
+
// Decode audio to extract bar peaks
|
| 14 |
+
useEffect(() => {
|
| 15 |
+
if (!src) return;
|
| 16 |
+
let cancelled = false;
|
| 17 |
+
(async () => {
|
| 18 |
+
try {
|
| 19 |
+
const res = await fetch(src);
|
| 20 |
+
const buf = await res.arrayBuffer();
|
| 21 |
+
const ctx = new (window.AudioContext || window.webkitAudioContext)();
|
| 22 |
+
const audio = await ctx.decodeAudioData(buf.slice(0));
|
| 23 |
+
const channel = audio.getChannelData(0);
|
| 24 |
+
const samplesPerBar = Math.floor(channel.length / NUM_BARS);
|
| 25 |
+
const out = new Float32Array(NUM_BARS);
|
| 26 |
+
let globalMax = 0;
|
| 27 |
+
for (let b = 0; b < NUM_BARS; b++) {
|
| 28 |
+
let max = 0;
|
| 29 |
+
const start = b * samplesPerBar;
|
| 30 |
+
const end = Math.min(start + samplesPerBar, channel.length);
|
| 31 |
+
for (let i = start; i < end; i++) {
|
| 32 |
+
const v = Math.abs(channel[i]);
|
| 33 |
+
if (Number.isFinite(v) && v > max) max = v;
|
| 34 |
+
}
|
| 35 |
+
out[b] = max;
|
| 36 |
+
if (max > globalMax) globalMax = max;
|
| 37 |
+
}
|
| 38 |
+
// Normalize — if silent or NaN, fall back to flat low bars
|
| 39 |
+
const peak = Number.isFinite(globalMax) && globalMax > 1e-5 ? globalMax : 1;
|
| 40 |
+
for (let i = 0; i < NUM_BARS; i++) {
|
| 41 |
+
const n = out[i] / peak;
|
| 42 |
+
out[i] = Number.isFinite(n) ? Math.max(0.05, Math.min(1, n)) : 0.05;
|
| 43 |
+
}
|
| 44 |
+
if (!cancelled) setPeaks(out);
|
| 45 |
+
ctx.close?.();
|
| 46 |
+
} catch (e) {
|
| 47 |
+
console.warn("waveform decode failed:", e);
|
| 48 |
+
// Still show fallback bars so UI isn't broken
|
| 49 |
+
if (!cancelled) setPeaks(new Float32Array(NUM_BARS).fill(0.1));
|
| 50 |
+
}
|
| 51 |
+
})();
|
| 52 |
+
return () => { cancelled = true; };
|
| 53 |
+
}, [src]);
|
| 54 |
+
|
| 55 |
+
useEffect(() => {
|
| 56 |
+
const a = audioRef.current;
|
| 57 |
+
if (!a) return;
|
| 58 |
+
const onTime = () => setProgress(a.duration ? a.currentTime / a.duration : 0);
|
| 59 |
+
const onEnd = () => setPlaying(false);
|
| 60 |
+
a.addEventListener("timeupdate", onTime);
|
| 61 |
+
a.addEventListener("ended", onEnd);
|
| 62 |
+
return () => {
|
| 63 |
+
a.removeEventListener("timeupdate", onTime);
|
| 64 |
+
a.removeEventListener("ended", onEnd);
|
| 65 |
+
};
|
| 66 |
+
}, [src]);
|
| 67 |
+
|
| 68 |
+
const toggle = () => {
|
| 69 |
+
const a = audioRef.current;
|
| 70 |
+
if (!a) return;
|
| 71 |
+
if (a.paused) { a.play(); setPlaying(true); }
|
| 72 |
+
else { a.pause(); setPlaying(false); }
|
| 73 |
+
};
|
| 74 |
+
|
| 75 |
+
const seek = (e) => {
|
| 76 |
+
const a = audioRef.current;
|
| 77 |
+
if (!a || !a.duration) return;
|
| 78 |
+
const rect = e.currentTarget.getBoundingClientRect();
|
| 79 |
+
const x = (e.clientX - rect.left) / rect.width;
|
| 80 |
+
a.currentTime = Math.max(0, Math.min(1, x)) * a.duration;
|
| 81 |
+
setProgress(x);
|
| 82 |
+
};
|
| 83 |
+
|
| 84 |
+
return (
|
| 85 |
+
<div className="flex items-center gap-3 w-full">
|
| 86 |
+
<audio ref={audioRef} src={src} preload="auto" />
|
| 87 |
+
<button
|
| 88 |
+
onClick={toggle}
|
| 89 |
+
className="flex-shrink-0 w-10 h-10 rounded-full flex items-center justify-center hover:scale-105 transition cursor-pointer"
|
| 90 |
+
style={{ background: "var(--accent)", color: "var(--bg)" }}
|
| 91 |
+
aria-label={playing ? "Pause" : "Play"}
|
| 92 |
+
>
|
| 93 |
+
{playing ? (
|
| 94 |
+
<svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor"><rect x="3" y="2" width="3.5" height="12" rx="1" /><rect x="9.5" y="2" width="3.5" height="12" rx="1" /></svg>
|
| 95 |
+
) : (
|
| 96 |
+
<svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor"><path d="M3.5 2.5v11a0.5 0.5 0 0 0 .8 .4l9 -5.5a0.5 0.5 0 0 0 0 -.8l-9 -5.5a0.5 0.5 0 0 0 -.8 .4z" /></svg>
|
| 97 |
+
)}
|
| 98 |
+
</button>
|
| 99 |
+
|
| 100 |
+
<div
|
| 101 |
+
onClick={seek}
|
| 102 |
+
className="flex-1 flex items-end gap-[2px] h-14 cursor-pointer select-none overflow-hidden"
|
| 103 |
+
>
|
| 104 |
+
{Array.from({ length: NUM_BARS }, (_, i) => {
|
| 105 |
+
// Compute height defensively — never rely on peaks array directly
|
| 106 |
+
let v = 0.15;
|
| 107 |
+
if (peaks && peaks[i] != null) {
|
| 108 |
+
const p = Number(peaks[i]);
|
| 109 |
+
if (Number.isFinite(p)) v = Math.max(0.05, Math.min(1, p));
|
| 110 |
+
}
|
| 111 |
+
const prog = Number.isFinite(progress) ? progress : 0;
|
| 112 |
+
const active = (i / NUM_BARS) < prog;
|
| 113 |
+
const heightPct = Math.max(4, Math.min(100, v * 100));
|
| 114 |
+
return (
|
| 115 |
+
<div
|
| 116 |
+
key={i}
|
| 117 |
+
className="flex-1 rounded-[2px] transition-colors"
|
| 118 |
+
style={{
|
| 119 |
+
height: `${heightPct}%`,
|
| 120 |
+
background: active ? "var(--accent)" : "var(--border)",
|
| 121 |
+
}}
|
| 122 |
+
/>
|
| 123 |
+
);
|
| 124 |
+
})}
|
| 125 |
+
</div>
|
| 126 |
+
|
| 127 |
+
{Number.isFinite(Number(duration)) && Number(duration) > 0 && (
|
| 128 |
+
<div className="flex-shrink-0 text-xs font-mono" style={{ color: "var(--text-muted)" }}>
|
| 129 |
+
{Number(duration)}s
|
| 130 |
+
</div>
|
| 131 |
+
)}
|
| 132 |
+
</div>
|
| 133 |
+
);
|
| 134 |
+
}
|
_source/src/hooks/useModel.js
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useRef, useCallback, useEffect } from "react";

/**
 * React hook that owns the generation web worker (../worker.js) and exposes its
 * lifecycle as plain state: load progress, generation status, and the resulting
 * audio as a blob object URL.
 *
 * Object-URL ownership: this hook creates every URL it hands out and revokes it
 * on (a) a new "audio" message, (b) the start of a new generation, and (c) unmount.
 * Callers must not revoke `audioUrl` themselves.
 */
export function useModel() {
  const workerRef = useRef(null);
  // Mirror of the currently-live object URL so cleanup can revoke it without
  // depending on state closure freshness.
  const audioUrlRef = useRef(null);
  const [status, setStatus] = useState("idle");       // "idle" | "loading" | "ready" | "generating" | "error"
  const [message, setMessage] = useState("");         // last human-readable status line from the worker
  const [progress, setProgress] = useState(null);     // download progress payload, or null when not loading
  const [audioUrl, setAudioUrl] = useState(null);     // blob: URL of the latest generated WAV
  const [audioInfo, setAudioInfo] = useState(null);   // { duration, diffusionTime, totalTime, filename }
  const [error, setError] = useState(null);
  const [isLoaded, setIsLoaded] = useState(false);

  // Revoke a URL owned by this hook and forget it.
  const revokeCurrentAudioUrl = useCallback(() => {
    if (audioUrlRef.current) {
      URL.revokeObjectURL(audioUrlRef.current);
      audioUrlRef.current = null;
    }
  }, []);

  // Spawn the worker once on mount; terminate and release the URL on unmount.
  useEffect(() => {
    const worker = new Worker(new URL("../worker.js", import.meta.url), {
      type: "module",
    });

    // Message protocol from worker.js: status | progress | loaded | audio | error.
    worker.onmessage = (e) => {
      const { type, ...data } = e.data;
      switch (type) {
        case "status":
          setMessage(data.message);
          break;
        case "progress":
          setProgress(data);
          break;
        case "loaded":
          setIsLoaded(true);
          setStatus("ready");
          setProgress(null);
          break;
        case "audio": {
          // Revoke any previous URL owned by this hook before overwriting.
          if (audioUrlRef.current) URL.revokeObjectURL(audioUrlRef.current);
          const blob = new Blob([data.wavBuffer], { type: "audio/wav" });
          const url = URL.createObjectURL(blob);
          audioUrlRef.current = url;
          setAudioUrl(url);
          setAudioInfo({
            duration: data.duration,
            diffusionTime: data.diffusionTime,
            totalTime: data.totalTime,
            filename: `ace-step-${data.filenameStamp || Date.now()}.wav`,
          });
          setStatus("ready");
          setMessage("Generation complete!");
          break;
        }
        case "error":
          setError(data.message);
          setStatus("error");
          console.error("Worker error:", data.message, data.stack);
          break;
      }
    };

    workerRef.current = worker;
    return () => {
      worker.terminate();
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
        audioUrlRef.current = null;
      }
    };
  }, []);

  // Kick off model download/initialization in the worker. Idempotence is the
  // worker's concern; this just transitions UI state and posts the request.
  const loadModel = useCallback(() => {
    setStatus("loading");
    setError(null);
    workerRef.current?.postMessage({ type: "load" });
  }, []);

  // Request a generation. Clears the previous result first so the UI shows a
  // clean "generating" state and the old blob URL is released promptly.
  const generate = useCallback(({ caption, lyrics, duration, shift, numSteps }) => {
    setStatus("generating");
    setError(null);
    // Revoke the previous URL when user starts a new gen so the next "audio" message
    // doesn't compete with a still-displayed blob.
    revokeCurrentAudioUrl();
    setAudioUrl(null);
    setAudioInfo(null);
    workerRef.current?.postMessage({
      type: "generate",
      caption,
      lyrics,
      duration,
      shift,
      numSteps,
    });
  }, [revokeCurrentAudioUrl]);

  return {
    status,
    message,
    progress,
    audioUrl,
    audioInfo,
    error,
    isLoaded,
    loadModel,
    generate,
  };
}
|
_source/src/index.css
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@import "tailwindcss";

/* Design tokens — dark theme with a purple accent, all in oklch so lightness
   steps stay perceptually even. Components read these via var(--*). */
:root {
  --bg: oklch(0.13 0.006 260);
  --bg-elev: oklch(0.17 0.008 260);
  --surface: oklch(0.22 0.01 260);
  --border: oklch(0.28 0.01 260);
  --text: oklch(0.95 0.005 260);
  --text-muted: oklch(0.65 0.01 260);
  --text-dim: oklch(0.45 0.008 260);
  --accent: oklch(0.72 0.17 305);
  --accent-glow: oklch(0.80 0.18 305);
  --accent-soft: oklch(0.72 0.17 305 / 0.15);
  --success: oklch(0.72 0.14 155);
  --danger: oklch(0.65 0.2 22);
}

/* Full-height app shell. */
html, body, #root {
  min-height: 100vh;
  margin: 0;
}

body {
  background: var(--bg);
  color: var(--text);
  font-family: "Hanken Grotesk", system-ui, -apple-system, sans-serif;
  font-weight: 400;
  letter-spacing: -0.005em;
  -webkit-font-smoothing: antialiased;
}

code, pre, .font-mono {
  font-family: "JetBrains Mono", ui-monospace, "Consolas", monospace;
}

/* Animated generation placeholder — pulse bars like ace-step-jam */
@keyframes wave-pulse {
  0%, 100% { transform: scaleY(0.3); opacity: 0.2; }
  50% { transform: scaleY(1); opacity: 0.6; }
}

@keyframes fade-in {
  from { opacity: 0; transform: translateY(4px); }
  to { opacity: 1; transform: translateY(0); }
}

@keyframes soft-glow {
  0%, 100% { box-shadow: 0 0 20px oklch(0.72 0.17 305 / 0.2); }
  50% { box-shadow: 0 0 40px oklch(0.72 0.17 305 / 0.5); }
}

/* Applied per-bar by PulseBars.jsx with a staggered animation-delay. */
.pulse-bar {
  animation: wave-pulse 1.2s ease-in-out infinite;
  transform-origin: bottom;
}

.fade-in {
  animation: fade-in 0.3s ease-out;
}

.glow {
  animation: soft-glow 2s ease-in-out infinite;
}

/* Range slider styling (WebKit-only pseudo-elements; other engines get the
   browser-default track/thumb). */
input[type="range"] {
  -webkit-appearance: none;
  appearance: none;
  background: transparent;
  cursor: pointer;
}
input[type="range"]::-webkit-slider-runnable-track {
  height: 4px;
  background: var(--border);
  border-radius: 4px;
}
input[type="range"]::-webkit-slider-thumb {
  -webkit-appearance: none;
  appearance: none;
  height: 16px;
  width: 16px;
  background: var(--accent);
  border-radius: 50%;
  margin-top: -6px; /* center the 16px thumb on the 4px track */
  box-shadow: 0 0 12px oklch(0.72 0.17 305 / 0.4);
}
input[type="range"]:focus { outline: none; }

/* Scrollbar in textareas */
textarea::-webkit-scrollbar { width: 6px; }
textarea::-webkit-scrollbar-track { background: transparent; }
textarea::-webkit-scrollbar-thumb { background: var(--border); border-radius: 3px; }
textarea::-webkit-scrollbar-thumb:hover { background: var(--text-dim); }
|
_source/src/lm-worker.js
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Dedicated worker for the 5Hz LM. Isolated WASM heap lets the 1.77 GB model
|
| 2 |
+
// load without competing with DiT + encoders in the main worker.
|
| 3 |
+
import { AutoTokenizer } from "@huggingface/transformers";
|
| 4 |
+
import * as ort from "onnxruntime-web/webgpu";
|
| 5 |
+
|
| 6 |
+
const MODEL_REPO = "shreyask/ACE-Step-v1.5-ONNX";
// Pin an exact commit so cached weights never silently change under us.
const MODEL_REVISION = "bdabfb5684fd70fcc76f98cbb51bb9ebc47ee342";
const ONNX_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/${MODEL_REVISION}/onnx`;
const LM_TOKENIZER_REPO = "ACE-Step/acestep-5Hz-lm-0.6B";
// Cache Storage bucket name; bump the version suffix to invalidate stale blobs.
const CACHE_NAME = "ace-step-onnx-v12";

// KV-cache geometry fed to/returned by the ONNX graph (see createEmptyKV/extractKV);
// these must match the exported lm_kv_q4 model.
const NUM_KV_LAYERS = 28;
const NUM_KV_HEADS = 8;
const KV_HEAD_DIM = 128;
// Logits row width per position.
const VOCAB_SIZE = 217204;
// Valid range for <|audio_code_N|> values — extracted codes are clamped to [0, NUM_CODES-1].
const NUM_CODES = 64000;
// 25Hz latent frames per 5Hz LM code (generate() divides frame count by this).
const POOL_WINDOW = 5;
// Token id treated as end-of-generation (presumably the chat-template end token —
// TODO confirm against the tokenizer's special-token map).
const EOS_ID = 151645;

// Populated by loadModel(); null until the "load" message has been handled.
let tokenizer = null;
let session = null;
|
| 22 |
+
|
| 23 |
+
// Post a typed message to the main thread. Extra fields from `data` are merged
// after `type`, so a `type` key in `data` would intentionally win.
function post(type, data = {}) {
  const message = Object.assign({ type }, data);
  self.postMessage(message);
}
|
| 26 |
+
|
| 27 |
+
/**
 * Fetch `url` as an ArrayBuffer with Cache Storage memoization and streaming
 * progress posts (type "progress", tagged with `label`).
 *
 * Cache hits short-circuit with a single 100% progress post. Cache writes are
 * best-effort: quota/opaque failures are swallowed and the download still returns.
 *
 * @param {string} url - absolute URL of the asset to download.
 * @param {string} label - human-readable tag echoed in progress messages.
 * @returns {Promise<ArrayBuffer>} the full response body.
 * @throws {Error} on a non-2xx HTTP response.
 */
async function fetchBuffer(url, label) {
  const cache = await caches.open(CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) {
    post("progress", { label, loaded: 1, total: 1, percent: 100 });
    return await cached.arrayBuffer();
  }
  const response = await fetch(url);
  // Bug fix: previously a 404/5xx streamed (and cached!) the HTML error body as
  // if it were model data, producing a confusing failure later in session
  // creation. Fail loudly at the source instead.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${label}: HTTP ${response.status} for ${url}`);
  }
  // content-length may be absent (chunked/compressed) — then percent posts are skipped.
  const total = Number.parseInt(response.headers.get("content-length") || "0", 10);
  const reader = response.body.getReader();
  const chunks = [];
  let loaded = 0;
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.length;
    if (total > 0) post("progress", { label, loaded, total, percent: (loaded / total) * 100 });
  }
  // Reassemble the streamed chunks into one contiguous buffer.
  const buf = new Uint8Array(loaded);
  let offset = 0;
  for (const c of chunks) { buf.set(c, offset); offset += c.length; }
  try {
    // Store a copy (slice) so the returned buffer isn't detached/shared with the cache.
    await cache.put(url, new Response(buf.buffer.slice(0), { headers: { "Content-Type": "application/octet-stream" } }));
  } catch (_) {}
  return buf.buffer;
}
|
| 54 |
+
|
| 55 |
+
// Shorthand for constructing an onnxruntime-web tensor; dtype defaults to float32.
function tensor(data, dims, type = "float32") {
  return new ort.Tensor(type, data, dims);
}
|
| 58 |
+
|
| 59 |
+
/**
 * Download tokenizer + ONNX graph/weights and create the inference session,
 * posting "status"/"progress" along the way and "loaded" on success.
 * Assigns the module-level `tokenizer` and `session`.
 */
async function loadModel() {
  // Configure the WASM backend before session creation: single-threaded
  // (isolated heap for this worker) with SIMD enabled.
  ort.env.wasm.numThreads = 1;
  ort.env.wasm.simd = true;

  post("status", { message: "Loading LM tokenizer..." });
  tokenizer = await AutoTokenizer.from_pretrained(LM_TOKENIZER_REPO);

  post("status", { message: "Loading LM graph..." });
  const graphBuf = await fetchBuffer(`${ONNX_BASE}/lm_kv_q4.onnx`, "LM graph");

  post("status", { message: "Loading LM weights (1.24 GB q4)..." });
  const weightsBuf = await fetchBuffer(`${ONNX_BASE}/lm_kv_q4.onnx.data`, "LM weights");

  post("status", { message: "Creating LM session..." });
  // Try WebGPU first (faster), fall back to WASM if unsupported ops
  try {
    session = await ort.InferenceSession.create(graphBuf, {
      executionProviders: ["webgpu"],
      // Weights live in an external-data file; the path must match the graph's reference.
      externalData: [{ path: "lm_kv_q4.onnx.data", data: weightsBuf }],
    });
    post("status", { message: "LM on WebGPU" });
  } catch (err) {
    console.warn("LM WebGPU failed, falling back to WASM:", err.message);
    session = await ort.InferenceSession.create(graphBuf, {
      executionProviders: ["wasm"],
      externalData: [{ path: "lm_kv_q4.onnx.data", data: weightsBuf }],
    });
    post("status", { message: "LM on WASM (WebGPU unsupported)" });
  }

  post("status", { message: "LM ready" });
  post("loaded");
}
|
| 92 |
+
|
| 93 |
+
// Build the zero-length past_key_values feed used for the prefill step: one
// empty [1, NUM_KV_HEADS, 0, KV_HEAD_DIM] key and value tensor per layer.
function createEmptyKV() {
  const feeds = {};
  for (let layer = 0; layer < NUM_KV_LAYERS; layer++) {
    for (const part of ["key", "value"]) {
      feeds[`past_key_values.${layer}.${part}`] = tensor(
        new Float32Array(0),
        [1, NUM_KV_HEADS, 0, KV_HEAD_DIM],
      );
    }
  }
  return feeds;
}
|
| 101 |
+
|
| 102 |
+
// Re-key the session's `present.*` outputs as the next step's
// `past_key_values.*` inputs (tensors are passed through untouched).
function extractKV(outputs) {
  const pairs = [];
  for (let layer = 0; layer < NUM_KV_LAYERS; layer++) {
    pairs.push(
      [`past_key_values.${layer}.key`, outputs[`present.${layer}.key`]],
      [`past_key_values.${layer}.value`, outputs[`present.${layer}.value`]],
    );
  }
  return Object.fromEntries(pairs);
}
|
| 110 |
+
|
| 111 |
+
/**
 * Sample one token id from raw logits.
 * Pipeline: repetition penalty (over the trailing `repWindow` tokens) →
 * temperature scaling → top-k truncation → top-p (nucleus) cutoff →
 * multinomial draw. Returns an index into `logits`.
 */
function sampleToken(logits, recentTokens, { temperature = 0.8, topK = 200, topP = 0.95, repetitionPenalty = 1.05, repWindow = 64 } = {}) {
  const vocab = logits.length;
  const adjusted = Float32Array.from(logits);

  // Penalize tokens seen recently: divide positive logits, multiply negative
  // ones (the standard CTRL-style penalty direction).
  if (repetitionPenalty !== 1.0 && recentTokens.length > 0) {
    for (const tok of new Set(recentTokens.slice(-repWindow))) {
      if (tok >= 0 && tok < vocab) {
        adjusted[tok] = adjusted[tok] > 0
          ? adjusted[tok] / repetitionPenalty
          : adjusted[tok] * repetitionPenalty;
      }
    }
  }

  // Temperature scaling (skip the no-op cases).
  if (temperature > 0 && temperature !== 1.0) {
    const scale = 1.0 / temperature;
    for (let i = 0; i < vocab; i++) adjusted[i] *= scale;
  }

  // Top-K via full sort (good enough — sort overhead << LM forward pass).
  const k = Math.min(topK, vocab);
  const order = Array.from({ length: vocab }, (_, i) => i);
  order.sort((a, b) => adjusted[b] - adjusted[a]);
  const candidates = order.slice(0, k);

  // Numerically stable softmax over the k candidates (subtract the max).
  let best = -Infinity;
  for (const i of candidates) if (adjusted[i] > best) best = adjusted[i];
  const probs = new Float64Array(k);
  let total = 0;
  for (let i = 0; i < k; i++) {
    const e = Math.exp(adjusted[candidates[i]] - best);
    probs[i] = e;
    total += e;
  }
  for (let i = 0; i < k; i++) probs[i] /= total;

  // Nucleus cutoff: smallest prefix whose cumulative mass reaches topP.
  let mass = 0;
  let nucleus = k;
  for (let i = 0; i < k; i++) {
    mass += probs[i];
    if (mass >= topP) { nucleus = i + 1; break; }
  }

  // Multinomial draw within the nucleus (renormalized by its own mass).
  let nucleusMass = 0;
  for (let i = 0; i < nucleus; i++) nucleusMass += probs[i];
  let threshold = Math.random() * nucleusMass;
  for (let i = 0; i < nucleus; i++) {
    threshold -= probs[i];
    if (threshold < 0) return candidates[i];
  }
  return candidates[nucleus - 1];
}
|
| 170 |
+
|
| 171 |
+
// Assemble the chat-format prompt for the 5Hz LM: instruction, caption,
// lyrics (or "[instrumental]" when blank), and metadata sections, wrapped in
// <|im_start|>user ... <|im_end|> / <|im_start|>assistant markers.
function buildPrompt(caption, lyrics, duration, language = "en") {
  const instruction = "Generate audio semantic tokens based on the given conditions";
  const hasLyrics = lyrics.trim().length > 0;
  const lyricsSection = hasLyrics
    ? `# Languages\n${language}\n\n# Lyrics\n${lyrics}`
    : "# Lyrics\n[instrumental]";
  const sections = [
    `# Instruction\n${instruction}`,
    `# Caption\n${caption}`,
    lyricsSection,
    `# Metas\n- language: ${language}\n- duration: ${duration} seconds`,
  ];
  const userPrompt = `${sections.join("\n\n")}\n<|endoftext|>\n`;
  return `<|im_start|>user\n${userPrompt}<|im_end|>\n<|im_start|>assistant\n`;
}
|
| 179 |
+
|
| 180 |
+
/**
 * Run the autoregressive 5Hz LM: prefill the prompt, then decode token-by-token
 * with a KV cache, stopping at EOS or once enough <|audio_code_N|> tokens have
 * been produced. Posts "audio_codes" with an Int32Array of code ids.
 *
 * @param {string} caption - style/genre prompt text.
 * @param {string} lyrics - lyric text ("" for instrumental).
 * @param {number} duration - target clip length in seconds.
 * @param {number} numLatentFrames - 25Hz frame count; divided by POOL_WINDOW for the 5Hz code budget.
 */
async function generate({ caption, lyrics, duration, numLatentFrames }) {
  const numCodes5Hz = Math.ceil(numLatentFrames / POOL_WINDOW);
  post("status", { message: `LM: generating ~${numCodes5Hz} codes...` });

  const prompt = buildPrompt(caption, lyrics, Math.round(duration));
  const encoded = tokenizer(prompt);
  const promptIds = Array.from(encoded.input_ids.data, Number);
  // CoT metadata ~150 tokens + numCodes5Hz audio codes + some slack
  const maxNewTokens = Math.min(numCodes5Hz + 250, 600);
  // NOTE: /g regex is stateful (lastIndex) — the decode loop resets it after each .test().
  const audioCodeTokenRegex = /<\|audio_code_(\d+)\|>/g;

  const startTime = performance.now();
  const allIds = [...promptIds];

  // Prefill: run the whole prompt through the graph once with empty KV to seed the cache.
  post("status", { message: `LM prefill (${promptIds.length} tokens)...` });
  const prefillIds = new BigInt64Array(promptIds.map(BigInt));
  const prefillMask = new BigInt64Array(promptIds.length).fill(1n);
  const prefillPos = new BigInt64Array(promptIds.map((_, i) => BigInt(i)));

  let outputs = await session.run({
    input_ids: tensor(prefillIds, [1, promptIds.length], "int64"),
    attention_mask: tensor(prefillMask, [1, promptIds.length], "int64"),
    position_ids: tensor(prefillPos, [1, promptIds.length], "int64"),
    ...createEmptyKV(),
  });
  let kv = extractKV(outputs);

  // Logits for the final prompt position determine the first sampled token.
  let lastLogits = outputs.logits.data.slice((promptIds.length - 1) * VOCAB_SIZE, promptIds.length * VOCAB_SIZE);
  let nextToken = sampleToken(lastLogits, allIds);
  allIds.push(nextToken);

  // Decode loop — exit early once we have enough audio codes
  let codesSoFar = 0;
  for (let step = 0; step < maxNewTokens - 1; step++) {
    if (nextToken === EOS_ID) break;
    if (codesSoFar >= numCodes5Hz) break; // have enough codes, stop early
    if (step % 20 === 0) {
      const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
      const tps = (step / Math.max(parseFloat(elapsed), 0.1)).toFixed(1);
      post("status", { message: `LM: ${step} tokens, ${codesSoFar}/${numCodes5Hz} codes (${tps} tok/s)` });
    }

    // Single-token step: feed only the new token; the KV cache carries the history.
    const seqLen = allIds.length;
    outputs = await session.run({
      input_ids: tensor(new BigInt64Array([BigInt(nextToken)]), [1, 1], "int64"),
      attention_mask: tensor(new BigInt64Array(seqLen).fill(1n), [1, seqLen], "int64"),
      position_ids: tensor(new BigInt64Array([BigInt(seqLen - 1)]), [1, 1], "int64"),
      ...kv,
    });
    kv = extractKV(outputs);
    lastLogits = outputs.logits.data.slice(0, VOCAB_SIZE);
    nextToken = sampleToken(lastLogits, allIds);
    allIds.push(nextToken);

    // Streaming decode — check if this token is an audio code
    const tokText = tokenizer.decode([nextToken], { skip_special_tokens: false });
    if (audioCodeTokenRegex.test(tokText)) codesSoFar++;
    audioCodeTokenRegex.lastIndex = 0;
  }

  const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
  const generatedIds = allIds.slice(promptIds.length);
  const outputText = tokenizer.decode(generatedIds, { skip_special_tokens: false });
  console.log(`[lm] ${generatedIds.length} tokens in ${elapsed}s`);

  // Find end of thinking
  const thinkEnd = outputText.indexOf("</think>");
  console.log("[lm] CoT length:", thinkEnd >= 0 ? thinkEnd : "no </think> found");
  console.log("[lm] preview (CoT):", thinkEnd >= 0 ? outputText.slice(0, thinkEnd + 10) : outputText.slice(0, 500));
  console.log("[lm] preview (after think):", thinkEnd >= 0 ? outputText.slice(thinkEnd, thinkEnd + 500) : "(n/a)");

  // Extract every audio code from the decoded text, clamped to the valid range.
  const audioCodes = [];
  for (const m of outputText.matchAll(/<\|audio_code_(\d+)\|>/g)) {
    audioCodes.push(Math.min(Math.max(parseInt(m[1]), 0), NUM_CODES - 1));
  }
  console.log(`[lm] extracted ${audioCodes.length} audio codes, first 10:`, audioCodes.slice(0, 10));
  // Truncate if too many but DON'T zero-pad — main worker uses last-frame padding in 25Hz space (matches MLX port)
  const codes = new Int32Array(audioCodes.slice(0, numCodes5Hz));

  post("audio_codes", { codes, elapsed, tokenCount: generatedIds.length });
}
|
| 262 |
+
|
| 263 |
+
// Worker entry point: dispatch "load"/"generate" requests and surface any
// failure to the main thread as an "error" message.
self.onmessage = async (e) => {
  const { type, ...payload } = e.data;
  try {
    switch (type) {
      case "load":
        await loadModel();
        break;
      case "generate":
        await generate(payload);
        break;
    }
  } catch (err) {
    post("error", { message: err.message, stack: err.stack });
  }
};
|
_source/src/main.jsx
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { StrictMode } from 'react'
|
| 2 |
+
import { createRoot } from 'react-dom/client'
|
| 3 |
+
import './index.css'
|
| 4 |
+
import App from './App.jsx'
|
| 5 |
+
|
| 6 |
+
createRoot(document.getElementById('root')).render(
|
| 7 |
+
<StrictMode>
|
| 8 |
+
<App />
|
| 9 |
+
</StrictMode>,
|
| 10 |
+
)
|
_source/src/worker.js
ADDED
|
@@ -0,0 +1,665 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Main worker: DiT + encoders + VAE on WebGPU. Spawns a dedicated LM worker
// (isolated WASM heap) for autoregressive generation.
import { AutoTokenizer } from "@huggingface/transformers";
import * as ort from "onnxruntime-web/webgpu";

// Model assets are pinned to a specific HF repo revision for reproducibility.
const MODEL_REPO = "shreyask/ACE-Step-v1.5-ONNX";
const MODEL_REVISION = "bdabfb5684fd70fcc76f98cbb51bb9ebc47ee342";
const ONNX_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/${MODEL_REVISION}/onnx`;
const TEXT_TOKENIZER_REPO = "Qwen/Qwen3-Embedding-0.6B";

// Audio / latent geometry constants.
const SAMPLE_RATE = 48000;      // output waveform sample rate (Hz)
const LATENT_RATE = 25;         // latent frames per second of audio
const LATENT_CHANNELS = 64;     // channels per latent frame
const HIDDEN_SIZE = 2048;       // conditioning/embedding hidden dimension
const POOL_WINDOW = 5;          // 25Hz -> 5Hz pooling factor
const FSQ_DIM = 6;              // FSQ code vector dimensionality
const NUM_CODES = 64000;        // FSQ codebook size (valid code index range)

// 8-step turbo schedules (from ACE-Step), keyed by the shift factor.
// Used verbatim by buildSchedule() when numSteps === 8.
const SHIFT_TIMESTEPS_8 = {
  1.0: [1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125],
  2.0: [1.0, 0.9333, 0.8571, 0.7692, 0.6667, 0.5455, 0.4, 0.2222],
  3.0: [1.0, 0.9545, 0.9, 0.8333, 0.75, 0.6429, 0.5, 0.3],
};
|
| 25 |
+
|
| 26 |
+
// Generate N-step shifted schedule matching MLX port:
//   timesteps = linspace(1.0, 0.001, N)
//   sigmas    = shift * t / (1 + (shift - 1) * t)
//
// @param {number} numSteps - number of denoising steps (>= 1)
// @param {number} shift    - flow-matching shift factor (1.0 / 2.0 / 3.0 typical)
// @returns {number[]} monotonically decreasing timestep schedule of length numSteps
function buildSchedule(numSteps, shift) {
  // Prefer the hand-tuned 8-step turbo tables when one exists for this shift.
  if (numSteps === 8 && SHIFT_TIMESTEPS_8[shift]) return SHIFT_TIMESTEPS_8[shift];
  const sigmaMax = 1.0;
  const sigmaMin = 0.001;
  // Guard: the linspace below divides by (numSteps - 1); with numSteps === 1 the
  // original produced 0/0 = NaN. A single-step schedule is just the shifted
  // sigmaMax (which evaluates to exactly 1.0 for any shift).
  if (numSteps === 1) return [(shift * sigmaMax) / (1.0 + (shift - 1.0) * sigmaMax)];
  const schedule = [];
  for (let i = 0; i < numSteps; i++) {
    // linspace inclusive of both endpoints
    const t = sigmaMax + (sigmaMin - sigmaMax) * (i / (numSteps - 1));
    const tShifted = (shift * t) / (1.0 + (shift - 1.0) * t);
    schedule.push(tShifted);
  }
  return schedule;
}
|
| 42 |
+
|
| 43 |
+
// Bump the suffix to invalidate previously cached model downloads.
const CACHE_NAME = "ace-step-onnx-v12";

// Lazily-initialized module state, populated by loadModels().
let textTokenizer = null;   // Qwen3 tokenizer shared by caption + lyric encoding
let sessions = {};          // model name -> ort.InferenceSession
let silenceLatent = null;   // Float32Array silence reference latent (timbre / ref audio)
let fsqCodebooks = null;    // Float32Array, indexed as [code * FSQ_DIM + d] in fsqLookup
let fsqScales = null;       // Float32Array of FSQ_DIM per-dimension scales
let fsqProjectOutW = null;  // Float32Array projection weight, indexed [h * FSQ_DIM + d]
let fsqProjectOutB = null;  // Float32Array projection bias of length HIDDEN_SIZE
let lmWorker = null;        // dedicated LM Worker instance (spawned on demand)
let lmLoaded = false;       // set true once the LM worker reports "loaded"
|
| 54 |
+
|
| 55 |
+
// Relay a typed message to the main thread; extra fields in `data` are merged
// into the payload (a `type` key inside `data` wins, as with object spread).
function post(type, data = {}) {
  const payload = { type };
  Object.assign(payload, data);
  self.postMessage(payload);
}
|
| 58 |
+
|
| 59 |
+
/**
 * Download a binary asset with Cache Storage-backed persistence and streaming
 * progress reporting.
 *
 * @param {string} url   - absolute or site-relative URL of the asset
 * @param {string} label - human-readable label used in "progress" messages
 * @returns {Promise<ArrayBuffer>} the downloaded (or cached) bytes
 */
async function fetchBuffer(url, label) {
  const cache = await caches.open(CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) {
    // Cache hit: report instant completion so the UI progress bar fills.
    post("progress", { label, loaded: 1, total: 1, percent: 100 });
    return await cached.arrayBuffer();
  }

  const response = await fetch(url);
  // content-length may be absent (e.g. chunked encoding) — progress events
  // are simply skipped in that case (total stays 0).
  const total = parseInt(response.headers.get("content-length") || "0");
  const reader = response.body.getReader();
  const chunks = [];
  let loaded = 0;

  // Stream the body chunk by chunk so we can emit incremental progress.
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.length;
    if (total > 0) post("progress", { label, loaded, total, percent: (loaded / total) * 100 });
  }

  // Reassemble the chunks into one contiguous buffer.
  const buffer = new Uint8Array(loaded);
  let offset = 0;
  for (const chunk of chunks) { buffer.set(chunk, offset); offset += chunk.length; }

  try {
    // slice(0) copies the bytes so the cached Response owns its own buffer.
    // Cache failures (e.g. quota exceeded) are non-fatal: persistence is
    // best-effort and the freshly downloaded bytes are still returned.
    await cache.put(url, new Response(buffer.buffer.slice(0), {
      headers: { "Content-Type": "application/octet-stream" },
    }));
  } catch (_) {}

  return buffer.buffer;
}
|
| 93 |
+
|
| 94 |
+
/**
 * Create an ONNX Runtime InferenceSession for one model file, with download
 * progress reporting via fetchBuffer().
 *
 * @param {string}   name       - human-readable model name for status messages
 * @param {string}   filename   - model graph filename under ONNX_BASE
 * @param {boolean}  useUrlData - if true, let ORT stream external weights
 *                                straight from the URL instead of downloading
 *                                them into this worker's heap (needed for
 *                                multi-GB weight files)
 * @param {string[]} providers  - ORT execution providers (default WebGPU)
 * @returns {Promise<ort.InferenceSession>}
 * @throws {Error} wrapping the underlying failure, with `cause` preserved
 */
async function loadSession(name, filename, useUrlData = false, providers = ["webgpu"]) {
  post("status", { message: `Loading ${name}...` });
  try {
    const modelBuffer = await fetchBuffer(`${ONNX_BASE}/${filename}`, `${name} graph`);
    if (useUrlData) {
      // URL-backed external data: ORT fetches `<filename>.data` itself.
      return await ort.InferenceSession.create(modelBuffer, {
        executionProviders: providers,
        externalData: [{ path: `${filename}.data`, data: `${ONNX_BASE}/${filename}.data` }],
      });
    }
    // Otherwise download the weights ourselves (cached via fetchBuffer).
    const weightsBuffer = await fetchBuffer(`${ONNX_BASE}/${filename}.data`, `${name} weights`);
    return await ort.InferenceSession.create(modelBuffer, {
      executionProviders: providers,
      externalData: [{ path: `${filename}.data`, data: weightsBuffer }],
    });
  } catch (err) {
    // Preserve the original error as `cause` so the stack isn't lost.
    throw new Error(`Failed loading ${name}: ${err.message}`, { cause: err });
  }
}
|
| 113 |
+
|
| 114 |
+
// Shorthand for constructing an ort.Tensor; dtype defaults to float32.
function tensor(data, dims, type = "float32") {
  const t = new ort.Tensor(type, data, dims);
  return t;
}
|
| 117 |
+
|
| 118 |
+
// Debug helper: log length/min/max/mean of a numeric buffer to the console.
function tensorStats(name, data) {
  const values = data instanceof Float32Array ? data : new Float32Array(data);
  let min = Infinity;
  let max = -Infinity;
  let sum = 0;
  for (const v of values) {
    if (v < min) min = v;
    if (v > max) max = v;
    sum += v;
  }
  const mean = sum / values.length;
  console.log(`[stats] ${name}: len=${values.length} min=${min.toFixed(4)} max=${max.toFixed(4)} mean=${mean.toFixed(4)}`);
}
|
| 128 |
+
|
| 129 |
+
/**
 * Fill a Float32Array with standard-normal samples using the Box-Muller
 * transform (two samples per iteration).
 *
 * @param {number[]} shape - tensor shape; the product of its entries gives the
 *                           element count
 * @returns {Float32Array} N(0,1) samples of length prod(shape)
 */
function randn(shape) {
  const size = shape.reduce((a, b) => a * b, 1);
  const data = new Float32Array(size);
  for (let i = 0; i < size; i += 2) {
    // Math.random() returns [0, 1); map to (0, 1] so Math.log never sees 0
    // (log(0) = -Infinity would have produced infinite samples).
    const u1 = 1 - Math.random();
    const u2 = Math.random();
    const r = Math.sqrt(-2 * Math.log(u1));
    data[i] = r * Math.cos(2 * Math.PI * u2);
    // Guard the pair write for odd sizes.
    if (i + 1 < size) data[i + 1] = r * Math.sin(2 * Math.PI * u2);
  }
  return data;
}
|
| 141 |
+
|
| 142 |
+
/**
 * Pack two padded sequences into a single sequence per batch row, moving all
 * valid (mask > 0) tokens to the front while preserving their relative order:
 * valid tokens of sequence 1, then valid tokens of sequence 2, then padding.
 * Relies on Array.prototype.sort being stable (guaranteed since ES2019):
 * sorting by mask descending keeps the original within-mask-value order.
 *
 * @param {Float32Array} hidden1 - [batchSize, l1, dim] flattened hidden states
 * @param {Float32Array} mask1   - [batchSize, l1] attention mask (0/1)
 * @param {Float32Array} hidden2 - [batchSize, l2, dim] flattened hidden states
 * @param {Float32Array} mask2   - [batchSize, l2] attention mask (0/1)
 * @param {number} batchSize
 * @param {number} dim
 * @returns {{hidden: Float32Array, mask: Float32Array, seqLen: number}}
 *          packed hidden states [batchSize, l1+l2, dim], binarized mask, and
 *          the combined sequence length
 */
function packSequences(hidden1, mask1, hidden2, mask2, batchSize, dim) {
  // Per-source sequence lengths are recovered from the flattened array sizes.
  const l1 = hidden1.length / (batchSize * dim);
  const l2 = hidden2.length / (batchSize * dim);
  const totalLen = l1 + l2;
  const packedHidden = new Float32Array(batchSize * totalLen * dim);
  const packedMask = new Float32Array(batchSize * totalLen);

  for (let b = 0; b < batchSize; b++) {
    // Build an index of every token from both sources with its mask value.
    const indices = [];
    for (let i = 0; i < l1; i++) indices.push({ src: 1, idx: i, mask: mask1[b * l1 + i] });
    for (let i = 0; i < l2; i++) indices.push({ src: 2, idx: i, mask: mask2[b * l2 + i] });
    // Stable sort by mask descending: valid tokens first, padding last.
    indices.sort((a, c) => c.mask - a.mask);

    // Copy each token's hidden vector into its packed position.
    for (let pos = 0; pos < totalLen; pos++) {
      const entry = indices[pos];
      const srcArray = entry.src === 1 ? hidden1 : hidden2;
      const srcLen = entry.src === 1 ? l1 : l2;
      const srcOffset = (b * srcLen + entry.idx) * dim;
      const dstOffset = (b * totalLen + pos) * dim;
      packedHidden.set(srcArray.slice(srcOffset, srcOffset + dim), dstOffset);
      // Binarize the mask (any positive value becomes exactly 1).
      packedMask[b * totalLen + pos] = entry.mask > 0 ? 1 : 0;
    }
  }
  return { hidden: packedHidden, mask: packedMask, seqLen: totalLen };
}
|
| 167 |
+
|
| 168 |
+
/**
 * Decode FSQ code indices into HIDDEN_SIZE-dim embeddings on the CPU:
 * embedding = projectOut(codebook[idx] * scales), i.e. each index selects a
 * FSQ_DIM-dim codebook vector which is scaled per-dimension and pushed through
 * an affine projection (weight indexed [h * FSQ_DIM + d], plus bias).
 *
 * Reads module globals fsqCodebooks / fsqScales / fsqProjectOutW /
 * fsqProjectOutB, which are populated by loadModels().
 *
 * @param {Int32Array|number[]} indices - [batchSize * seqLen] code indices
 * @param {number} batchSize
 * @param {number} seqLen
 * @returns {Float32Array} [batchSize, seqLen, HIDDEN_SIZE] flattened embeddings
 */
function fsqLookup(indices, batchSize, seqLen) {
  const out = new Float32Array(batchSize * seqLen * HIDDEN_SIZE);
  for (let b = 0; b < batchSize; b++) {
    for (let t = 0; t < seqLen; t++) {
      const idx = indices[b * seqLen + t];
      const codeOffset = idx * FSQ_DIM;
      // Scale the FSQ_DIM-dim code vector element-wise.
      const scaledCode = new Float32Array(FSQ_DIM);
      for (let d = 0; d < FSQ_DIM; d++) scaledCode[d] = fsqCodebooks[codeOffset + d] * fsqScales[d];
      // Affine projection up to HIDDEN_SIZE: out = W @ scaledCode + b.
      const outOffset = (b * seqLen + t) * HIDDEN_SIZE;
      for (let h = 0; h < HIDDEN_SIZE; h++) {
        let val = fsqProjectOutB[h];
        for (let d = 0; d < FSQ_DIM; d++) val += scaledCode[d] * fsqProjectOutW[h * FSQ_DIM + d];
        out[outOffset + h] = val;
      }
    }
  }
  return out;
}
|
| 186 |
+
|
| 187 |
+
// Spawn the LM worker and forward its status/progress/error messages up to
// the main thread. "loaded" and "audio_codes" are intentionally NOT forwarded
// here — they are consumed by the promise-based callers below.
function spawnLMWorker() {
  const workerUrl = new URL("./lm-worker.js", import.meta.url);
  const worker = new Worker(workerUrl, { type: "module" });
  const forwardedTypes = ["status", "progress", "error"];
  worker.onmessage = (event) => {
    if (forwardedTypes.includes(event.data.type)) {
      self.postMessage(event.data); // forward as-is
    }
  };
  return worker;
}
|
| 199 |
+
|
| 200 |
+
// Ask the LM worker to load its model. Spawns the worker on first use.
// Resolves when the worker posts "loaded"; rejects on "error".
function loadLMWorker() {
  return new Promise((resolve, reject) => {
    if (!lmWorker) {
      lmWorker = spawnLMWorker();
    }
    const handleMessage = (event) => {
      const { type } = event.data;
      if (type !== "loaded" && type !== "error") return;
      // One-shot listener: detach as soon as a terminal message arrives.
      lmWorker.removeEventListener("message", handleMessage);
      if (type === "loaded") {
        lmLoaded = true;
        resolve();
      } else {
        reject(new Error(event.data.message));
      }
    };
    lmWorker.addEventListener("message", handleMessage);
    lmWorker.postMessage({ type: "load" });
  });
}
|
| 217 |
+
|
| 218 |
+
// Request autoregressive audio-code generation from the LM worker.
// Resolves with the full "audio_codes" payload ({ codes, elapsed, tokenCount, ... });
// rejects if the worker posts "error".
function generateAudioCodesViaLM({ caption, lyrics, duration, numLatentFrames }) {
  return new Promise((resolve, reject) => {
    const handleMessage = (event) => {
      const { type } = event.data;
      if (type !== "audio_codes" && type !== "error") return;
      // One-shot listener: detach on the first terminal message.
      lmWorker.removeEventListener("message", handleMessage);
      if (type === "audio_codes") {
        resolve(event.data);
      } else {
        reject(new Error(event.data.message));
      }
    };
    lmWorker.addEventListener("message", handleMessage);
    lmWorker.postMessage({ type: "generate", caption, lyrics, duration, numLatentFrames });
  });
}
|
| 233 |
+
|
| 234 |
+
/**
 * Download and initialize every model session and auxiliary asset the pipeline
 * needs. Emits "status"/"progress" messages while loading and a final "loaded"
 * message. The LM worker loads in parallel with the main-worker sessions.
 *
 * NOTE(review): the sibling helpers encodeText/encodeLyrics/encodeTimbre
 * reference sessions (textProjector, lyricEncoder, timbreEncoder) that are
 * never created here — they appear superseded by the condition-encoder path
 * used in generateAudio; confirm before calling those helpers.
 */
async function loadModels() {
  // Single-threaded, non-proxied WASM keeps the ORT setup simple in a worker.
  ort.env.wasm.numThreads = 1;
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false;

  console.log(`[models] ONNX revision ${MODEL_REVISION}`);
  post("status", { message: `Using ONNX revision ${MODEL_REVISION.slice(0, 7)}` });

  post("status", { message: "Spawning LM worker..." });
  // Kick off LM loading in parallel with main-worker model loads
  const lmLoadPromise = loadLMWorker();

  post("status", { message: "Loading text tokenizer..." });
  textTokenizer = await AutoTokenizer.from_pretrained(TEXT_TOKENIZER_REPO);

  sessions.embedTokens = await loadSession("Embed Tokens", "text_embed_tokens_fp16.onnx");
  sessions.detokenizer = await loadSession("Detokenizer", "detokenizer.onnx");
  // VAE on WASM — WebGPU produces constant output past ~1.5s for conv1d upsample chain
  sessions.vaeDecoder = await loadSession("VAE Decoder (CPU)", "vae_decoder_fp16.onnx", false, ["wasm"]);
  sessions.textEncoder = await loadSession("Text Encoder", "text_encoder_fp16.onnx", true);
  // FP32 condition_encoder — q4v2 had max_diff=13.92 vs PyTorch with real inputs,
  // degrading conditioning so badly that DiT output was garbled. FP32 is 2.4GB via URL.
  sessions.conditionEncoder = await loadSession("Condition Encoder (fp32)", "condition_encoder.onnx", true);
  // DEBUG: dit_decoder_fp16_v2 is the quality baseline (max_diff=0.021 per step).
  // dit_cached trades quality for speed (max_diff=0.074). Reverting while we diagnose
  // the ONNX-vs-MLX spectral gap — compounded drift over 8 steps matters here.
  sessions.ditDecoder = await loadSession("DiT Decoder (uncached)", "dit_decoder_fp16_v2.onnx", true);

  post("status", { message: "Loading auxiliary data..." });
  // FSQ tables + silence latent are small binary blobs fetched in parallel.
  const [cbBuf, scBuf, powBuf, pobBuf, silBuf] = await Promise.all([
    fetchBuffer(`${ONNX_BASE}/fsq_codebooks.bin`, "codebooks"),
    fetchBuffer(`${ONNX_BASE}/fsq_scales.bin`, "scales"),
    fetchBuffer(`${ONNX_BASE}/fsq_project_out_weight.bin`, "proj_out_w"),
    fetchBuffer(`${ONNX_BASE}/fsq_project_out_bias.bin`, "proj_out_b"),
    fetchBuffer("/silence_latent.bin", "silence latent"),
  ]);
  fsqCodebooks = new Float32Array(cbBuf);
  fsqScales = new Float32Array(scBuf);
  fsqProjectOutW = new Float32Array(powBuf);
  fsqProjectOutB = new Float32Array(pobBuf);
  silenceLatent = new Float32Array(silBuf);

  post("status", { message: "Waiting for LM worker..." });
  await lmLoadPromise;

  post("status", { message: "All models loaded!" });
  post("loaded");
}
|
| 282 |
+
|
| 283 |
+
// Assemble the SFT-format conditioning prompt consumed by the text encoder:
// instruction, caption, and metas sections separated by blank lines, ending
// with the EOS marker.
function buildSFTPrompt(caption, metas) {
  const sections = [
    "# Instruction\nFill the audio semantic mask based on the given conditions:",
    `# Caption\n${caption}`,
    `# Metas\n${metas}<|endoftext|>`,
  ];
  return sections.join("\n\n");
}
|
| 287 |
+
|
| 288 |
+
/**
 * Encode a caption + metas prompt into projected text hidden states.
 *
 * NOTE(review): uses sessions.textProjector, which loadModels() never creates
 * — this helper appears superseded by the inline encoding in generateAudio;
 * confirm before calling.
 *
 * @param {string} caption
 * @param {string} metas
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeText(caption, metas) {
  const prompt = buildSFTPrompt(caption, metas);
  // Fixed-length tokenization: pad/truncate to exactly 256 tokens.
  const encoded = textTokenizer(prompt, { padding: "max_length", max_length: 256, truncation: true });
  const idsRaw = encoded.input_ids.data;
  // ORT int64 input requires BigInt64Array; convert if the tokenizer returned
  // another numeric array type.
  const inputIds = idsRaw instanceof BigInt64Array ? idsRaw : new BigInt64Array(Array.from(idsRaw, BigInt));

  const result = await sessions.textEncoder.run({ input_ids: tensor(inputIds, [1, 256], "int64") });
  const projected = await sessions.textProjector.run({ text_hidden_states: result.hidden_states });

  // Convert the (possibly BigInt) attention mask into a Float32Array.
  const maskRaw = encoded.attention_mask.data;
  const attentionMask = new Float32Array(maskRaw.length);
  for (let i = 0; i < maskRaw.length; i++) attentionMask[i] = Number(maskRaw[i]);
  return { hidden: projected.projected.data, mask: attentionMask, seqLen: 256 };
}
|
| 302 |
+
|
| 303 |
+
/**
 * Encode lyrics text into lyric-encoder hidden states.
 *
 * NOTE(review): uses sessions.lyricEncoder, which loadModels() never creates —
 * this helper appears superseded by the condition-encoder path in
 * generateAudio; confirm before calling.
 *
 * @param {string} lyrics
 * @param {string} [language="en"] - language tag embedded in the prompt header
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeLyrics(lyrics, language = "en") {
  const fullText = `# Languages\n${language}\n\n# Lyric\n${lyrics}`;
  // max_length=2048 matches the original handler (conditioning_text.py)
  const encoded = textTokenizer(fullText, { padding: "max_length", max_length: 2048, truncation: true });
  const idsRaw = encoded.input_ids.data;
  // ORT int64 input requires BigInt64Array; convert if needed.
  const inputIds = idsRaw instanceof BigInt64Array ? idsRaw : new BigInt64Array(Array.from(idsRaw, BigInt));
  const seqLen = inputIds.length;

  // Token ids -> embeddings, then embeddings + mask -> lyric hidden states.
  const embedResult = await sessions.embedTokens.run({ input_ids: tensor(inputIds, [1, seqLen], "int64") });
  const maskRaw = encoded.attention_mask.data;
  const attentionMask = new Float32Array(maskRaw.length);
  for (let i = 0; i < maskRaw.length; i++) attentionMask[i] = Number(maskRaw[i]);

  const lyricResult = await sessions.lyricEncoder.run({
    inputs_embeds: embedResult.hidden_states,
    attention_mask: tensor(attentionMask, [1, seqLen]),
  });
  return { hidden: lyricResult.hidden_states.data, mask: attentionMask, seqLen };
}
|
| 322 |
+
|
| 323 |
+
/**
 * Produce a single timbre embedding from the first 750 frames of the silence
 * reference latent.
 *
 * NOTE(review): uses sessions.timbreEncoder, which loadModels() never creates
 * — this helper appears superseded by the condition-encoder path in
 * generateAudio; confirm before calling.
 *
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 *          a single HIDDEN_SIZE-dim embedding with a length-1 mask
 */
async function encodeTimbre() {
  const silenceRef = silenceLatent.slice(0, 750 * LATENT_CHANNELS);
  const result = await sessions.timbreEncoder.run({
    refer_audio: tensor(silenceRef, [1, 750, LATENT_CHANNELS]),
  });
  // Copy the model output into a fresh HIDDEN_SIZE-length buffer.
  const timbreHidden = new Float32Array(HIDDEN_SIZE);
  timbreHidden.set(result.timbre_embedding.data);
  return { hidden: timbreHidden, mask: new Float32Array([1.0]), seqLen: 1 };
}
|
| 332 |
+
|
| 333 |
+
/**
 * Full LM-hint pipeline: LM worker -> 5Hz FSQ codes -> CPU codebook lookup ->
 * detokenizer (5Hz -> 25Hz latents) -> pad/truncate to numLatentFrames.
 *
 * @param {string} caption
 * @param {string} lyrics
 * @param {number} numLatentFrames - target number of 25Hz latent frames
 * @param {number} duration - requested audio duration in seconds (passed to LM)
 * @returns {Promise<Float32Array>} [numLatentFrames, LATENT_CHANNELS] hints,
 *          all-zero if the LM produced no codes
 */
async function generateLMHints(caption, lyrics, numLatentFrames, duration) {
  const { codes, elapsed, tokenCount } = await generateAudioCodesViaLM({ caption, lyrics, duration, numLatentFrames });
  post("status", { message: `LM: ${codes.length} codes from ${tokenCount} tokens in ${elapsed}s` });

  // Graceful fallback: no codes -> silence hints (all zeros).
  if (codes.length === 0) {
    console.warn("[lm] No audio codes generated, returning silence");
    return new Float32Array(numLatentFrames * LATENT_CHANNELS);
  }

  const numCodes5Hz = codes.length;
  post("status", { message: "FSQ codebook lookup..." });
  const lmHints5Hz = fsqLookup(codes, 1, numCodes5Hz);
  tensorStats("lm_hints_5hz", lmHints5Hz);

  // The detokenizer upsamples 5Hz token embeddings into 25Hz latent frames.
  post("status", { message: "Detokenizing 5Hz → 25Hz..." });
  const detokResult = await sessions.detokenizer.run({
    quantized: tensor(lmHints5Hz, [1, numCodes5Hz, HIDDEN_SIZE]),
  });
  const lmHints25HzRaw = detokResult.lm_hints_25hz.data;
  const rawLen = lmHints25HzRaw.length / LATENT_CHANNELS;
  tensorStats("lm_hints_25hz_raw", lmHints25HzRaw);

  // Pad with last frame (MLX port behavior) or truncate
  const lmHints25Hz = new Float32Array(numLatentFrames * LATENT_CHANNELS);
  if (rawLen >= numLatentFrames) {
    // Too many frames: truncate to the target length.
    lmHints25Hz.set(lmHints25HzRaw.slice(0, numLatentFrames * LATENT_CHANNELS));
  } else {
    lmHints25Hz.set(lmHints25HzRaw);
    // Repeat last frame to fill remaining
    const lastFrameStart = (rawLen - 1) * LATENT_CHANNELS;
    const lastFrame = lmHints25HzRaw.slice(lastFrameStart, lastFrameStart + LATENT_CHANNELS);
    for (let t = rawLen; t < numLatentFrames; t++) {
      lmHints25Hz.set(lastFrame, t * LATENT_CHANNELS);
    }
    console.log(`[hints] padded ${rawLen} → ${numLatentFrames} frames with last-frame replication`);
  }
  tensorStats("lm_hints_25hz_final", lmHints25Hz);
  return lmHints25Hz;
}
|
| 372 |
+
|
| 373 |
+
/**
 * End-to-end generation pipeline: text/lyric encoding -> LM hints ->
 * condition encoder -> flow-matching denoising with the DiT -> VAE decode ->
 * mastering -> WAV, posted to the main thread as an "audio" message.
 *
 * NOTE(review): masterWaveform and float32ToWav are not visible in this chunk
 * — presumably defined later in this file; confirm their contracts there.
 *
 * @param {object} opts
 * @param {string} opts.caption  - text prompt / style description
 * @param {string} opts.lyrics   - lyric text
 * @param {number} opts.duration - target duration in seconds
 * @param {number} opts.shift    - flow-matching shift factor for the schedule
 * @param {number} [opts.numSteps=8] - denoising step count
 */
async function generateAudio({ caption, lyrics, duration, shift, numSteps = 8 }) {
  const totalStartTime = performance.now();
  const filenameStamp = Date.now();
  const batchSize = 1;
  const numLatentFrames = Math.round(duration * LATENT_RATE);
  const tSchedule = buildSchedule(numSteps, shift);
  const metas = `duration: ${duration}s`;

  // 1. Text → Qwen3 embedding (1024-dim hidden states, BEFORE projection)
  post("status", { message: "Encoding text..." });
  const sftPrompt = buildSFTPrompt(caption, metas);
  const textEnc = textTokenizer(sftPrompt, { padding: "max_length", max_length: 256, truncation: true });
  const textIdsRaw = textEnc.input_ids.data;
  // ORT int64 input requires BigInt64Array; convert if needed.
  const textIds = textIdsRaw instanceof BigInt64Array ? textIdsRaw : new BigInt64Array(Array.from(textIdsRaw, BigInt));
  const textHiddenRes = await sessions.textEncoder.run({ input_ids: tensor(textIds, [1, 256], "int64") });
  const textHidden = textHiddenRes.hidden_states;
  const textMaskRaw = textEnc.attention_mask.data;
  const textMask = new Float32Array(textMaskRaw.length);
  for (let i = 0; i < textMaskRaw.length; i++) textMask[i] = Number(textMaskRaw[i]);

  // 2. Lyric tokens → embed_tokens (1024-dim, passed into condition_encoder's lyric_encoder)
  post("status", { message: "Embedding lyrics..." });
  const lyricFullText = `# Languages\nen\n\n# Lyric\n${lyrics}`;
  const lyricEnc = textTokenizer(lyricFullText, { padding: "max_length", max_length: 2048, truncation: true });
  const lyricIdsRaw = lyricEnc.input_ids.data;
  const lyricIds = lyricIdsRaw instanceof BigInt64Array ? lyricIdsRaw : new BigInt64Array(Array.from(lyricIdsRaw, BigInt));
  const lyricEmbRes = await sessions.embedTokens.run({ input_ids: tensor(lyricIds, [1, 2048], "int64") });
  const lyricEmb = lyricEmbRes.hidden_states;
  const lyricMaskRaw = lyricEnc.attention_mask.data;
  const lyricMask = new Float32Array(lyricMaskRaw.length);
  for (let i = 0; i < lyricMaskRaw.length; i++) lyricMask[i] = Number(lyricMaskRaw[i]);

  // 3. LM hints (mandatory for turbo model)
  const lmHints25Hz = await generateLMHints(caption, lyrics, numLatentFrames, duration);

  // 4. Silence for ref audio (timbre) and src_latents
  const silenceRef = silenceLatent.slice(0, 750 * LATENT_CHANNELS);
  const srcLatents = new Float32Array(numLatentFrames * LATENT_CHANNELS);
  const chunkMasks = new Float32Array(numLatentFrames * LATENT_CHANNELS).fill(1.0);
  const isCovers = new Float32Array([1.0]); // force use of LM hints

  // 5. condition_encoder: does text_projector + lyric_encoder + timbre_encoder + pack_sequences + context_latents
  post("status", { message: "Running condition encoder..." });
  const condResult = await sessions.conditionEncoder.run({
    text_hidden_states: textHidden,
    text_attention_mask: tensor(textMask, [1, 256]),
    lyric_hidden_states: lyricEmb,
    lyric_attention_mask: tensor(lyricMask, [1, 2048]),
    refer_audio_acoustic_hidden_states_packed: tensor(silenceRef, [1, 750, LATENT_CHANNELS]),
    refer_audio_order_mask: tensor(new BigInt64Array([0n]), [1], "int64"),
    src_latents: tensor(srcLatents, [1, numLatentFrames, LATENT_CHANNELS]),
    chunk_masks: tensor(chunkMasks, [1, numLatentFrames, LATENT_CHANNELS]),
    is_covers: tensor(isCovers, [1]),
    precomputed_lm_hints_25hz: tensor(lmHints25Hz, [1, numLatentFrames, LATENT_CHANNELS]),
  });
  const encoderHiddenStates = condResult.encoder_hidden_states;
  const contextLatentsTensor = condResult.context_latents;
  tensorStats("encoder_hidden_states", encoderHiddenStates.data);
  tensorStats("context_latents", contextLatentsTensor.data);

  // 6. Flow-matching Euler denoising: x_{t+1} = x_t - v_t * dt, with the last
  // step integrating all the way to t=0 (dt = tCurr).
  post("status", { message: "Starting denoising..." });
  let xt = randn([batchSize, numLatentFrames, LATENT_CHANNELS]);
  const startTime = performance.now();

  for (let step = 0; step < tSchedule.length; step++) {
    const tCurr = tSchedule[step];
    post("status", { message: `Denoising step ${step + 1}/${tSchedule.length}...` });

    const timestepData = new Float32Array(batchSize).fill(tCurr);
    const result = await sessions.ditDecoder.run({
      hidden_states: tensor(xt, [batchSize, numLatentFrames, LATENT_CHANNELS]),
      timestep: tensor(timestepData, [batchSize]),
      encoder_hidden_states: encoderHiddenStates,
      context_latents: contextLatentsTensor,
    });

    const vt = result.velocity.data;
    if (step === tSchedule.length - 1) {
      // Final step: integrate the remaining tCurr all the way to zero.
      for (let i = 0; i < xt.length; i++) xt[i] = xt[i] - vt[i] * tCurr;
    } else {
      const dt = tCurr - tSchedule[step + 1];
      for (let i = 0; i < xt.length; i++) xt[i] = xt[i] - vt[i] * dt;
    }
  }

  const diffusionTime = ((performance.now() - startTime) / 1000).toFixed(2);
  tensorStats("final_latent", xt);

  // Per-frame variance check — detects if later frames are constant
  const perFrameVariance = new Float32Array(numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    let mean = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) mean += xt[t * LATENT_CHANNELS + c];
    mean /= LATENT_CHANNELS;
    let varSum = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      const d = xt[t * LATENT_CHANNELS + c] - mean;
      varSum += d * d;
    }
    perFrameVariance[t] = varSum / LATENT_CHANNELS;
  }
  console.log("[perframe] variance samples:", Array.from(perFrameVariance.filter((_, i) => i % 25 === 0)).map(v => v.toFixed(3)));

  // Also check LM hints per-frame variance
  const hintsVar = new Float32Array(numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    let mean = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) mean += lmHints25Hz[t * LATENT_CHANNELS + c];
    mean /= LATENT_CHANNELS;
    let varSum = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      const d = lmHints25Hz[t * LATENT_CHANNELS + c] - mean;
      varSum += d * d;
    }
    hintsVar[t] = varSum / LATENT_CHANNELS;
  }
  console.log("[hints var] samples:", Array.from(hintsVar.filter((_, i) => i % 25 === 0)).map(v => v.toFixed(3)));

  // 7. VAE decode: transpose [T, C] -> [C, T] to match the decoder's
  // channels-first input layout.
  post("status", { message: "Decoding audio..." });
  const latentsForVae = new Float32Array(batchSize * LATENT_CHANNELS * numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      latentsForVae[c * numLatentFrames + t] = xt[t * LATENT_CHANNELS + c];
    }
  }

  const vaeResult = await sessions.vaeDecoder.run({
    latents: tensor(latentsForVae, [batchSize, LATENT_CHANNELS, numLatentFrames]),
  });

  const waveform = vaeResult.waveform.data;
  tensorStats("waveform", waveform);

  // 8. Mastering (in-place) and WAV packaging, then ship to the main thread.
  masterWaveform(waveform, SAMPLE_RATE, 2);

  const wavBuffer = float32ToWav(waveform, SAMPLE_RATE, 2);
  // totalTime measures the whole pipeline (LM + encoders + diffusion + VAE),
  // not just the diffusion loop. diffusionTime is reported separately below.
  const totalTime = ((performance.now() - totalStartTime) / 1000).toFixed(2);

  post("audio", { wavBuffer, duration, diffusionTime, totalTime, filenameStamp }, [wavBuffer]);
}
|
| 515 |
+
|
| 516 |
+
// Compute the peak absolute amplitude and RMS level of a sample buffer.
// An empty buffer yields { peak: 0, rms: 0 } (the divisor is clamped to 1).
function measureAudio(samples) {
  let peak = 0;
  let sumOfSquares = 0;
  for (const sample of samples) {
    const magnitude = Math.abs(sample);
    if (magnitude > peak) peak = magnitude;
    sumOfSquares += sample * sample;
  }
  const divisor = Math.max(1, samples.length);
  return { peak, rms: Math.sqrt(sumOfSquares / divisor) };
}
|
| 527 |
+
|
| 528 |
+
// Goertzel algorithm: power of a single frequency bin of `data` without
// computing a full FFT. Returns |X(freq)|^2 (unnormalized).
function goertzelPower(data, sampleRate, freq) {
  const coefficient = 2 * Math.cos((2 * Math.PI * freq) / sampleRate);
  let prev = 0;  // s[n-1]
  let prev2 = 0; // s[n-2]
  for (const x of data) {
    const current = x + coefficient * prev - prev2;
    prev2 = prev;
    prev = current;
  }
  return prev * prev + prev2 * prev2 - coefficient * prev * prev2;
}
|
| 539 |
+
|
| 540 |
+
/**
 * Detect up to two dominant narrow-band "drone" tones between 250 and 950 Hz.
 * The planar multi-channel buffer (samples[ch * numSamples + i]) is averaged
 * to mono, decimated to ~4 kHz (plain sample-dropping — no anti-alias filter
 * is applied before decimation), mean-removed, then scanned with Goertzel at
 * 12.5 Hz spacing. Bins scoring >= 12x the median bin power count as peaks;
 * peaks closer than 50 Hz to an already-accepted peak are skipped.
 *
 * @param {Float32Array} samples    - planar interleaving: channel-major layout
 * @param {number} sampleRate       - input sample rate (Hz)
 * @param {number} channels         - channel count
 * @returns {{freq: number, score: number}[]} 0-2 detected peaks, strongest first
 */
function detectDronePeaks(samples, sampleRate, channels) {
  const numSamples = samples.length / channels;
  // Decimation step targeting a ~4 kHz analysis rate.
  const step = Math.max(1, Math.floor(sampleRate / 4000));
  const downsampleRate = sampleRate / step;
  const downsampledLength = Math.floor(numSamples / step);
  // Too little audio for a meaningful spectral estimate: report no peaks.
  if (downsampledLength < 1024) return [];

  // Downmix to mono while accumulating the mean for DC removal.
  const mono = new Float32Array(downsampledLength);
  let mean = 0;
  for (let i = 0; i < downsampledLength; i++) {
    const src = i * step;
    let v = 0;
    for (let ch = 0; ch < channels; ch++) v += samples[ch * numSamples + src];
    v /= channels;
    mono[i] = v;
    mean += v;
  }
  mean /= downsampledLength;
  // Remove DC so the 0 Hz component doesn't leak into low bins.
  for (let i = 0; i < mono.length; i++) mono[i] -= mean;

  // Scan 250-950 Hz at 12.5 Hz resolution.
  const bins = [];
  for (let freq = 250; freq <= 950; freq += 12.5) {
    bins.push({ freq, power: goertzelPower(mono, downsampleRate, freq) });
  }
  // Median bin power as the noise floor estimate (epsilon avoids div-by-zero).
  const sortedPowers = bins.map((bin) => bin.power).sort((a, b) => a - b);
  const median = sortedPowers[Math.floor(sortedPowers.length / 2)] + 1e-12;
  bins.sort((a, b) => b.power - a.power);

  // Accept up to two peaks, strongest first, with 50 Hz minimum separation.
  const peaks = [];
  for (const bin of bins) {
    const score = bin.power / median;
    // Bins are sorted by power, so the first sub-threshold score ends the scan.
    if (score < 12) break;
    if (peaks.every((peak) => Math.abs(peak.freq - bin.freq) >= 50)) {
      peaks.push({ freq: bin.freq, score });
      if (peaks.length >= 2) break;
    }
  }
  return peaks;
}
|
| 579 |
+
|
| 580 |
+
/**
 * Apply a partial notch filter in place at `freq` to every channel.
 * Biquad coefficients follow the standard audio-EQ notch form; the
 * filtered signal is blended with the dry signal so the cut is gentle
 * rather than a full rejection.
 * @param {Float32Array} samples - Planar audio, mutated in place.
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} channels - Channel count (planar layout).
 * @param {number} freq - Center frequency of the notch in Hz.
 * @param {number} [q=20] - Quality factor (narrowness of the notch).
 * @param {number} [depth=0.45] - Wet/dry mix: 0 = bypass, 1 = full notch.
 */
function applyNotch(samples, sampleRate, channels, freq, q = 20, depth = 0.45) {
  const frameCount = samples.length / channels;
  const w0 = (2 * Math.PI * freq) / sampleRate;
  const cosW0 = Math.cos(w0);
  const alpha = Math.sin(w0) / (2 * q);
  // Normalize all coefficients by a0 = 1 + alpha.
  const norm = 1 + alpha;
  const b0 = 1 / norm;
  const b1 = (-2 * cosW0) / norm;
  const b2 = 1 / norm;
  const a1 = (-2 * cosW0) / norm;
  const a2 = (1 - alpha) / norm;

  for (let ch = 0; ch < channels; ch++) {
    const base = ch * frameCount;
    // Direct-form-I state, reset per channel.
    let xPrev = 0, xPrev2 = 0, yPrev = 0, yPrev2 = 0;
    for (let i = 0; i < frameCount; i++) {
      const x = samples[base + i];
      const y = b0 * x + b1 * xPrev + b2 * xPrev2 - a1 * yPrev - a2 * yPrev2;
      // Partial cut: crossfade between dry input and notched output.
      samples[base + i] = x * (1 - depth) + y * depth;
      xPrev2 = xPrev; xPrev = x;
      yPrev2 = yPrev; yPrev = y;
    }
  }
}
|
| 604 |
+
|
| 605 |
+
/**
 * In-place "mastering" pass: notch out detected drone tones, then apply
 * a single makeup gain to reach a target loudness without clipping.
 * Silent buffers (peak <= 0.001) are left untouched.
 * @param {Float32Array} samples - Planar audio, mutated in place.
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} channels - Channel count (planar layout).
 */
function masterWaveform(samples, sampleRate, channels) {
  const raw = measureAudio(samples);
  if (raw.peak <= 0.001) return;

  const drones = detectDronePeaks(samples, sampleRate, channels);
  for (const drone of drones) applyNotch(samples, sampleRate, channels, drone.freq);

  const post = measureAudio(samples);
  const targetRms = 0.085; // desired loudness after gain
  const maxPeak = 0.891;   // ceiling (~ -1 dBFS of headroom)
  const maxGain = 12.0;    // never amplify beyond this
  // Gain is the most conservative of: loudness target, peak ceiling,
  // and the hard amplification cap. Epsilons guard division by zero.
  const gain = Math.min(
    maxGain,
    targetRms / Math.max(post.rms, 1e-6),
    maxPeak / Math.max(post.peak, 1e-6),
  );
  for (let i = 0; i < samples.length; i++) samples[i] *= gain;

  const final = measureAudio(samples);
  const peakText = drones.length
    ? drones.map((peak) => `${peak.freq.toFixed(1)}Hz/${peak.score.toFixed(0)}x`).join(", ")
    : "none";
  console.log(
    `[master] rawPeak=${raw.peak.toFixed(4)} rawRms=${raw.rms.toFixed(4)} ` +
    `dronePeaks=${peakText} gain=${gain.toFixed(2)}x peak=${final.peak.toFixed(4)} rms=${final.rms.toFixed(4)}`,
  );
}
|
| 630 |
+
|
| 631 |
+
/**
 * Encode planar float samples ([ch0…, ch1…]) as an interleaved 16-bit
 * PCM WAV file.
 * @param {Float32Array} samples - Planar audio; values clamped to [-1, 1].
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} [channels=2] - Channel count.
 * @returns {ArrayBuffer} Complete WAV file (44-byte header + PCM data).
 */
function float32ToWav(samples, sampleRate, channels = 2) {
  const frameCount = samples.length / channels;
  const bytesPerSample = 2; // 16-bit PCM
  const blockAlign = channels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = frameCount * blockAlign;

  const buffer = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buffer);
  const writeAscii = (pos, text) => {
    for (let i = 0; i < text.length; i++) view.setUint8(pos + i, text.charCodeAt(i));
  };

  // RIFF container header.
  writeAscii(0, "RIFF");
  view.setUint32(4, 36 + dataSize, true);
  writeAscii(8, "WAVE");
  // "fmt " chunk describing uncompressed PCM.
  writeAscii(12, "fmt ");
  view.setUint32(16, 16, true);  // fmt chunk size
  view.setUint16(20, 1, true);   // audio format: PCM
  view.setUint16(22, channels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, 16, true);  // bits per sample
  // "data" chunk: interleave channels frame by frame.
  writeAscii(36, "data");
  view.setUint32(40, dataSize, true);
  let cursor = 44;
  for (let frame = 0; frame < frameCount; frame++) {
    for (let ch = 0; ch < channels; ch++) {
      const clamped = Math.max(-1, Math.min(1, samples[ch * frameCount + frame]));
      view.setInt16(cursor, clamped * 32767, true);
      cursor += 2;
    }
  }
  return buffer;
}
|
| 656 |
+
|
| 657 |
+
// Worker entry point: the main thread drives us with typed messages
// ("load" to initialize models, "generate" to synthesize audio). Any
// failure is forwarded back as an "error" message rather than thrown.
self.onmessage = async (event) => {
  const { type, ...payload } = event.data;
  try {
    switch (type) {
      case "load":
        await loadModels();
        break;
      case "generate":
        await generateAudio(payload);
        break;
      default:
        // Unknown message types are silently ignored, as before.
        break;
    }
  } catch (err) {
    post("error", { message: err.message, stack: err.stack });
  }
};
|
_source/vite.config.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Vite build configuration for the app (React + Tailwind).
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
import tailwindcss from '@tailwindcss/vite'

export default defineConfig({
  plugins: [react(), tailwindcss()],
  optimizeDeps: {
    // Keep onnxruntime-web out of Vite's dependency pre-bundling;
    // presumably its WASM asset URLs break when pre-bundled — confirm
    // before removing this exclusion.
    exclude: ['onnxruntime-web'],
  },
  worker: {
    // Emit web workers as ES modules so they can use `import`.
    format: 'es',
  },
})
|
assets/index-C7vMACvi.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/index-CccuoAYh.css
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*! tailwindcss v4.2.2 | MIT License | https://tailwindcss.com */
|
| 2 |
+
@layer properties{@supports (((-webkit-hyphens:none)) and (not (margin-trim:inline))) or ((-moz-orient:inline) and (not (color:rgb(from red r g b)))){*,:before,:after,::backdrop{--tw-rotate-x:initial;--tw-rotate-y:initial;--tw-rotate-z:initial;--tw-skew-x:initial;--tw-skew-y:initial;--tw-space-y-reverse:0;--tw-border-style:solid;--tw-leading:initial;--tw-font-weight:initial;--tw-tracking:initial;--tw-blur:initial;--tw-brightness:initial;--tw-contrast:initial;--tw-grayscale:initial;--tw-hue-rotate:initial;--tw-invert:initial;--tw-opacity:initial;--tw-saturate:initial;--tw-sepia:initial;--tw-drop-shadow:initial;--tw-drop-shadow-color:initial;--tw-drop-shadow-alpha:100%;--tw-drop-shadow-size:initial;--tw-duration:initial;--tw-scale-x:1;--tw-scale-y:1;--tw-scale-z:1}}}@layer theme{:root,:host{--font-sans:ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";--font-mono:ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;--spacing:.25rem;--container-sm:24rem;--container-md:28rem;--container-2xl:42rem;--text-xs:.75rem;--text-xs--line-height:calc(1 / .75);--text-sm:.875rem;--text-sm--line-height:calc(1.25 / .875);--text-base:1rem;--text-base--line-height:calc(1.5 / 1);--text-lg:1.125rem;--text-lg--line-height:calc(1.75 / 1.125);--text-xl:1.25rem;--text-xl--line-height:calc(1.75 / 1.25);--text-2xl:1.5rem;--text-2xl--line-height:calc(2 / 1.5);--text-4xl:2.25rem;--text-4xl--line-height:calc(2.5 / 2.25);--text-5xl:3rem;--text-5xl--line-height:1;--font-weight-medium:500;--font-weight-semibold:600;--tracking-wider:.05em;--tracking-widest:.1em;--leading-relaxed:1.625;--radius-md:.375rem;--radius-lg:.5rem;--radius-xl:.75rem;--radius-2xl:1rem;--default-transition-duration:.15s;--default-transition-timing-function:cubic-bezier(.4, 0, .2, 1);--default-font-family:var(--font-sans);--default-mono-font-family:var(--font-mono)}}@layer 
base{*,:after,:before,::backdrop{box-sizing:border-box;border:0 solid;margin:0;padding:0}::file-selector-button{box-sizing:border-box;border:0 solid;margin:0;padding:0}html,:host{-webkit-text-size-adjust:100%;tab-size:4;line-height:1.5;font-family:var(--default-font-family,ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji");font-feature-settings:var(--default-font-feature-settings,normal);font-variation-settings:var(--default-font-variation-settings,normal);-webkit-tap-highlight-color:transparent}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;-webkit-text-decoration:inherit;-webkit-text-decoration:inherit;-webkit-text-decoration:inherit;-webkit-text-decoration:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:var(--default-mono-font-family,ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", 
monospace);font-feature-settings:var(--default-mono-font-feature-settings,normal);font-variation-settings:var(--default-mono-font-variation-settings,normal);font-size:1em}small{font-size:80%}sub,sup{vertical-align:baseline;font-size:75%;line-height:0;position:relative}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}:-moz-focusring{outline:auto}progress{vertical-align:baseline}summary{display:list-item}ol,ul,menu{list-style:none}img,svg,video,canvas,audio,iframe,embed,object{vertical-align:middle;display:block}img,video{max-width:100%;height:auto}button,input,select,optgroup,textarea{font:inherit;font-feature-settings:inherit;font-variation-settings:inherit;letter-spacing:inherit;color:inherit;opacity:1;background-color:#0000;border-radius:0}::file-selector-button{font:inherit;font-feature-settings:inherit;font-variation-settings:inherit;letter-spacing:inherit;color:inherit;opacity:1;background-color:#0000;border-radius:0}:where(select:is([multiple],[size])) optgroup{font-weight:bolder}:where(select:is([multiple],[size])) optgroup option{padding-inline-start:20px}::file-selector-button{margin-inline-end:4px}::placeholder{opacity:1}@supports (not ((-webkit-appearance:-apple-pay-button))) or (contain-intrinsic-size:1px){::placeholder{color:currentColor}@supports (color:color-mix(in lab, red, red)){::placeholder{color:color-mix(in oklab, currentcolor 50%, 
transparent)}}}textarea{resize:vertical}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-date-and-time-value{min-height:1lh;text-align:inherit}::-webkit-datetime-edit{display:inline-flex}::-webkit-datetime-edit-fields-wrapper{padding:0}::-webkit-datetime-edit{padding-block:0}::-webkit-datetime-edit-year-field{padding-block:0}::-webkit-datetime-edit-month-field{padding-block:0}::-webkit-datetime-edit-day-field{padding-block:0}::-webkit-datetime-edit-hour-field{padding-block:0}::-webkit-datetime-edit-minute-field{padding-block:0}::-webkit-datetime-edit-second-field{padding-block:0}::-webkit-datetime-edit-millisecond-field{padding-block:0}::-webkit-datetime-edit-meridiem-field{padding-block:0}::-webkit-calendar-picker-indicator{line-height:1}:-moz-ui-invalid{box-shadow:none}button,input:where([type=button],[type=reset],[type=submit]){appearance:button}::file-selector-button{appearance:button}::-webkit-inner-spin-button{height:auto}::-webkit-outer-spin-button{height:auto}[hidden]:where(:not([hidden=until-found])){display:none!important}}@layer components;@layer utilities{.fixed{position:fixed}.relative{position:relative}.static{position:static}.inset-0{inset:calc(var(--spacing) * 0)}.start{inset-inline-start:var(--spacing)}.end{inset-inline-end:var(--spacing)}.z-50{z-index:50}.mx-2{margin-inline:calc(var(--spacing) * 2)}.mt-0\.5{margin-top:calc(var(--spacing) * .5)}.mt-3{margin-top:calc(var(--spacing) * 3)}.mt-4{margin-top:calc(var(--spacing) * 4)}.mt-12{margin-top:calc(var(--spacing) * 12)}.mb-1{margin-bottom:calc(var(--spacing) * 1)}.mb-1\.5{margin-bottom:calc(var(--spacing) * 1.5)}.mb-2{margin-bottom:calc(var(--spacing) * 2)}.mb-3{margin-bottom:calc(var(--spacing) * 3)}.mb-4{margin-bottom:calc(var(--spacing) * 4)}.mb-5{margin-bottom:calc(var(--spacing) * 5)}.mb-6{margin-bottom:calc(var(--spacing) * 6)}.mb-10{margin-bottom:calc(var(--spacing) * 
10)}.block{display:block}.contents{display:contents}.flex{display:flex}.hidden{display:none}.table{display:table}.h-1{height:calc(var(--spacing) * 1)}.h-10{height:calc(var(--spacing) * 10)}.h-14{height:calc(var(--spacing) * 14)}.h-full{height:100%}.min-h-screen{min-height:100vh}.w-8{width:calc(var(--spacing) * 8)}.w-10{width:calc(var(--spacing) * 10)}.w-24{width:calc(var(--spacing) * 24)}.w-full{width:100%}.max-w-2xl{max-width:var(--container-2xl)}.max-w-md{max-width:var(--container-md)}.max-w-sm{max-width:var(--container-sm)}.min-w-0{min-width:calc(var(--spacing) * 0)}.flex-1{flex:1}.flex-shrink-0{flex-shrink:0}.transform{transform:var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,)}.cursor-pointer{cursor:pointer}.resize-none{resize:none}.list-inside{list-style-position:inside}.list-decimal{list-style-type:decimal}.list-disc{list-style-type:disc}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-baseline{align-items:baseline}.items-center{align-items:center}.items-end{align-items:flex-end}.justify-between{justify-content:space-between}.justify-center{justify-content:center}.gap-2{gap:calc(var(--spacing) * 2)}.gap-3{gap:calc(var(--spacing) * 3)}.gap-\[2px\]{gap:2px}:where(.space-y-1>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 1) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 1) * calc(1 - var(--tw-space-y-reverse)))}:where(.space-y-2>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 2) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 2) * calc(1 - var(--tw-space-y-reverse)))}:where(.space-y-3>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 3) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 3) * calc(1 - 
var(--tw-space-y-reverse)))}:where(.space-y-4>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 4) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 4) * calc(1 - var(--tw-space-y-reverse)))}.truncate{text-overflow:ellipsis;white-space:nowrap;overflow:hidden}.overflow-hidden{overflow:hidden}.rounded{border-radius:.25rem}.rounded-2xl{border-radius:var(--radius-2xl)}.rounded-\[2px\]{border-radius:2px}.rounded-full{border-radius:3.40282e38px}.rounded-lg{border-radius:var(--radius-lg)}.rounded-md{border-radius:var(--radius-md)}.rounded-xl{border-radius:var(--radius-xl)}.border{border-style:var(--tw-border-style);border-width:1px}.bg-transparent{background-color:#0000}.p-3{padding:calc(var(--spacing) * 3)}.p-4{padding:calc(var(--spacing) * 4)}.p-5{padding:calc(var(--spacing) * 5)}.p-8{padding:calc(var(--spacing) * 8)}.px-3{padding-inline:calc(var(--spacing) * 3)}.px-4{padding-inline:calc(var(--spacing) * 4)}.px-6{padding-inline:calc(var(--spacing) * 6)}.px-8{padding-inline:calc(var(--spacing) * 8)}.py-1\.5{padding-block:calc(var(--spacing) * 1.5)}.py-2\.5{padding-block:calc(var(--spacing) * 2.5)}.py-3{padding-block:calc(var(--spacing) * 3)}.py-3\.5{padding-block:calc(var(--spacing) * 3.5)}.py-10{padding-block:calc(var(--spacing) * 10)}.pt-2{padding-top:calc(var(--spacing) * 
2)}.text-center{text-align:center}.text-left{text-align:left}.text-right{text-align:right}.font-mono{font-family:var(--font-mono)}.text-2xl{font-size:var(--text-2xl);line-height:var(--tw-leading,var(--text-2xl--line-height))}.text-4xl{font-size:var(--text-4xl);line-height:var(--tw-leading,var(--text-4xl--line-height))}.text-5xl{font-size:var(--text-5xl);line-height:var(--tw-leading,var(--text-5xl--line-height))}.text-base{font-size:var(--text-base);line-height:var(--tw-leading,var(--text-base--line-height))}.text-lg{font-size:var(--text-lg);line-height:var(--tw-leading,var(--text-lg--line-height))}.text-sm{font-size:var(--text-sm);line-height:var(--tw-leading,var(--text-sm--line-height))}.text-xl{font-size:var(--text-xl);line-height:var(--tw-leading,var(--text-xl--line-height))}.text-xs{font-size:var(--text-xs);line-height:var(--tw-leading,var(--text-xs--line-height))}.text-\[10px\]{font-size:10px}.text-\[11px\]{font-size:11px}.text-\[13px\]{font-size:13px}.leading-none{--tw-leading:1;line-height:1}.leading-relaxed{--tw-leading:var(--leading-relaxed);line-height:var(--leading-relaxed)}.font-medium{--tw-font-weight:var(--font-weight-medium);font-weight:var(--font-weight-medium)}.font-semibold{--tw-font-weight:var(--font-weight-semibold);font-weight:var(--font-weight-semibold)}.tracking-wider{--tw-tracking:var(--tracking-wider);letter-spacing:var(--tracking-wider)}.tracking-widest{--tw-tracking:var(--tracking-widest);letter-spacing:var(--tracking-widest)}.uppercase{text-transform:uppercase}.underline{text-decoration-line:underline}.filter{filter:var(--tw-blur,) var(--tw-brightness,) var(--tw-contrast,) var(--tw-grayscale,) var(--tw-hue-rotate,) var(--tw-invert,) var(--tw-saturate,) var(--tw-sepia,) 
var(--tw-drop-shadow,)}.transition{transition-property:color,background-color,border-color,outline-color,text-decoration-color,fill,stroke,--tw-gradient-from,--tw-gradient-via,--tw-gradient-to,opacity,box-shadow,transform,translate,scale,rotate,filter,-webkit-backdrop-filter,backdrop-filter,display,content-visibility,overlay,pointer-events;transition-timing-function:var(--tw-ease,var(--default-transition-timing-function));transition-duration:var(--tw-duration,var(--default-transition-duration))}.transition-all{transition-property:all;transition-timing-function:var(--tw-ease,var(--default-transition-timing-function));transition-duration:var(--tw-duration,var(--default-transition-duration))}.transition-colors{transition-property:color,background-color,border-color,outline-color,text-decoration-color,fill,stroke,--tw-gradient-from,--tw-gradient-via,--tw-gradient-to;transition-timing-function:var(--tw-ease,var(--default-transition-timing-function));transition-duration:var(--tw-duration,var(--default-transition-duration))}.duration-300{--tw-duration:.3s;transition-duration:.3s}.outline-none{--tw-outline-style:none;outline-style:none}.select-none{-webkit-user-select:none;user-select:none}@media (hover:hover){.hover\:scale-105:hover{--tw-scale-x:105%;--tw-scale-y:105%;--tw-scale-z:105%;scale:var(--tw-scale-x) var(--tw-scale-y)}.hover\:scale-\[1\.01\]:hover{scale:1.01}.hover\:scale-\[1\.02\]:hover{scale:1.02}.hover\:opacity-80:hover{opacity:.8}}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}}:root{--bg:oklch(13% .006 260);--bg-elev:oklch(17% .008 260);--surface:oklch(22% .01 260);--border:oklch(28% .01 260);--text:oklch(95% .005 260);--text-muted:oklch(65% .01 260);--text-dim:oklch(45% .008 260);--accent:oklch(72% .17 305);--accent-glow:oklch(80% .18 305);--accent-soft:oklch(72% .17 305/.15);--success:oklch(72% .14 155);--danger:oklch(65% .2 
22)}html,body,#root{min-height:100vh;margin:0}body{background:var(--bg);color:var(--text);letter-spacing:-.005em;-webkit-font-smoothing:antialiased;font-family:Hanken Grotesk,system-ui,-apple-system,sans-serif;font-weight:400}code,pre,.font-mono{font-family:JetBrains Mono,ui-monospace,Consolas,monospace}@keyframes wave-pulse{0%,to{opacity:.2;transform:scaleY(.3)}50%{opacity:.6;transform:scaleY(1)}}@keyframes fade-in{0%{opacity:0;transform:translateY(4px)}to{opacity:1;transform:translateY(0)}}@keyframes soft-glow{0%,to{box-shadow:0 0 20px oklch(72% .17 305/.2)}50%{box-shadow:0 0 40px oklch(72% .17 305/.5)}}.pulse-bar{transform-origin:bottom;animation:1.2s ease-in-out infinite wave-pulse}.fade-in{animation:.3s ease-out fade-in}.glow{animation:2s ease-in-out infinite soft-glow}input[type=range]{appearance:none;cursor:pointer;background:0 0}input[type=range]::-webkit-slider-runnable-track{background:var(--border);border-radius:4px;height:4px}input[type=range]::-webkit-slider-thumb{appearance:none;background:var(--accent);border-radius:50%;width:16px;height:16px;margin-top:-6px;box-shadow:0 0 12px oklch(72% .17 305/.4)}input[type=range]:focus{outline:none}textarea::-webkit-scrollbar{width:6px}textarea::-webkit-scrollbar-track{background:0 0}textarea::-webkit-scrollbar-thumb{background:var(--border);border-radius:3px}textarea::-webkit-scrollbar-thumb:hover{background:var(--text-dim)}@property --tw-rotate-x{syntax:"*";inherits:false}@property --tw-rotate-y{syntax:"*";inherits:false}@property --tw-rotate-z{syntax:"*";inherits:false}@property --tw-skew-x{syntax:"*";inherits:false}@property --tw-skew-y{syntax:"*";inherits:false}@property --tw-space-y-reverse{syntax:"*";inherits:false;initial-value:0}@property --tw-border-style{syntax:"*";inherits:false;initial-value:solid}@property --tw-leading{syntax:"*";inherits:false}@property --tw-font-weight{syntax:"*";inherits:false}@property --tw-tracking{syntax:"*";inherits:false}@property 
--tw-blur{syntax:"*";inherits:false}@property --tw-brightness{syntax:"*";inherits:false}@property --tw-contrast{syntax:"*";inherits:false}@property --tw-grayscale{syntax:"*";inherits:false}@property --tw-hue-rotate{syntax:"*";inherits:false}@property --tw-invert{syntax:"*";inherits:false}@property --tw-opacity{syntax:"*";inherits:false}@property --tw-saturate{syntax:"*";inherits:false}@property --tw-sepia{syntax:"*";inherits:false}@property --tw-drop-shadow{syntax:"*";inherits:false}@property --tw-drop-shadow-color{syntax:"*";inherits:false}@property --tw-drop-shadow-alpha{syntax:"<percentage>";inherits:false;initial-value:100%}@property --tw-drop-shadow-size{syntax:"*";inherits:false}@property --tw-duration{syntax:"*";inherits:false}@property --tw-scale-x{syntax:"*";inherits:false;initial-value:1}@property --tw-scale-y{syntax:"*";inherits:false;initial-value:1}@property --tw-scale-z{syntax:"*";inherits:false;initial-value:1}
|
assets/lm-worker-CMbQRLr6.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/ort-wasm-simd-threaded.asyncify-9GUf3Unn.wasm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f33595b9f7ea51aa6f646dd5a2bde6fbb1c7bcde0b9d2b5f240011a09c1830d0
|
| 3 |
+
size 27190919
|
assets/ort-wasm-simd-threaded.asyncify-CtKKja6V.wasm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d6ee4ff60d7f0e6b6efa34469157d3bdcbe2f3b0dbcea2a645bb41361a85973
|
| 3 |
+
size 23543806
|
assets/worker-retwKpvq.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
favicon.svg
ADDED
|
|
icons.svg
ADDED
|
|
index.html
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
</body>
|
| 19 |
</html>
|
|
|
|
| 1 |
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<link rel="icon" type="image/svg+xml" href="/favicon.svg" />
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
+
<meta name="description" content="ACE-Step 1.5 text-to-music generation running entirely in your browser via WebGPU" />
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Hanken+Grotesk:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Dancing+Script:wght@500;600;700&display=swap" rel="stylesheet" />
|
| 11 |
+
<title>ACE-Step WebGPU — Text to Music</title>
|
| 12 |
+
<script type="module" crossorigin src="/assets/index-C7vMACvi.js"></script>
|
| 13 |
+
<link rel="stylesheet" crossorigin href="/assets/index-CccuoAYh.css">
|
| 14 |
+
</head>
|
| 15 |
+
<body>
|
| 16 |
+
<div id="root"></div>
|
| 17 |
+
</body>
|
|
|
|
| 18 |
</html>
|
silence_latent.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7ee13d8902f0c02def49249f05a3e5dd99550ae8aed263299be43329b330e23
|
| 3 |
+
size 3840000
|
silence_latent_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 15000, 64], "dtype": "float32"}
|
silence_roundtripped.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e22b8c9e8a687c7ebfe57dc3bee42b5c330d35ca350f04575a79bca6045dfcd
|
| 3 |
+
size 192000
|
silence_roundtripped_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 750, 64], "dtype": "float32"}
|