Spaces:
Running
Running
Initial deploy: built app at root + source under _source/
Browse files- README.md +44 -5
- _source/README.md +16 -0
- _source/eslint.config.js +29 -0
- _source/index.html +17 -0
- _source/package-lock.json +0 -0
- _source/package.json +31 -0
- _source/public/favicon.svg +1 -0
- _source/public/icons.svg +24 -0
- _source/public/silence_latent.bin +3 -0
- _source/public/silence_latent_meta.json +1 -0
- _source/public/silence_roundtripped.bin +3 -0
- _source/public/silence_roundtripped_meta.json +1 -0
- _source/src/App.jsx +417 -0
- _source/src/assets/hero.png +0 -0
- _source/src/assets/react.svg +1 -0
- _source/src/assets/vite.svg +1 -0
- _source/src/components/PulseBars.jsx +17 -0
- _source/src/components/Waveform.jsx +134 -0
- _source/src/hooks/useModel.js +111 -0
- _source/src/index.css +93 -0
- _source/src/lm-worker.js +271 -0
- _source/src/main.jsx +10 -0
- _source/src/worker.js +665 -0
- _source/vite.config.js +13 -0
- assets/index-C7vMACvi.js +0 -0
- assets/index-CccuoAYh.css +2 -0
- assets/lm-worker-CMbQRLr6.js +0 -0
- assets/ort-wasm-simd-threaded.asyncify-9GUf3Unn.wasm +3 -0
- assets/ort-wasm-simd-threaded.asyncify-CtKKja6V.wasm +3 -0
- assets/worker-retwKpvq.js +0 -0
- favicon.svg +1 -0
- icons.svg +24 -0
- index.html +16 -17
- silence_latent.bin +3 -0
- silence_latent_meta.json +1 -0
- silence_roundtripped.bin +3 -0
- silence_roundtripped_meta.json +1 -0
README.md
CHANGED
|
@@ -1,10 +1,49 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ACE-Step WebGPU
|
| 3 |
+
emoji: 🎵
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: pink
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
+
short_description: Text-to-music in your browser via WebGPU.
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# ACE-Step WebGPU
|
| 13 |
+
|
| 14 |
+
Describe any song. AI writes & produces it — right in your browser.
|
| 15 |
+
|
| 16 |
+
The pipeline (5 Hz Qwen3 LM → FSQ → DiT decoder → Oobleck VAE) runs end-to-end
|
| 17 |
+
via [onnxruntime-web](https://onnxruntime.ai/) with the WebGPU execution
|
| 18 |
+
provider. Two Web Workers keep the LM and diffusion+VAE graphs in separate
|
| 19 |
+
WASM heaps so neither hits the 4 GB single-heap limit.
|
| 20 |
+
|
| 21 |
+
## Models
|
| 22 |
+
|
| 23 |
+
- DiT decoder (2B, fp16) and Oobleck VAE (fp16) from
|
| 24 |
+
[shreyask/ACE-Step-v1.5-ONNX](https://huggingface.co/shreyask/ACE-Step-v1.5-ONNX)
|
| 25 |
+
- 5 Hz LM (0.6B, 4-bit MatMulNBits) from
|
| 26 |
+
[ACE-Step/acestep-5Hz-lm-0.6B](https://huggingface.co/ACE-Step/acestep-5Hz-lm-0.6B)
|
| 27 |
+
- Text encoder: [Qwen/Qwen3-Embedding-0.6B](https://huggingface.co/Qwen/Qwen3-Embedding-0.6B)
|
| 28 |
+
|
| 29 |
+
Weights are fetched on demand and cached in the browser's Cache Storage after
|
| 30 |
+
the first load (~2 GB total).
|
| 31 |
+
|
| 32 |
+
## Requirements
|
| 33 |
+
|
| 34 |
+
- WebGPU-capable browser: Chrome/Edge 113+, Safari 26+ desktop
|
| 35 |
+
- ~4 GB free RAM recommended
|
| 36 |
+
|
| 37 |
+
## Source
|
| 38 |
+
|
| 39 |
+
The `_source/` directory in this Space's Files tab contains the full Vite/React
|
| 40 |
+
project (`src/`, `public/`, configs). Build it locally with:
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
cd _source
|
| 44 |
+
npm install
|
| 45 |
+
npm run build
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
Upstream: [ACE-Step/Ace-Step1.5](https://huggingface.co/ACE-Step/Ace-Step1.5)
|
| 49 |
+
(Apache 2.0).
|
_source/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# React + Vite
|
| 2 |
+
|
| 3 |
+
This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
|
| 4 |
+
|
| 5 |
+
Currently, two official plugins are available:
|
| 6 |
+
|
| 7 |
+
- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
|
| 8 |
+
- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)
|
| 9 |
+
|
| 10 |
+
## React Compiler
|
| 11 |
+
|
| 12 |
+
The React Compiler is not enabled on this template because of its impact on dev & build performances. To add it, see [this documentation](https://react.dev/learn/react-compiler/installation).
|
| 13 |
+
|
| 14 |
+
## Expanding the ESLint configuration
|
| 15 |
+
|
| 16 |
+
If you are developing a production application, we recommend using TypeScript with type-aware lint rules enabled. Check out the [TS template](https://github.com/vitejs/vite/tree/main/packages/create-vite/template-react-ts) for information on how to integrate TypeScript and [`typescript-eslint`](https://typescript-eslint.io) in your project.
|
_source/eslint.config.js
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import js from '@eslint/js'
|
| 2 |
+
import globals from 'globals'
|
| 3 |
+
import reactHooks from 'eslint-plugin-react-hooks'
|
| 4 |
+
import reactRefresh from 'eslint-plugin-react-refresh'
|
| 5 |
+
import { defineConfig, globalIgnores } from 'eslint/config'
|
| 6 |
+
|
| 7 |
+
export default defineConfig([
|
| 8 |
+
globalIgnores(['dist']),
|
| 9 |
+
{
|
| 10 |
+
files: ['**/*.{js,jsx}'],
|
| 11 |
+
extends: [
|
| 12 |
+
js.configs.recommended,
|
| 13 |
+
reactHooks.configs.flat.recommended,
|
| 14 |
+
reactRefresh.configs.vite,
|
| 15 |
+
],
|
| 16 |
+
languageOptions: {
|
| 17 |
+
ecmaVersion: 2020,
|
| 18 |
+
globals: globals.browser,
|
| 19 |
+
parserOptions: {
|
| 20 |
+
ecmaVersion: 'latest',
|
| 21 |
+
ecmaFeatures: { jsx: true },
|
| 22 |
+
sourceType: 'module',
|
| 23 |
+
},
|
| 24 |
+
},
|
| 25 |
+
rules: {
|
| 26 |
+
'no-unused-vars': ['error', { varsIgnorePattern: '^[A-Z_]' }],
|
| 27 |
+
},
|
| 28 |
+
},
|
| 29 |
+
])
|
_source/index.html
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<link rel="icon" type="image/svg+xml" href="/favicon.svg" />
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
+
<meta name="description" content="ACE-Step 1.5 text-to-music generation running entirely in your browser via WebGPU" />
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Hanken+Grotesk:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Dancing+Script:wght@500;600;700&display=swap" rel="stylesheet" />
|
| 11 |
+
<title>ACE-Step WebGPU — Text to Music</title>
|
| 12 |
+
</head>
|
| 13 |
+
<body>
|
| 14 |
+
<div id="root"></div>
|
| 15 |
+
<script type="module" src="/src/main.jsx"></script>
|
| 16 |
+
</body>
|
| 17 |
+
</html>
|
_source/package-lock.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
_source/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "demo",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "0.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "vite build",
|
| 9 |
+
"lint": "eslint .",
|
| 10 |
+
"preview": "vite preview"
|
| 11 |
+
},
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"@huggingface/transformers": "^4.1.0",
|
| 14 |
+
"onnxruntime-web": "^1.24.3",
|
| 15 |
+
"react": "^19.2.4",
|
| 16 |
+
"react-dom": "^19.2.4"
|
| 17 |
+
},
|
| 18 |
+
"devDependencies": {
|
| 19 |
+
"@eslint/js": "^9.39.4",
|
| 20 |
+
"@tailwindcss/vite": "^4.2.2",
|
| 21 |
+
"@types/react": "^19.2.14",
|
| 22 |
+
"@types/react-dom": "^19.2.3",
|
| 23 |
+
"@vitejs/plugin-react": "^6.0.1",
|
| 24 |
+
"eslint": "^9.39.4",
|
| 25 |
+
"eslint-plugin-react-hooks": "^7.0.1",
|
| 26 |
+
"eslint-plugin-react-refresh": "^0.5.2",
|
| 27 |
+
"globals": "^17.4.0",
|
| 28 |
+
"tailwindcss": "^4.2.2",
|
| 29 |
+
"vite": "^8.0.4"
|
| 30 |
+
}
|
| 31 |
+
}
|
_source/public/favicon.svg
ADDED
|
|
_source/public/icons.svg
ADDED
|
|
_source/public/silence_latent.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7ee13d8902f0c02def49249f05a3e5dd99550ae8aed263299be43329b330e23
|
| 3 |
+
size 3840000
|
_source/public/silence_latent_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 15000, 64], "dtype": "float32"}
|
_source/public/silence_roundtripped.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e22b8c9e8a687c7ebfe57dc3bee42b5c330d35ca350f04575a79bca6045dfcd
|
| 3 |
+
size 192000
|
_source/public/silence_roundtripped_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 750, 64], "dtype": "float32"}
|
_source/src/App.jsx
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import { useModel } from "./hooks/useModel";
|
| 3 |
+
import Waveform from "./components/Waveform";
|
| 4 |
+
import PulseBars from "./components/PulseBars";
|
| 5 |
+
|
| 6 |
+
const PRESETS = [
|
| 7 |
+
{
|
| 8 |
+
name: "Pop Ballad",
|
| 9 |
+
emoji: "💗",
|
| 10 |
+
duration: 60,
|
| 11 |
+
caption: "A gentle pop ballad with piano and soft vocals, key of C major, 80 BPM, emotional and dreamy",
|
| 12 |
+
lyrics: "[verse]\nUnderneath the stars tonight\nWe dance beneath the pale moonlight\nEvery moment feels so right\nHolding you so close and tight\n\n[chorus]\nThis is where I want to be\nRight here with you next to me\nLet the world just fade away\nIn your arms I want to stay",
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
name: "Rock Anthem",
|
| 16 |
+
emoji: "🎸",
|
| 17 |
+
duration: 60,
|
| 18 |
+
caption: "An energetic rock anthem with electric guitars and powerful drums, key of E minor, 140 BPM, aggressive and intense",
|
| 19 |
+
lyrics: "[verse]\nFire burning in my veins\nBreaking free from all these chains\nNothing left to hold me back\nRiding down the beaten track\n\n[chorus]\nWe are the ones who rise\nWith thunder in our eyes\nWe'll never be denied\nWe're burning up the sky",
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
name: "Lo-fi Chill",
|
| 23 |
+
emoji: "☕",
|
| 24 |
+
duration: 20,
|
| 25 |
+
caption: "A relaxing lo-fi hip hop beat with jazz piano samples and vinyl crackle, key of F major, 75 BPM, mellow and nostalgic",
|
| 26 |
+
lyrics: "[instrumental]",
|
| 27 |
+
},
|
| 28 |
+
];
|
| 29 |
+
|
| 30 |
+
function WebGPUGate({ children }) {
|
| 31 |
+
const supported = typeof navigator !== "undefined" && !!navigator.gpu;
|
| 32 |
+
if (supported) return children;
|
| 33 |
+
return (
|
| 34 |
+
<div className="fixed inset-0 flex items-center justify-center z-50" style={{ background: "var(--bg)" }}>
|
| 35 |
+
<div className="text-center max-w-md px-6">
|
| 36 |
+
<div className="text-5xl mb-4">🎹</div>
|
| 37 |
+
<h1 className="text-2xl font-semibold mb-3" style={{ color: "var(--text)" }}>
|
| 38 |
+
WebGPU not available
|
| 39 |
+
</h1>
|
| 40 |
+
<p style={{ color: "var(--text-muted)" }}>
|
| 41 |
+
This demo needs WebGPU to run ACE-Step in your browser. Try Chrome 113+, Edge 113+, or Safari 26+ on desktop.
|
| 42 |
+
</p>
|
| 43 |
+
</div>
|
| 44 |
+
</div>
|
| 45 |
+
);
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
function ProgressBar({ progress }) {
|
| 49 |
+
if (!progress) return null;
|
| 50 |
+
const pct = Math.max(0, Math.min(100, progress.percent || 0));
|
| 51 |
+
return (
|
| 52 |
+
<div className="w-full">
|
| 53 |
+
<div className="flex justify-between text-[11px] mb-1.5" style={{ color: "var(--text-muted)" }}>
|
| 54 |
+
<span>{progress.label}</span>
|
| 55 |
+
<span className="font-mono">
|
| 56 |
+
{progress.total > 1 && `${(progress.loaded / 1e6).toFixed(0)} / ${(progress.total / 1e6).toFixed(0)} MB · `}
|
| 57 |
+
{pct.toFixed(0)}%
|
| 58 |
+
</span>
|
| 59 |
+
</div>
|
| 60 |
+
<div className="h-1 rounded-full overflow-hidden" style={{ background: "var(--border)" }}>
|
| 61 |
+
<div
|
| 62 |
+
className="h-full rounded-full transition-all duration-300"
|
| 63 |
+
style={{ width: `${pct}%`, background: "var(--accent)" }}
|
| 64 |
+
/>
|
| 65 |
+
</div>
|
| 66 |
+
</div>
|
| 67 |
+
);
|
| 68 |
+
}
|
| 69 |
+
|
| 70 |
+
function LoadGate({ onLoad, status, message, progress, error }) {
|
| 71 |
+
const loading = status === "loading";
|
| 72 |
+
return (
|
| 73 |
+
<div
|
| 74 |
+
className="rounded-2xl p-8 fade-in"
|
| 75 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
|
| 76 |
+
>
|
| 77 |
+
<div className="flex flex-col items-center text-center">
|
| 78 |
+
<div className="text-4xl mb-3">🎹</div>
|
| 79 |
+
<h2 className="text-xl font-semibold mb-2" style={{ color: "var(--text)" }}>
|
| 80 |
+
Load models
|
| 81 |
+
</h2>
|
| 82 |
+
<p className="text-sm max-w-sm mb-5" style={{ color: "var(--text-muted)" }}>
|
| 83 |
+
Loads ~8 GB of ONNX models. Everything runs in your browser — your prompts never leave this device.
|
| 84 |
+
Built with{" "}
|
| 85 |
+
<a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
|
| 86 |
+
🤗 Transformers.js
|
| 87 |
+
</a>
|
| 88 |
+
{" + "}
|
| 89 |
+
<a href="https://onnxruntime.ai/docs/tutorials/web/" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--accent)" }}>
|
| 90 |
+
ONNX Runtime Web
|
| 91 |
+
</a>.
|
| 92 |
+
</p>
|
| 93 |
+
|
| 94 |
+
{error ? (
|
| 95 |
+
<div className="w-full text-sm mb-4 p-3 rounded-lg text-left" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
|
| 96 |
+
{error}
|
| 97 |
+
</div>
|
| 98 |
+
) : loading ? (
|
| 99 |
+
<div className="w-full space-y-3">
|
| 100 |
+
{message && (
|
| 101 |
+
<p className="text-xs" style={{ color: "var(--text-muted)" }}>
|
| 102 |
+
{message}
|
| 103 |
+
</p>
|
| 104 |
+
)}
|
| 105 |
+
{progress && <ProgressBar progress={progress} />}
|
| 106 |
+
</div>
|
| 107 |
+
) : (
|
| 108 |
+
<button
|
| 109 |
+
onClick={onLoad}
|
| 110 |
+
disabled={loading}
|
| 111 |
+
className="px-8 py-2.5 rounded-full font-medium transition hover:scale-[1.02] cursor-pointer"
|
| 112 |
+
style={{
|
| 113 |
+
background: "var(--accent)",
|
| 114 |
+
color: "var(--bg)",
|
| 115 |
+
letterSpacing: "-0.01em",
|
| 116 |
+
}}
|
| 117 |
+
>
|
| 118 |
+
Load models
|
| 119 |
+
</button>
|
| 120 |
+
)}
|
| 121 |
+
</div>
|
| 122 |
+
</div>
|
| 123 |
+
);
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
function PresetCard({ preset, active, onClick }) {
|
| 127 |
+
return (
|
| 128 |
+
<button
|
| 129 |
+
onClick={onClick}
|
| 130 |
+
className="flex-1 min-w-0 p-3 rounded-xl text-left transition-all cursor-pointer hover:scale-[1.02]"
|
| 131 |
+
style={{
|
| 132 |
+
background: active ? "var(--accent-soft)" : "var(--bg-elev)",
|
| 133 |
+
border: `1px solid ${active ? "var(--accent)" : "var(--border)"}`,
|
| 134 |
+
}}
|
| 135 |
+
>
|
| 136 |
+
<div className="text-xl mb-1">{preset.emoji}</div>
|
| 137 |
+
<div className="text-sm font-medium truncate" style={{ color: "var(--text)" }}>
|
| 138 |
+
{preset.name}
|
| 139 |
+
</div>
|
| 140 |
+
<div className="text-[10px] uppercase tracking-wider mt-0.5" style={{ color: "var(--text-dim)" }}>
|
| 141 |
+
{preset.duration}s · {preset.lyrics === "[instrumental]" ? "instrumental" : "vocal"}
|
| 142 |
+
</div>
|
| 143 |
+
</button>
|
| 144 |
+
);
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
function GenerationStatus({ status, message }) {
|
| 148 |
+
if (status !== "generating") return null;
|
| 149 |
+
return (
|
| 150 |
+
<div
|
| 151 |
+
className="rounded-2xl p-5 fade-in"
|
| 152 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
|
| 153 |
+
>
|
| 154 |
+
<PulseBars count={60} />
|
| 155 |
+
<div className="mt-3 flex items-center justify-between text-xs">
|
| 156 |
+
<span style={{ color: "var(--text)" }}>{message || "Generating…"}</span>
|
| 157 |
+
<span className="font-mono" style={{ color: "var(--text-muted)" }}>
|
| 158 |
+
this takes 1–4 min
|
| 159 |
+
</span>
|
| 160 |
+
</div>
|
| 161 |
+
</div>
|
| 162 |
+
);
|
| 163 |
+
}
|
| 164 |
+
|
| 165 |
+
function OutputCard({ audioUrl, audioInfo }) {
|
| 166 |
+
if (!audioUrl) return null;
|
| 167 |
+
return (
|
| 168 |
+
<div
|
| 169 |
+
className="rounded-2xl p-5 fade-in space-y-3"
|
| 170 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}
|
| 171 |
+
>
|
| 172 |
+
<Waveform src={audioUrl} duration={audioInfo?.duration} />
|
| 173 |
+
<div className="flex items-center justify-between text-xs pt-2" style={{ borderTop: "1px solid var(--border)" }}>
|
| 174 |
+
<div className="font-mono" style={{ color: "var(--text-muted)" }}>
|
| 175 |
+
48 kHz · stereo
|
| 176 |
+
{audioInfo?.totalTime && ` · ${audioInfo.totalTime}s gen`}
|
| 177 |
+
</div>
|
| 178 |
+
<a
|
| 179 |
+
href={audioUrl}
|
| 180 |
+
download={audioInfo?.filename || "ace-step.wav"}
|
| 181 |
+
className="px-3 py-1.5 rounded-md text-xs font-medium transition hover:opacity-80 cursor-pointer"
|
| 182 |
+
style={{ background: "var(--surface)", color: "var(--text)" }}
|
| 183 |
+
>
|
| 184 |
+
⬇ Download WAV
|
| 185 |
+
</a>
|
| 186 |
+
</div>
|
| 187 |
+
</div>
|
| 188 |
+
);
|
| 189 |
+
}
|
| 190 |
+
|
| 191 |
+
export default function App() {
|
| 192 |
+
const { status, message, progress, audioUrl, audioInfo, error, isLoaded, loadModel, generate } = useModel();
|
| 193 |
+
const [activeIdx, setActiveIdx] = useState(0);
|
| 194 |
+
const [caption, setCaption] = useState(PRESETS[0].caption);
|
| 195 |
+
const [lyrics, setLyrics] = useState(PRESETS[0].lyrics);
|
| 196 |
+
const [duration, setDuration] = useState(PRESETS[0].duration);
|
| 197 |
+
const [shift, setShift] = useState(3.0);
|
| 198 |
+
const [numSteps, setNumSteps] = useState(8);
|
| 199 |
+
|
| 200 |
+
const isWorking = status === "loading" || status === "generating";
|
| 201 |
+
|
| 202 |
+
const applyPreset = (i) => {
|
| 203 |
+
setActiveIdx(i);
|
| 204 |
+
setCaption(PRESETS[i].caption);
|
| 205 |
+
setLyrics(PRESETS[i].lyrics);
|
| 206 |
+
setDuration(PRESETS[i].duration);
|
| 207 |
+
};
|
| 208 |
+
|
| 209 |
+
return (
|
| 210 |
+
<WebGPUGate>
|
| 211 |
+
<div className="min-h-screen flex flex-col items-center px-4 py-10" style={{ background: "var(--bg)" }}>
|
| 212 |
+
{/* Hero */}
|
| 213 |
+
<header className="mb-10 w-full max-w-2xl fade-in">
|
| 214 |
+
<h1 className="leading-none mb-2 flex items-baseline gap-3 flex-wrap" style={{
|
| 215 |
+
fontSize: "clamp(2.5rem, 5vw, 3.5rem)",
|
| 216 |
+
color: "var(--text)",
|
| 217 |
+
}}>
|
| 218 |
+
<span style={{ fontFamily: "'Dancing Script', cursive", fontWeight: 600 }}>
|
| 219 |
+
ACE-Step
|
| 220 |
+
</span>
|
| 221 |
+
<span style={{ fontWeight: 600, letterSpacing: "-0.03em" }}>
|
| 222 |
+
WebGPU
|
| 223 |
+
</span>
|
| 224 |
+
</h1>
|
| 225 |
+
<p className="text-lg" style={{ color: "var(--text-muted)" }}>
|
| 226 |
+
Describe any song. AI writes & produces it.
|
| 227 |
+
</p>
|
| 228 |
+
</header>
|
| 229 |
+
|
| 230 |
+
<main className="w-full max-w-2xl space-y-4">
|
| 231 |
+
{!isLoaded ? (
|
| 232 |
+
<LoadGate onLoad={loadModel} status={status} message={message} progress={progress} error={error} />
|
| 233 |
+
) : (
|
| 234 |
+
<>
|
| 235 |
+
{/* Presets */}
|
| 236 |
+
<div className="flex gap-2">
|
| 237 |
+
{PRESETS.map((p, i) => (
|
| 238 |
+
<PresetCard key={p.name} preset={p} active={i === activeIdx} onClick={() => applyPreset(i)} />
|
| 239 |
+
))}
|
| 240 |
+
</div>
|
| 241 |
+
|
| 242 |
+
{/* Caption */}
|
| 243 |
+
<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 244 |
+
<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
|
| 245 |
+
Description
|
| 246 |
+
</label>
|
| 247 |
+
<textarea
|
| 248 |
+
value={caption}
|
| 249 |
+
onChange={(e) => setCaption(e.target.value)}
|
| 250 |
+
onInput={() => setActiveIdx(-1)}
|
| 251 |
+
rows={2}
|
| 252 |
+
className="w-full bg-transparent text-sm resize-none outline-none"
|
| 253 |
+
style={{ color: "var(--text)" }}
|
| 254 |
+
placeholder="Describe the music — style, instruments, key, BPM, mood…"
|
| 255 |
+
/>
|
| 256 |
+
</div>
|
| 257 |
+
|
| 258 |
+
{/* Lyrics */}
|
| 259 |
+
<div className="rounded-2xl p-4" style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 260 |
+
<label className="text-[10px] uppercase tracking-widest mb-2 block" style={{ color: "var(--text-dim)" }}>
|
| 261 |
+
Lyrics (use [verse] / [chorus] tags, or [instrumental])
|
| 262 |
+
</label>
|
| 263 |
+
<textarea
|
| 264 |
+
value={lyrics}
|
| 265 |
+
onChange={(e) => setLyrics(e.target.value)}
|
| 266 |
+
onInput={() => setActiveIdx(-1)}
|
| 267 |
+
rows={6}
|
| 268 |
+
className="w-full bg-transparent text-sm resize-none outline-none font-mono"
|
| 269 |
+
style={{ color: "var(--text)" }}
|
| 270 |
+
/>
|
| 271 |
+
</div>
|
| 272 |
+
|
| 273 |
+
{/* Controls — pill row */}
|
| 274 |
+
<div className="flex items-center gap-3 flex-wrap">
|
| 275 |
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
|
| 276 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 277 |
+
<span style={{ color: "var(--text-muted)" }}>Duration</span>
|
| 278 |
+
<input
|
| 279 |
+
type="range"
|
| 280 |
+
min={10}
|
| 281 |
+
max={90}
|
| 282 |
+
step={10}
|
| 283 |
+
value={duration}
|
| 284 |
+
onChange={(e) => setDuration(Number(e.target.value))}
|
| 285 |
+
className="w-24"
|
| 286 |
+
/>
|
| 287 |
+
<span className="font-mono w-8 text-right" style={{ color: "var(--text)" }}>{duration}s</span>
|
| 288 |
+
</div>
|
| 289 |
+
|
| 290 |
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
|
| 291 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 292 |
+
<span style={{ color: "var(--text-muted)" }}>Steps</span>
|
| 293 |
+
<select
|
| 294 |
+
value={numSteps}
|
| 295 |
+
onChange={(e) => setNumSteps(Number(e.target.value))}
|
| 296 |
+
className="bg-transparent outline-none cursor-pointer"
|
| 297 |
+
style={{ color: "var(--text)" }}
|
| 298 |
+
>
|
| 299 |
+
<option value={8}>8 (turbo)</option>
|
| 300 |
+
</select>
|
| 301 |
+
</div>
|
| 302 |
+
|
| 303 |
+
<div className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs"
|
| 304 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 305 |
+
<span style={{ color: "var(--text-muted)" }}>Shift</span>
|
| 306 |
+
<select
|
| 307 |
+
value={shift}
|
| 308 |
+
onChange={(e) => setShift(Number(e.target.value))}
|
| 309 |
+
className="bg-transparent outline-none cursor-pointer"
|
| 310 |
+
style={{ color: "var(--text)" }}
|
| 311 |
+
>
|
| 312 |
+
<option value={1.0}>1.0</option>
|
| 313 |
+
<option value={2.0}>2.0</option>
|
| 314 |
+
<option value={3.0}>3.0</option>
|
| 315 |
+
</select>
|
| 316 |
+
</div>
|
| 317 |
+
</div>
|
| 318 |
+
|
| 319 |
+
{/* Generate */}
|
| 320 |
+
<button
|
| 321 |
+
onClick={() => generate({ caption, lyrics, duration, shift, numSteps })}
|
| 322 |
+
disabled={isWorking}
|
| 323 |
+
className="w-full py-3.5 rounded-full font-medium text-base transition disabled:opacity-50 disabled:cursor-not-allowed hover:scale-[1.01] cursor-pointer"
|
| 324 |
+
style={{
|
| 325 |
+
background: "var(--accent)",
|
| 326 |
+
color: "var(--bg)",
|
| 327 |
+
letterSpacing: "-0.01em",
|
| 328 |
+
boxShadow: "0 0 40px oklch(0.72 0.17 305 / 0.25)",
|
| 329 |
+
}}
|
| 330 |
+
>
|
| 331 |
+
{status === "generating" ? "Generating music…" : "Generate"}
|
| 332 |
+
</button>
|
| 333 |
+
|
| 334 |
+
<GenerationStatus status={status} message={message} />
|
| 335 |
+
<OutputCard audioUrl={audioUrl} audioInfo={audioInfo} />
|
| 336 |
+
|
| 337 |
+
{error && (
|
| 338 |
+
<div className="rounded-lg p-3 text-sm" style={{ background: "oklch(0.25 0.08 22 / 0.3)", color: "var(--danger)" }}>
|
| 339 |
+
{error}
|
| 340 |
+
</div>
|
| 341 |
+
)}
|
| 342 |
+
</>
|
| 343 |
+
)}
|
| 344 |
+
</main>
|
| 345 |
+
|
| 346 |
+
{/* About / methodology */}
|
| 347 |
+
<section className="w-full max-w-2xl mt-12 text-sm" style={{ color: "var(--text-muted)" }}>
|
| 348 |
+
<details className="rounded-xl px-4 py-3"
|
| 349 |
+
style={{ background: "var(--bg-elev)", border: "1px solid var(--border)" }}>
|
| 350 |
+
<summary className="cursor-pointer font-medium select-none" style={{ color: "var(--text)" }}>
|
| 351 |
+
How it works & known limitations
|
| 352 |
+
</summary>
|
| 353 |
+
<div className="mt-4 space-y-4 leading-relaxed">
|
| 354 |
+
<div>
|
| 355 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Pipeline</h3>
|
| 356 |
+
<ol className="list-decimal list-inside space-y-1">
|
| 357 |
+
<li><span style={{ color: "var(--text)" }}>Text encoder</span> (Qwen3-Embedding-0.6B, fp16) turns the caption into conditioning hidden states; the same model provides token embeddings for the lyric path.</li>
|
| 358 |
+
<li><span style={{ color: "var(--text)" }}>5 Hz LM</span> (ACE-Step acestep-5Hz-lm-0.6B, 4-bit MatMulNBits) writes a short chain-of-thought, then emits ~50 audio codes per 10 s of output.</li>
|
| 359 |
+
<li><span style={{ color: "var(--text)" }}>FSQ → detokenizer</span> expands the codes into 25 Hz acoustic features used as cross-attention hints.</li>
|
| 360 |
+
<li><span style={{ color: "var(--text)" }}>DiT decoder</span> (2B parameters, fp16) runs 8 Euler flow-matching steps (shift=3.0) over a random latent conditioned on text, lyrics, and hints.</li>
|
| 361 |
+
<li><span style={{ color: "var(--text)" }}>Oobleck VAE</span> (fp16) decodes the 25 Hz latent into stereo 48 kHz audio.</li>
|
| 362 |
+
</ol>
|
| 363 |
+
</div>
|
| 364 |
+
|
| 365 |
+
<div>
|
| 366 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Why it runs in the browser</h3>
|
| 367 |
+
<p>
|
| 368 |
+
Everything executes on-device via <code className="font-mono text-xs">onnxruntime-web</code> with the WebGPU execution provider. Two Web Workers keep the LM and the diffusion+VAE graphs in separate WASM heaps so neither hits the 4 GB single-heap limit. Total download is ~2 GB (cached in the browser after the first load).
|
| 369 |
+
</p>
|
| 370 |
+
</div>
|
| 371 |
+
|
| 372 |
+
<div>
|
| 373 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Methodology notes</h3>
|
| 374 |
+
<ul className="list-disc list-inside space-y-1">
|
| 375 |
+
<li>Compared stage-by-stage against the PyTorch fp32 reference: every tensor agrees to within 0.2% relative L2, and the generated waveforms sound identical.</li>
|
| 376 |
+
<li>FP16 DiT is exported natively (<code className="font-mono text-xs">model.half()</code> + dynamo). An earlier fp32→fp16 conversion with post-hoc Cast insertion produced a 25 Hz helicopter artifact, now resolved.</li>
|
| 377 |
+
<li>4-bit quantization is MatMulNBits with <code className="font-mono text-xs">block_size=64</code>, asymmetric, <code className="font-mono text-xs">accuracy_level=1</code> (fp32 accumulate).</li>
|
| 378 |
+
</ul>
|
| 379 |
+
</div>
|
| 380 |
+
|
| 381 |
+
<div>
|
| 382 |
+
<h3 className="text-[13px] uppercase tracking-widest mb-2" style={{ color: "var(--text-dim)" }}>Known limitations</h3>
|
| 383 |
+
<ul className="list-disc list-inside space-y-1">
|
| 384 |
+
<li><span style={{ color: "var(--text)" }}>First load is slow.</span> ~2 GB of weights must be fetched and cached; subsequent runs start fast.</li>
|
| 385 |
+
<li><span style={{ color: "var(--text)" }}>Vocals need ≥60 s.</span> The 0.6B LM often refuses to emit lyric-aligned audio codes for short durations — instrumentals work at any length.</li>
|
| 386 |
+
<li><span style={{ color: "var(--text)" }}>Turbo quality ceiling.</span> We run 8 diffusion steps (shift=3.0). More steps nudge quality up but aren't supported by the turbo weights we ship.</li>
|
| 387 |
+
<li><span style={{ color: "var(--text)" }}>Condition-encoder drift.</span> The ONNX condition_encoder has a small drift (~0.4 max_diff) vs PyTorch on real inputs — inaudible today but a known residual we haven’t closed.</li>
|
| 388 |
+
<li><span style={{ color: "var(--text)" }}>WebGPU only.</span> No fallback path; the demo gates on WebGPU support (Chrome/Edge 113+, Safari 26+ desktop).</li>
|
| 389 |
+
<li><span style={{ color: "var(--text)" }}>Memory.</span> Two workers each hold ~1–2 GB; low-RAM devices may hit <code className="font-mono text-xs">std::bad_alloc</code> during model creation.</li>
|
| 390 |
+
<li><span style={{ color: "var(--text)" }}>No seed control.</span> Each generation uses a fresh RNG, so re-runs with the same prompt will differ.</li>
|
| 391 |
+
</ul>
|
| 392 |
+
</div>
|
| 393 |
+
</div>
|
| 394 |
+
</details>
|
| 395 |
+
</section>
|
| 396 |
+
|
| 397 |
+
{/* Footer */}
|
| 398 |
+
<footer className="mt-12 mb-6 text-center text-xs space-y-2" style={{ color: "var(--text-dim)" }}>
|
| 399 |
+
<div>
|
| 400 |
+
<a href="https://huggingface.co/shreyask/ACE-Step-v1.5-ONNX" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
|
| 401 |
+
shreyask/ACE-Step-v1.5-ONNX
|
| 402 |
+
</a>
|
| 403 |
+
<span className="mx-2">·</span>
|
| 404 |
+
<a href="https://huggingface.co/ACE-Step/Ace-Step1.5" target="_blank" rel="noreferrer" className="hover:opacity-80 transition" style={{ color: "var(--text-muted)" }}>
|
| 405 |
+
ACE-Step 1.5
|
| 406 |
+
</a>
|
| 407 |
+
<span className="mx-2">·</span>
|
| 408 |
+
<span>Apache 2.0</span>
|
| 409 |
+
</div>
|
| 410 |
+
<div>
|
| 411 |
+
Made with <a href="https://huggingface.co/docs/transformers.js" target="_blank" rel="noreferrer" className="underline" style={{ color: "var(--text-muted)" }}>🤗 Transformers.js</a>
|
| 412 |
+
</div>
|
| 413 |
+
</footer>
|
| 414 |
+
</div>
|
| 415 |
+
</WebGPUGate>
|
| 416 |
+
);
|
| 417 |
+
}
|
_source/src/assets/hero.png
ADDED
|
_source/src/assets/react.svg
ADDED
|
|
_source/src/assets/vite.svg
ADDED
|
|
_source/src/components/PulseBars.jsx
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Animated placeholder — shown while generating. Matches ace-step-jam fake-waveform.
|
| 2 |
+
export default function PulseBars({ count = 60 }) {
|
| 3 |
+
return (
|
| 4 |
+
<div className="flex items-end gap-[2px] h-14 w-full select-none">
|
| 5 |
+
{Array.from({ length: count }).map((_, i) => (
|
| 6 |
+
<div
|
| 7 |
+
key={i}
|
| 8 |
+
className="flex-1 rounded-[2px] pulse-bar"
|
| 9 |
+
style={{
|
| 10 |
+
background: "var(--accent)",
|
| 11 |
+
animationDelay: `${(i * 40) % 1200}ms`,
|
| 12 |
+
}}
|
| 13 |
+
/>
|
| 14 |
+
))}
|
| 15 |
+
</div>
|
| 16 |
+
);
|
| 17 |
+
}
|
_source/src/components/Waveform.jsx
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useEffect, useRef, useState } from "react";
|
| 2 |
+
|
| 3 |
+
// Custom audio player with bar-waveform viz and click-to-seek.
|
| 4 |
+
// Pattern from victor/ace-step-jam; we render N bars pulled from decoded audio buffer peaks.
|
| 5 |
+
const NUM_BARS = 80;
|
| 6 |
+
|
| 7 |
+
export default function Waveform({ src, duration }) {
|
| 8 |
+
const audioRef = useRef(null);
|
| 9 |
+
const [peaks, setPeaks] = useState(null);
|
| 10 |
+
const [playing, setPlaying] = useState(false);
|
| 11 |
+
const [progress, setProgress] = useState(0);
|
| 12 |
+
|
| 13 |
+
// Decode audio to extract bar peaks
|
| 14 |
+
useEffect(() => {
|
| 15 |
+
if (!src) return;
|
| 16 |
+
let cancelled = false;
|
| 17 |
+
(async () => {
|
| 18 |
+
try {
|
| 19 |
+
const res = await fetch(src);
|
| 20 |
+
const buf = await res.arrayBuffer();
|
| 21 |
+
const ctx = new (window.AudioContext || window.webkitAudioContext)();
|
| 22 |
+
const audio = await ctx.decodeAudioData(buf.slice(0));
|
| 23 |
+
const channel = audio.getChannelData(0);
|
| 24 |
+
const samplesPerBar = Math.floor(channel.length / NUM_BARS);
|
| 25 |
+
const out = new Float32Array(NUM_BARS);
|
| 26 |
+
let globalMax = 0;
|
| 27 |
+
for (let b = 0; b < NUM_BARS; b++) {
|
| 28 |
+
let max = 0;
|
| 29 |
+
const start = b * samplesPerBar;
|
| 30 |
+
const end = Math.min(start + samplesPerBar, channel.length);
|
| 31 |
+
for (let i = start; i < end; i++) {
|
| 32 |
+
const v = Math.abs(channel[i]);
|
| 33 |
+
if (Number.isFinite(v) && v > max) max = v;
|
| 34 |
+
}
|
| 35 |
+
out[b] = max;
|
| 36 |
+
if (max > globalMax) globalMax = max;
|
| 37 |
+
}
|
| 38 |
+
// Normalize — if silent or NaN, fall back to flat low bars
|
| 39 |
+
const peak = Number.isFinite(globalMax) && globalMax > 1e-5 ? globalMax : 1;
|
| 40 |
+
for (let i = 0; i < NUM_BARS; i++) {
|
| 41 |
+
const n = out[i] / peak;
|
| 42 |
+
out[i] = Number.isFinite(n) ? Math.max(0.05, Math.min(1, n)) : 0.05;
|
| 43 |
+
}
|
| 44 |
+
if (!cancelled) setPeaks(out);
|
| 45 |
+
ctx.close?.();
|
| 46 |
+
} catch (e) {
|
| 47 |
+
console.warn("waveform decode failed:", e);
|
| 48 |
+
// Still show fallback bars so UI isn't broken
|
| 49 |
+
if (!cancelled) setPeaks(new Float32Array(NUM_BARS).fill(0.1));
|
| 50 |
+
}
|
| 51 |
+
})();
|
| 52 |
+
return () => { cancelled = true; };
|
| 53 |
+
}, [src]);
|
| 54 |
+
|
| 55 |
+
useEffect(() => {
|
| 56 |
+
const a = audioRef.current;
|
| 57 |
+
if (!a) return;
|
| 58 |
+
const onTime = () => setProgress(a.duration ? a.currentTime / a.duration : 0);
|
| 59 |
+
const onEnd = () => setPlaying(false);
|
| 60 |
+
a.addEventListener("timeupdate", onTime);
|
| 61 |
+
a.addEventListener("ended", onEnd);
|
| 62 |
+
return () => {
|
| 63 |
+
a.removeEventListener("timeupdate", onTime);
|
| 64 |
+
a.removeEventListener("ended", onEnd);
|
| 65 |
+
};
|
| 66 |
+
}, [src]);
|
| 67 |
+
|
| 68 |
+
const toggle = () => {
|
| 69 |
+
const a = audioRef.current;
|
| 70 |
+
if (!a) return;
|
| 71 |
+
if (a.paused) { a.play(); setPlaying(true); }
|
| 72 |
+
else { a.pause(); setPlaying(false); }
|
| 73 |
+
};
|
| 74 |
+
|
| 75 |
+
const seek = (e) => {
|
| 76 |
+
const a = audioRef.current;
|
| 77 |
+
if (!a || !a.duration) return;
|
| 78 |
+
const rect = e.currentTarget.getBoundingClientRect();
|
| 79 |
+
const x = (e.clientX - rect.left) / rect.width;
|
| 80 |
+
a.currentTime = Math.max(0, Math.min(1, x)) * a.duration;
|
| 81 |
+
setProgress(x);
|
| 82 |
+
};
|
| 83 |
+
|
| 84 |
+
return (
|
| 85 |
+
<div className="flex items-center gap-3 w-full">
|
| 86 |
+
<audio ref={audioRef} src={src} preload="auto" />
|
| 87 |
+
<button
|
| 88 |
+
onClick={toggle}
|
| 89 |
+
className="flex-shrink-0 w-10 h-10 rounded-full flex items-center justify-center hover:scale-105 transition cursor-pointer"
|
| 90 |
+
style={{ background: "var(--accent)", color: "var(--bg)" }}
|
| 91 |
+
aria-label={playing ? "Pause" : "Play"}
|
| 92 |
+
>
|
| 93 |
+
{playing ? (
|
| 94 |
+
<svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor"><rect x="3" y="2" width="3.5" height="12" rx="1" /><rect x="9.5" y="2" width="3.5" height="12" rx="1" /></svg>
|
| 95 |
+
) : (
|
| 96 |
+
<svg width="14" height="14" viewBox="0 0 16 16" fill="currentColor"><path d="M3.5 2.5v11a0.5 0.5 0 0 0 .8 .4l9 -5.5a0.5 0.5 0 0 0 0 -.8l-9 -5.5a0.5 0.5 0 0 0 -.8 .4z" /></svg>
|
| 97 |
+
)}
|
| 98 |
+
</button>
|
| 99 |
+
|
| 100 |
+
<div
|
| 101 |
+
onClick={seek}
|
| 102 |
+
className="flex-1 flex items-end gap-[2px] h-14 cursor-pointer select-none overflow-hidden"
|
| 103 |
+
>
|
| 104 |
+
{Array.from({ length: NUM_BARS }, (_, i) => {
|
| 105 |
+
// Compute height defensively — never rely on peaks array directly
|
| 106 |
+
let v = 0.15;
|
| 107 |
+
if (peaks && peaks[i] != null) {
|
| 108 |
+
const p = Number(peaks[i]);
|
| 109 |
+
if (Number.isFinite(p)) v = Math.max(0.05, Math.min(1, p));
|
| 110 |
+
}
|
| 111 |
+
const prog = Number.isFinite(progress) ? progress : 0;
|
| 112 |
+
const active = (i / NUM_BARS) < prog;
|
| 113 |
+
const heightPct = Math.max(4, Math.min(100, v * 100));
|
| 114 |
+
return (
|
| 115 |
+
<div
|
| 116 |
+
key={i}
|
| 117 |
+
className="flex-1 rounded-[2px] transition-colors"
|
| 118 |
+
style={{
|
| 119 |
+
height: `${heightPct}%`,
|
| 120 |
+
background: active ? "var(--accent)" : "var(--border)",
|
| 121 |
+
}}
|
| 122 |
+
/>
|
| 123 |
+
);
|
| 124 |
+
})}
|
| 125 |
+
</div>
|
| 126 |
+
|
| 127 |
+
{Number.isFinite(Number(duration)) && Number(duration) > 0 && (
|
| 128 |
+
<div className="flex-shrink-0 text-xs font-mono" style={{ color: "var(--text-muted)" }}>
|
| 129 |
+
{Number(duration)}s
|
| 130 |
+
</div>
|
| 131 |
+
)}
|
| 132 |
+
</div>
|
| 133 |
+
);
|
| 134 |
+
}
|
_source/src/hooks/useModel.js
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useRef, useCallback, useEffect } from "react";

/**
 * React hook that owns the generation web worker (../worker.js) and exposes its
 * lifecycle as plain state: load progress, generation status, and the resulting
 * audio as a blob object URL.
 *
 * Object-URL ownership: this hook creates every URL it hands out and revokes it
 * on (a) a new "audio" message, (b) the start of a new generation, and (c) unmount.
 * Callers must not revoke `audioUrl` themselves.
 */
export function useModel() {
  const workerRef = useRef(null);
  // Mirror of the currently-live object URL so cleanup can revoke it without
  // depending on state closure freshness.
  const audioUrlRef = useRef(null);
  const [status, setStatus] = useState("idle");       // "idle" | "loading" | "ready" | "generating" | "error"
  const [message, setMessage] = useState("");         // last human-readable status line from the worker
  const [progress, setProgress] = useState(null);     // download progress payload, or null when not loading
  const [audioUrl, setAudioUrl] = useState(null);     // blob: URL of the latest generated WAV
  const [audioInfo, setAudioInfo] = useState(null);   // { duration, diffusionTime, totalTime, filename }
  const [error, setError] = useState(null);
  const [isLoaded, setIsLoaded] = useState(false);

  // Revoke a URL owned by this hook and forget it.
  const revokeCurrentAudioUrl = useCallback(() => {
    if (audioUrlRef.current) {
      URL.revokeObjectURL(audioUrlRef.current);
      audioUrlRef.current = null;
    }
  }, []);

  // Spawn the worker once on mount; terminate and release the URL on unmount.
  useEffect(() => {
    const worker = new Worker(new URL("../worker.js", import.meta.url), {
      type: "module",
    });

    // Message protocol from worker.js: status | progress | loaded | audio | error.
    worker.onmessage = (e) => {
      const { type, ...data } = e.data;
      switch (type) {
        case "status":
          setMessage(data.message);
          break;
        case "progress":
          setProgress(data);
          break;
        case "loaded":
          setIsLoaded(true);
          setStatus("ready");
          setProgress(null);
          break;
        case "audio": {
          // Revoke any previous URL owned by this hook before overwriting.
          if (audioUrlRef.current) URL.revokeObjectURL(audioUrlRef.current);
          const blob = new Blob([data.wavBuffer], { type: "audio/wav" });
          const url = URL.createObjectURL(blob);
          audioUrlRef.current = url;
          setAudioUrl(url);
          setAudioInfo({
            duration: data.duration,
            diffusionTime: data.diffusionTime,
            totalTime: data.totalTime,
            filename: `ace-step-${data.filenameStamp || Date.now()}.wav`,
          });
          setStatus("ready");
          setMessage("Generation complete!");
          break;
        }
        case "error":
          setError(data.message);
          setStatus("error");
          console.error("Worker error:", data.message, data.stack);
          break;
      }
    };

    workerRef.current = worker;
    return () => {
      worker.terminate();
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
        audioUrlRef.current = null;
      }
    };
  }, []);

  // Kick off model download/initialization in the worker. Idempotence is the
  // worker's concern; this just transitions UI state and posts the request.
  const loadModel = useCallback(() => {
    setStatus("loading");
    setError(null);
    workerRef.current?.postMessage({ type: "load" });
  }, []);

  // Request a generation. Clears the previous result first so the UI shows a
  // clean "generating" state and the old blob URL is released promptly.
  const generate = useCallback(({ caption, lyrics, duration, shift, numSteps }) => {
    setStatus("generating");
    setError(null);
    // Revoke the previous URL when user starts a new gen so the next "audio" message
    // doesn't compete with a still-displayed blob.
    revokeCurrentAudioUrl();
    setAudioUrl(null);
    setAudioInfo(null);
    workerRef.current?.postMessage({
      type: "generate",
      caption,
      lyrics,
      duration,
      shift,
      numSteps,
    });
  }, [revokeCurrentAudioUrl]);

  return {
    status,
    message,
    progress,
    audioUrl,
    audioInfo,
    error,
    isLoaded,
    loadModel,
    generate,
  };
}
|
_source/src/index.css
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@import "tailwindcss";

/* Design tokens — dark theme with a purple accent, all in oklch so lightness
   steps stay perceptually even. Components read these via var(--*). */
:root {
  --bg: oklch(0.13 0.006 260);
  --bg-elev: oklch(0.17 0.008 260);
  --surface: oklch(0.22 0.01 260);
  --border: oklch(0.28 0.01 260);
  --text: oklch(0.95 0.005 260);
  --text-muted: oklch(0.65 0.01 260);
  --text-dim: oklch(0.45 0.008 260);
  --accent: oklch(0.72 0.17 305);
  --accent-glow: oklch(0.80 0.18 305);
  --accent-soft: oklch(0.72 0.17 305 / 0.15);
  --success: oklch(0.72 0.14 155);
  --danger: oklch(0.65 0.2 22);
}

/* Full-height app shell. */
html, body, #root {
  min-height: 100vh;
  margin: 0;
}

body {
  background: var(--bg);
  color: var(--text);
  font-family: "Hanken Grotesk", system-ui, -apple-system, sans-serif;
  font-weight: 400;
  letter-spacing: -0.005em;
  -webkit-font-smoothing: antialiased;
}

code, pre, .font-mono {
  font-family: "JetBrains Mono", ui-monospace, "Consolas", monospace;
}

/* Animated generation placeholder — pulse bars like ace-step-jam */
@keyframes wave-pulse {
  0%, 100% { transform: scaleY(0.3); opacity: 0.2; }
  50% { transform: scaleY(1); opacity: 0.6; }
}

@keyframes fade-in {
  from { opacity: 0; transform: translateY(4px); }
  to { opacity: 1; transform: translateY(0); }
}

@keyframes soft-glow {
  0%, 100% { box-shadow: 0 0 20px oklch(0.72 0.17 305 / 0.2); }
  50% { box-shadow: 0 0 40px oklch(0.72 0.17 305 / 0.5); }
}

/* Applied per-bar by PulseBars.jsx with a staggered animation-delay. */
.pulse-bar {
  animation: wave-pulse 1.2s ease-in-out infinite;
  transform-origin: bottom;
}

.fade-in {
  animation: fade-in 0.3s ease-out;
}

.glow {
  animation: soft-glow 2s ease-in-out infinite;
}

/* Range slider styling (WebKit-only pseudo-elements; other engines get the
   browser-default track/thumb). */
input[type="range"] {
  -webkit-appearance: none;
  appearance: none;
  background: transparent;
  cursor: pointer;
}
input[type="range"]::-webkit-slider-runnable-track {
  height: 4px;
  background: var(--border);
  border-radius: 4px;
}
input[type="range"]::-webkit-slider-thumb {
  -webkit-appearance: none;
  appearance: none;
  height: 16px;
  width: 16px;
  background: var(--accent);
  border-radius: 50%;
  margin-top: -6px; /* center the 16px thumb on the 4px track */
  box-shadow: 0 0 12px oklch(0.72 0.17 305 / 0.4);
}
input[type="range"]:focus { outline: none; }

/* Scrollbar in textareas */
textarea::-webkit-scrollbar { width: 6px; }
textarea::-webkit-scrollbar-track { background: transparent; }
textarea::-webkit-scrollbar-thumb { background: var(--border); border-radius: 3px; }
textarea::-webkit-scrollbar-thumb:hover { background: var(--text-dim); }
|
_source/src/lm-worker.js
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Dedicated worker for the 5Hz LM. Isolated WASM heap lets the 1.77 GB model
|
| 2 |
+
// load without competing with DiT + encoders in the main worker.
|
| 3 |
+
import { AutoTokenizer } from "@huggingface/transformers";
|
| 4 |
+
import * as ort from "onnxruntime-web/webgpu";
|
| 5 |
+
|
| 6 |
+
const MODEL_REPO = "shreyask/ACE-Step-v1.5-ONNX";
// Pin an exact commit so cached weights never silently change under us.
const MODEL_REVISION = "bdabfb5684fd70fcc76f98cbb51bb9ebc47ee342";
const ONNX_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/${MODEL_REVISION}/onnx`;
const LM_TOKENIZER_REPO = "ACE-Step/acestep-5Hz-lm-0.6B";
// Cache Storage bucket name; bump the version suffix to invalidate stale blobs.
const CACHE_NAME = "ace-step-onnx-v12";

// KV-cache geometry fed to/returned by the ONNX graph (see createEmptyKV/extractKV);
// these must match the exported lm_kv_q4 model.
const NUM_KV_LAYERS = 28;
const NUM_KV_HEADS = 8;
const KV_HEAD_DIM = 128;
// Logits row width per position.
const VOCAB_SIZE = 217204;
// Valid range for <|audio_code_N|> values — extracted codes are clamped to [0, NUM_CODES-1].
const NUM_CODES = 64000;
// 25Hz latent frames per 5Hz LM code (generate() divides frame count by this).
const POOL_WINDOW = 5;
// Token id treated as end-of-generation (presumably the chat-template end token —
// TODO confirm against the tokenizer's special-token map).
const EOS_ID = 151645;

// Populated by loadModel(); null until the "load" message has been handled.
let tokenizer = null;
let session = null;
|
| 22 |
+
|
| 23 |
+
// Post a typed message to the main thread. Extra fields from `data` are merged
// after `type`, so a `type` key in `data` would intentionally win.
function post(type, data = {}) {
  const message = Object.assign({ type }, data);
  self.postMessage(message);
}
|
| 26 |
+
|
| 27 |
+
/**
 * Fetch `url` as an ArrayBuffer with Cache Storage memoization and streaming
 * progress posts (type "progress", tagged with `label`).
 *
 * Cache hits short-circuit with a single 100% progress post. Cache writes are
 * best-effort: quota/opaque failures are swallowed and the download still returns.
 *
 * @param {string} url - absolute URL of the asset to download.
 * @param {string} label - human-readable tag echoed in progress messages.
 * @returns {Promise<ArrayBuffer>} the full response body.
 * @throws {Error} on a non-2xx HTTP response.
 */
async function fetchBuffer(url, label) {
  const cache = await caches.open(CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) {
    post("progress", { label, loaded: 1, total: 1, percent: 100 });
    return await cached.arrayBuffer();
  }
  const response = await fetch(url);
  // Bug fix: previously a 404/5xx streamed (and cached!) the HTML error body as
  // if it were model data, producing a confusing failure later in session
  // creation. Fail loudly at the source instead.
  if (!response.ok) {
    throw new Error(`Failed to fetch ${label}: HTTP ${response.status} for ${url}`);
  }
  // content-length may be absent (chunked/compressed) — then percent posts are skipped.
  const total = Number.parseInt(response.headers.get("content-length") || "0", 10);
  const reader = response.body.getReader();
  const chunks = [];
  let loaded = 0;
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.length;
    if (total > 0) post("progress", { label, loaded, total, percent: (loaded / total) * 100 });
  }
  // Reassemble the streamed chunks into one contiguous buffer.
  const buf = new Uint8Array(loaded);
  let offset = 0;
  for (const c of chunks) { buf.set(c, offset); offset += c.length; }
  try {
    // Store a copy (slice) so the returned buffer isn't detached/shared with the cache.
    await cache.put(url, new Response(buf.buffer.slice(0), { headers: { "Content-Type": "application/octet-stream" } }));
  } catch (_) {}
  return buf.buffer;
}
|
| 54 |
+
|
| 55 |
+
// Shorthand for constructing an onnxruntime-web tensor; dtype defaults to float32.
function tensor(data, dims, type = "float32") {
  return new ort.Tensor(type, data, dims);
}
|
| 58 |
+
|
| 59 |
+
/**
 * Download tokenizer + ONNX graph/weights and create the inference session,
 * posting "status"/"progress" along the way and "loaded" on success.
 * Assigns the module-level `tokenizer` and `session`.
 */
async function loadModel() {
  // Configure the WASM backend before session creation: single-threaded
  // (isolated heap for this worker) with SIMD enabled.
  ort.env.wasm.numThreads = 1;
  ort.env.wasm.simd = true;

  post("status", { message: "Loading LM tokenizer..." });
  tokenizer = await AutoTokenizer.from_pretrained(LM_TOKENIZER_REPO);

  post("status", { message: "Loading LM graph..." });
  const graphBuf = await fetchBuffer(`${ONNX_BASE}/lm_kv_q4.onnx`, "LM graph");

  post("status", { message: "Loading LM weights (1.24 GB q4)..." });
  const weightsBuf = await fetchBuffer(`${ONNX_BASE}/lm_kv_q4.onnx.data`, "LM weights");

  post("status", { message: "Creating LM session..." });
  // Try WebGPU first (faster), fall back to WASM if unsupported ops
  try {
    session = await ort.InferenceSession.create(graphBuf, {
      executionProviders: ["webgpu"],
      // Weights live in an external-data file; the path must match the graph's reference.
      externalData: [{ path: "lm_kv_q4.onnx.data", data: weightsBuf }],
    });
    post("status", { message: "LM on WebGPU" });
  } catch (err) {
    console.warn("LM WebGPU failed, falling back to WASM:", err.message);
    session = await ort.InferenceSession.create(graphBuf, {
      executionProviders: ["wasm"],
      externalData: [{ path: "lm_kv_q4.onnx.data", data: weightsBuf }],
    });
    post("status", { message: "LM on WASM (WebGPU unsupported)" });
  }

  post("status", { message: "LM ready" });
  post("loaded");
}
|
| 92 |
+
|
| 93 |
+
// Build the zero-length past_key_values feed used for the prefill step: one
// empty [1, NUM_KV_HEADS, 0, KV_HEAD_DIM] key and value tensor per layer.
function createEmptyKV() {
  const feeds = {};
  for (let layer = 0; layer < NUM_KV_LAYERS; layer++) {
    for (const part of ["key", "value"]) {
      feeds[`past_key_values.${layer}.${part}`] = tensor(
        new Float32Array(0),
        [1, NUM_KV_HEADS, 0, KV_HEAD_DIM],
      );
    }
  }
  return feeds;
}
|
| 101 |
+
|
| 102 |
+
// Re-key the session's `present.*` outputs as the next step's
// `past_key_values.*` inputs (tensors are passed through untouched).
function extractKV(outputs) {
  const pairs = [];
  for (let layer = 0; layer < NUM_KV_LAYERS; layer++) {
    pairs.push(
      [`past_key_values.${layer}.key`, outputs[`present.${layer}.key`]],
      [`past_key_values.${layer}.value`, outputs[`present.${layer}.value`]],
    );
  }
  return Object.fromEntries(pairs);
}
|
| 110 |
+
|
| 111 |
+
/**
 * Sample one token id from raw logits.
 * Pipeline: repetition penalty (over the trailing `repWindow` tokens) →
 * temperature scaling → top-k truncation → top-p (nucleus) cutoff →
 * multinomial draw. Returns an index into `logits`.
 */
function sampleToken(logits, recentTokens, { temperature = 0.8, topK = 200, topP = 0.95, repetitionPenalty = 1.05, repWindow = 64 } = {}) {
  const vocab = logits.length;
  const adjusted = Float32Array.from(logits);

  // Penalize tokens seen recently: divide positive logits, multiply negative
  // ones (the standard CTRL-style penalty direction).
  if (repetitionPenalty !== 1.0 && recentTokens.length > 0) {
    for (const tok of new Set(recentTokens.slice(-repWindow))) {
      if (tok >= 0 && tok < vocab) {
        adjusted[tok] = adjusted[tok] > 0
          ? adjusted[tok] / repetitionPenalty
          : adjusted[tok] * repetitionPenalty;
      }
    }
  }

  // Temperature scaling (skip the no-op cases).
  if (temperature > 0 && temperature !== 1.0) {
    const scale = 1.0 / temperature;
    for (let i = 0; i < vocab; i++) adjusted[i] *= scale;
  }

  // Top-K via full sort (good enough — sort overhead << LM forward pass).
  const k = Math.min(topK, vocab);
  const order = Array.from({ length: vocab }, (_, i) => i);
  order.sort((a, b) => adjusted[b] - adjusted[a]);
  const candidates = order.slice(0, k);

  // Numerically stable softmax over the k candidates (subtract the max).
  let best = -Infinity;
  for (const i of candidates) if (adjusted[i] > best) best = adjusted[i];
  const probs = new Float64Array(k);
  let total = 0;
  for (let i = 0; i < k; i++) {
    const e = Math.exp(adjusted[candidates[i]] - best);
    probs[i] = e;
    total += e;
  }
  for (let i = 0; i < k; i++) probs[i] /= total;

  // Nucleus cutoff: smallest prefix whose cumulative mass reaches topP.
  let mass = 0;
  let nucleus = k;
  for (let i = 0; i < k; i++) {
    mass += probs[i];
    if (mass >= topP) { nucleus = i + 1; break; }
  }

  // Multinomial draw within the nucleus (renormalized by its own mass).
  let nucleusMass = 0;
  for (let i = 0; i < nucleus; i++) nucleusMass += probs[i];
  let threshold = Math.random() * nucleusMass;
  for (let i = 0; i < nucleus; i++) {
    threshold -= probs[i];
    if (threshold < 0) return candidates[i];
  }
  return candidates[nucleus - 1];
}
|
| 170 |
+
|
| 171 |
+
// Assemble the chat-format prompt for the 5Hz LM: instruction, caption,
// lyrics (or "[instrumental]" when blank), and metadata sections, wrapped in
// <|im_start|>user ... <|im_end|> / <|im_start|>assistant markers.
function buildPrompt(caption, lyrics, duration, language = "en") {
  const instruction = "Generate audio semantic tokens based on the given conditions";
  const hasLyrics = lyrics.trim().length > 0;
  const lyricsSection = hasLyrics
    ? `# Languages\n${language}\n\n# Lyrics\n${lyrics}`
    : "# Lyrics\n[instrumental]";
  const sections = [
    `# Instruction\n${instruction}`,
    `# Caption\n${caption}`,
    lyricsSection,
    `# Metas\n- language: ${language}\n- duration: ${duration} seconds`,
  ];
  const userPrompt = `${sections.join("\n\n")}\n<|endoftext|>\n`;
  return `<|im_start|>user\n${userPrompt}<|im_end|>\n<|im_start|>assistant\n`;
}
|
| 179 |
+
|
| 180 |
+
/**
 * Run the autoregressive 5Hz LM: prefill the prompt, then decode token-by-token
 * with a KV cache, stopping at EOS or once enough <|audio_code_N|> tokens have
 * been produced. Posts "audio_codes" with an Int32Array of code ids.
 *
 * @param {string} caption - style/genre prompt text.
 * @param {string} lyrics - lyric text ("" for instrumental).
 * @param {number} duration - target clip length in seconds.
 * @param {number} numLatentFrames - 25Hz frame count; divided by POOL_WINDOW for the 5Hz code budget.
 */
async function generate({ caption, lyrics, duration, numLatentFrames }) {
  const numCodes5Hz = Math.ceil(numLatentFrames / POOL_WINDOW);
  post("status", { message: `LM: generating ~${numCodes5Hz} codes...` });

  const prompt = buildPrompt(caption, lyrics, Math.round(duration));
  const encoded = tokenizer(prompt);
  const promptIds = Array.from(encoded.input_ids.data, Number);
  // CoT metadata ~150 tokens + numCodes5Hz audio codes + some slack
  const maxNewTokens = Math.min(numCodes5Hz + 250, 600);
  // NOTE: /g regex is stateful (lastIndex) — the decode loop resets it after each .test().
  const audioCodeTokenRegex = /<\|audio_code_(\d+)\|>/g;

  const startTime = performance.now();
  const allIds = [...promptIds];

  // Prefill: run the whole prompt through the graph once with empty KV to seed the cache.
  post("status", { message: `LM prefill (${promptIds.length} tokens)...` });
  const prefillIds = new BigInt64Array(promptIds.map(BigInt));
  const prefillMask = new BigInt64Array(promptIds.length).fill(1n);
  const prefillPos = new BigInt64Array(promptIds.map((_, i) => BigInt(i)));

  let outputs = await session.run({
    input_ids: tensor(prefillIds, [1, promptIds.length], "int64"),
    attention_mask: tensor(prefillMask, [1, promptIds.length], "int64"),
    position_ids: tensor(prefillPos, [1, promptIds.length], "int64"),
    ...createEmptyKV(),
  });
  let kv = extractKV(outputs);

  // Logits for the final prompt position determine the first sampled token.
  let lastLogits = outputs.logits.data.slice((promptIds.length - 1) * VOCAB_SIZE, promptIds.length * VOCAB_SIZE);
  let nextToken = sampleToken(lastLogits, allIds);
  allIds.push(nextToken);

  // Decode loop — exit early once we have enough audio codes
  let codesSoFar = 0;
  for (let step = 0; step < maxNewTokens - 1; step++) {
    if (nextToken === EOS_ID) break;
    if (codesSoFar >= numCodes5Hz) break; // have enough codes, stop early
    if (step % 20 === 0) {
      const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
      const tps = (step / Math.max(parseFloat(elapsed), 0.1)).toFixed(1);
      post("status", { message: `LM: ${step} tokens, ${codesSoFar}/${numCodes5Hz} codes (${tps} tok/s)` });
    }

    // Single-token step: feed only the new token; the KV cache carries the history.
    const seqLen = allIds.length;
    outputs = await session.run({
      input_ids: tensor(new BigInt64Array([BigInt(nextToken)]), [1, 1], "int64"),
      attention_mask: tensor(new BigInt64Array(seqLen).fill(1n), [1, seqLen], "int64"),
      position_ids: tensor(new BigInt64Array([BigInt(seqLen - 1)]), [1, 1], "int64"),
      ...kv,
    });
    kv = extractKV(outputs);
    lastLogits = outputs.logits.data.slice(0, VOCAB_SIZE);
    nextToken = sampleToken(lastLogits, allIds);
    allIds.push(nextToken);

    // Streaming decode — check if this token is an audio code
    const tokText = tokenizer.decode([nextToken], { skip_special_tokens: false });
    if (audioCodeTokenRegex.test(tokText)) codesSoFar++;
    audioCodeTokenRegex.lastIndex = 0;
  }

  const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
  const generatedIds = allIds.slice(promptIds.length);
  const outputText = tokenizer.decode(generatedIds, { skip_special_tokens: false });
  console.log(`[lm] ${generatedIds.length} tokens in ${elapsed}s`);

  // Find end of thinking
  const thinkEnd = outputText.indexOf("</think>");
  console.log("[lm] CoT length:", thinkEnd >= 0 ? thinkEnd : "no </think> found");
  console.log("[lm] preview (CoT):", thinkEnd >= 0 ? outputText.slice(0, thinkEnd + 10) : outputText.slice(0, 500));
  console.log("[lm] preview (after think):", thinkEnd >= 0 ? outputText.slice(thinkEnd, thinkEnd + 500) : "(n/a)");

  // Extract every audio code from the decoded text, clamped to the valid range.
  const audioCodes = [];
  for (const m of outputText.matchAll(/<\|audio_code_(\d+)\|>/g)) {
    audioCodes.push(Math.min(Math.max(parseInt(m[1]), 0), NUM_CODES - 1));
  }
  console.log(`[lm] extracted ${audioCodes.length} audio codes, first 10:`, audioCodes.slice(0, 10));
  // Truncate if too many but DON'T zero-pad — main worker uses last-frame padding in 25Hz space (matches MLX port)
  const codes = new Int32Array(audioCodes.slice(0, numCodes5Hz));

  post("audio_codes", { codes, elapsed, tokenCount: generatedIds.length });
}
|
| 262 |
+
|
| 263 |
+
// Worker entry point: dispatch "load"/"generate" requests and surface any
// failure to the main thread as an "error" message.
self.onmessage = async (e) => {
  const { type, ...payload } = e.data;
  try {
    switch (type) {
      case "load":
        await loadModel();
        break;
      case "generate":
        await generate(payload);
        break;
    }
  } catch (err) {
    post("error", { message: err.message, stack: err.stack });
  }
};
|
_source/src/main.jsx
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { StrictMode } from 'react'
|
| 2 |
+
import { createRoot } from 'react-dom/client'
|
| 3 |
+
import './index.css'
|
| 4 |
+
import App from './App.jsx'
|
| 5 |
+
|
| 6 |
+
createRoot(document.getElementById('root')).render(
|
| 7 |
+
<StrictMode>
|
| 8 |
+
<App />
|
| 9 |
+
</StrictMode>,
|
| 10 |
+
)
|
_source/src/worker.js
ADDED
|
@@ -0,0 +1,665 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Main worker: DiT + encoders + VAE on WebGPU. Spawns a dedicated LM worker
// (isolated WASM heap) for autoregressive generation.
import { AutoTokenizer } from "@huggingface/transformers";
import * as ort from "onnxruntime-web/webgpu";

// Model assets are pinned to a specific HF repo revision for reproducibility.
const MODEL_REPO = "shreyask/ACE-Step-v1.5-ONNX";
const MODEL_REVISION = "bdabfb5684fd70fcc76f98cbb51bb9ebc47ee342";
const ONNX_BASE = `https://huggingface.co/${MODEL_REPO}/resolve/${MODEL_REVISION}/onnx`;
const TEXT_TOKENIZER_REPO = "Qwen/Qwen3-Embedding-0.6B";

// Audio / latent geometry constants.
const SAMPLE_RATE = 48000;      // output waveform sample rate (Hz)
const LATENT_RATE = 25;         // latent frames per second of audio
const LATENT_CHANNELS = 64;     // channels per latent frame
const HIDDEN_SIZE = 2048;       // conditioning/embedding hidden dimension
const POOL_WINDOW = 5;          // 25Hz -> 5Hz pooling factor
const FSQ_DIM = 6;              // FSQ code vector dimensionality
const NUM_CODES = 64000;        // FSQ codebook size (valid code index range)

// 8-step turbo schedules (from ACE-Step), keyed by the shift factor.
// Used verbatim by buildSchedule() when numSteps === 8.
const SHIFT_TIMESTEPS_8 = {
  1.0: [1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125],
  2.0: [1.0, 0.9333, 0.8571, 0.7692, 0.6667, 0.5455, 0.4, 0.2222],
  3.0: [1.0, 0.9545, 0.9, 0.8333, 0.75, 0.6429, 0.5, 0.3],
};
|
| 25 |
+
|
| 26 |
+
// Generate N-step shifted schedule matching MLX port:
//   timesteps = linspace(1.0, 0.001, N)
//   sigmas    = shift * t / (1 + (shift - 1) * t)
//
// @param {number} numSteps - number of denoising steps (>= 1)
// @param {number} shift    - flow-matching shift factor (1.0 / 2.0 / 3.0 typical)
// @returns {number[]} monotonically decreasing timestep schedule of length numSteps
function buildSchedule(numSteps, shift) {
  // Prefer the hand-tuned 8-step turbo tables when one exists for this shift.
  if (numSteps === 8 && SHIFT_TIMESTEPS_8[shift]) return SHIFT_TIMESTEPS_8[shift];
  const sigmaMax = 1.0;
  const sigmaMin = 0.001;
  // Guard: the linspace below divides by (numSteps - 1); with numSteps === 1 the
  // original produced 0/0 = NaN. A single-step schedule is just the shifted
  // sigmaMax (which evaluates to exactly 1.0 for any shift).
  if (numSteps === 1) return [(shift * sigmaMax) / (1.0 + (shift - 1.0) * sigmaMax)];
  const schedule = [];
  for (let i = 0; i < numSteps; i++) {
    // linspace inclusive of both endpoints
    const t = sigmaMax + (sigmaMin - sigmaMax) * (i / (numSteps - 1));
    const tShifted = (shift * t) / (1.0 + (shift - 1.0) * t);
    schedule.push(tShifted);
  }
  return schedule;
}
|
| 42 |
+
|
| 43 |
+
// Bump the suffix to invalidate previously cached model downloads.
const CACHE_NAME = "ace-step-onnx-v12";

// Lazily-initialized module state, populated by loadModels().
let textTokenizer = null;   // Qwen3 tokenizer shared by caption + lyric encoding
let sessions = {};          // model name -> ort.InferenceSession
let silenceLatent = null;   // Float32Array silence reference latent (timbre / ref audio)
let fsqCodebooks = null;    // Float32Array, indexed as [code * FSQ_DIM + d] in fsqLookup
let fsqScales = null;       // Float32Array of FSQ_DIM per-dimension scales
let fsqProjectOutW = null;  // Float32Array projection weight, indexed [h * FSQ_DIM + d]
let fsqProjectOutB = null;  // Float32Array projection bias of length HIDDEN_SIZE
let lmWorker = null;        // dedicated LM Worker instance (spawned on demand)
let lmLoaded = false;       // set true once the LM worker reports "loaded"
|
| 54 |
+
|
| 55 |
+
// Relay a typed message to the main thread; extra fields in `data` are merged
// into the payload (a `type` key inside `data` wins, as with object spread).
function post(type, data = {}) {
  const payload = { type };
  Object.assign(payload, data);
  self.postMessage(payload);
}
|
| 58 |
+
|
| 59 |
+
/**
 * Download a binary asset with Cache Storage-backed persistence and streaming
 * progress reporting.
 *
 * @param {string} url   - absolute or site-relative URL of the asset
 * @param {string} label - human-readable label used in "progress" messages
 * @returns {Promise<ArrayBuffer>} the downloaded (or cached) bytes
 */
async function fetchBuffer(url, label) {
  const cache = await caches.open(CACHE_NAME);
  const cached = await cache.match(url);
  if (cached) {
    // Cache hit: report instant completion so the UI progress bar fills.
    post("progress", { label, loaded: 1, total: 1, percent: 100 });
    return await cached.arrayBuffer();
  }

  const response = await fetch(url);
  // content-length may be absent (e.g. chunked encoding) — progress events
  // are simply skipped in that case (total stays 0).
  const total = parseInt(response.headers.get("content-length") || "0");
  const reader = response.body.getReader();
  const chunks = [];
  let loaded = 0;

  // Stream the body chunk by chunk so we can emit incremental progress.
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.length;
    if (total > 0) post("progress", { label, loaded, total, percent: (loaded / total) * 100 });
  }

  // Reassemble the chunks into one contiguous buffer.
  const buffer = new Uint8Array(loaded);
  let offset = 0;
  for (const chunk of chunks) { buffer.set(chunk, offset); offset += chunk.length; }

  try {
    // slice(0) copies the bytes so the cached Response owns its own buffer.
    // Cache failures (e.g. quota exceeded) are non-fatal: persistence is
    // best-effort and the freshly downloaded bytes are still returned.
    await cache.put(url, new Response(buffer.buffer.slice(0), {
      headers: { "Content-Type": "application/octet-stream" },
    }));
  } catch (_) {}

  return buffer.buffer;
}
|
| 93 |
+
|
| 94 |
+
/**
 * Create an ONNX Runtime InferenceSession for one model file, with download
 * progress reporting via fetchBuffer().
 *
 * @param {string}   name       - human-readable model name for status messages
 * @param {string}   filename   - model graph filename under ONNX_BASE
 * @param {boolean}  useUrlData - if true, let ORT stream external weights
 *                                straight from the URL instead of downloading
 *                                them into this worker's heap (needed for
 *                                multi-GB weight files)
 * @param {string[]} providers  - ORT execution providers (default WebGPU)
 * @returns {Promise<ort.InferenceSession>}
 * @throws {Error} wrapping the underlying failure, with `cause` preserved
 */
async function loadSession(name, filename, useUrlData = false, providers = ["webgpu"]) {
  post("status", { message: `Loading ${name}...` });
  try {
    const modelBuffer = await fetchBuffer(`${ONNX_BASE}/${filename}`, `${name} graph`);
    if (useUrlData) {
      // URL-backed external data: ORT fetches `<filename>.data` itself.
      return await ort.InferenceSession.create(modelBuffer, {
        executionProviders: providers,
        externalData: [{ path: `${filename}.data`, data: `${ONNX_BASE}/${filename}.data` }],
      });
    }
    // Otherwise download the weights ourselves (cached via fetchBuffer).
    const weightsBuffer = await fetchBuffer(`${ONNX_BASE}/${filename}.data`, `${name} weights`);
    return await ort.InferenceSession.create(modelBuffer, {
      executionProviders: providers,
      externalData: [{ path: `${filename}.data`, data: weightsBuffer }],
    });
  } catch (err) {
    // Preserve the original error as `cause` so the stack isn't lost.
    throw new Error(`Failed loading ${name}: ${err.message}`, { cause: err });
  }
}
|
| 113 |
+
|
| 114 |
+
// Shorthand for constructing an ort.Tensor; dtype defaults to float32.
function tensor(data, dims, type = "float32") {
  const t = new ort.Tensor(type, data, dims);
  return t;
}
|
| 117 |
+
|
| 118 |
+
// Debug helper: log length/min/max/mean of a numeric buffer to the console.
function tensorStats(name, data) {
  const values = data instanceof Float32Array ? data : new Float32Array(data);
  let min = Infinity;
  let max = -Infinity;
  let sum = 0;
  for (const v of values) {
    if (v < min) min = v;
    if (v > max) max = v;
    sum += v;
  }
  const mean = sum / values.length;
  console.log(`[stats] ${name}: len=${values.length} min=${min.toFixed(4)} max=${max.toFixed(4)} mean=${mean.toFixed(4)}`);
}
|
| 128 |
+
|
| 129 |
+
/**
 * Fill a Float32Array with standard-normal samples using the Box-Muller
 * transform (two samples per iteration).
 *
 * @param {number[]} shape - tensor shape; the product of its entries gives the
 *                           element count
 * @returns {Float32Array} N(0,1) samples of length prod(shape)
 */
function randn(shape) {
  const size = shape.reduce((a, b) => a * b, 1);
  const data = new Float32Array(size);
  for (let i = 0; i < size; i += 2) {
    // Math.random() returns [0, 1); map to (0, 1] so Math.log never sees 0
    // (log(0) = -Infinity would have produced infinite samples).
    const u1 = 1 - Math.random();
    const u2 = Math.random();
    const r = Math.sqrt(-2 * Math.log(u1));
    data[i] = r * Math.cos(2 * Math.PI * u2);
    // Guard the pair write for odd sizes.
    if (i + 1 < size) data[i + 1] = r * Math.sin(2 * Math.PI * u2);
  }
  return data;
}
|
| 141 |
+
|
| 142 |
+
/**
 * Pack two padded sequences into a single sequence per batch row, moving all
 * valid (mask > 0) tokens to the front while preserving their relative order:
 * valid tokens of sequence 1, then valid tokens of sequence 2, then padding.
 * Relies on Array.prototype.sort being stable (guaranteed since ES2019):
 * sorting by mask descending keeps the original within-mask-value order.
 *
 * @param {Float32Array} hidden1 - [batchSize, l1, dim] flattened hidden states
 * @param {Float32Array} mask1   - [batchSize, l1] attention mask (0/1)
 * @param {Float32Array} hidden2 - [batchSize, l2, dim] flattened hidden states
 * @param {Float32Array} mask2   - [batchSize, l2] attention mask (0/1)
 * @param {number} batchSize
 * @param {number} dim
 * @returns {{hidden: Float32Array, mask: Float32Array, seqLen: number}}
 *          packed hidden states [batchSize, l1+l2, dim], binarized mask, and
 *          the combined sequence length
 */
function packSequences(hidden1, mask1, hidden2, mask2, batchSize, dim) {
  // Per-source sequence lengths are recovered from the flattened array sizes.
  const l1 = hidden1.length / (batchSize * dim);
  const l2 = hidden2.length / (batchSize * dim);
  const totalLen = l1 + l2;
  const packedHidden = new Float32Array(batchSize * totalLen * dim);
  const packedMask = new Float32Array(batchSize * totalLen);

  for (let b = 0; b < batchSize; b++) {
    // Build an index of every token from both sources with its mask value.
    const indices = [];
    for (let i = 0; i < l1; i++) indices.push({ src: 1, idx: i, mask: mask1[b * l1 + i] });
    for (let i = 0; i < l2; i++) indices.push({ src: 2, idx: i, mask: mask2[b * l2 + i] });
    // Stable sort by mask descending: valid tokens first, padding last.
    indices.sort((a, c) => c.mask - a.mask);

    // Copy each token's hidden vector into its packed position.
    for (let pos = 0; pos < totalLen; pos++) {
      const entry = indices[pos];
      const srcArray = entry.src === 1 ? hidden1 : hidden2;
      const srcLen = entry.src === 1 ? l1 : l2;
      const srcOffset = (b * srcLen + entry.idx) * dim;
      const dstOffset = (b * totalLen + pos) * dim;
      packedHidden.set(srcArray.slice(srcOffset, srcOffset + dim), dstOffset);
      // Binarize the mask (any positive value becomes exactly 1).
      packedMask[b * totalLen + pos] = entry.mask > 0 ? 1 : 0;
    }
  }
  return { hidden: packedHidden, mask: packedMask, seqLen: totalLen };
}
|
| 167 |
+
|
| 168 |
+
/**
 * Decode FSQ code indices into HIDDEN_SIZE-dim embeddings on the CPU:
 * embedding = projectOut(codebook[idx] * scales), i.e. each index selects a
 * FSQ_DIM-dim codebook vector which is scaled per-dimension and pushed through
 * an affine projection (weight indexed [h * FSQ_DIM + d], plus bias).
 *
 * Reads module globals fsqCodebooks / fsqScales / fsqProjectOutW /
 * fsqProjectOutB, which are populated by loadModels().
 *
 * @param {Int32Array|number[]} indices - [batchSize * seqLen] code indices
 * @param {number} batchSize
 * @param {number} seqLen
 * @returns {Float32Array} [batchSize, seqLen, HIDDEN_SIZE] flattened embeddings
 */
function fsqLookup(indices, batchSize, seqLen) {
  const out = new Float32Array(batchSize * seqLen * HIDDEN_SIZE);
  for (let b = 0; b < batchSize; b++) {
    for (let t = 0; t < seqLen; t++) {
      const idx = indices[b * seqLen + t];
      const codeOffset = idx * FSQ_DIM;
      // Scale the FSQ_DIM-dim code vector element-wise.
      const scaledCode = new Float32Array(FSQ_DIM);
      for (let d = 0; d < FSQ_DIM; d++) scaledCode[d] = fsqCodebooks[codeOffset + d] * fsqScales[d];
      // Affine projection up to HIDDEN_SIZE: out = W @ scaledCode + b.
      const outOffset = (b * seqLen + t) * HIDDEN_SIZE;
      for (let h = 0; h < HIDDEN_SIZE; h++) {
        let val = fsqProjectOutB[h];
        for (let d = 0; d < FSQ_DIM; d++) val += scaledCode[d] * fsqProjectOutW[h * FSQ_DIM + d];
        out[outOffset + h] = val;
      }
    }
  }
  return out;
}
|
| 186 |
+
|
| 187 |
+
// Spawn the LM worker and forward its status/progress/error messages up to
// the main thread. "loaded" and "audio_codes" are intentionally NOT forwarded
// here — they are consumed by the promise-based callers below.
function spawnLMWorker() {
  const workerUrl = new URL("./lm-worker.js", import.meta.url);
  const worker = new Worker(workerUrl, { type: "module" });
  const forwardedTypes = ["status", "progress", "error"];
  worker.onmessage = (event) => {
    if (forwardedTypes.includes(event.data.type)) {
      self.postMessage(event.data); // forward as-is
    }
  };
  return worker;
}
|
| 199 |
+
|
| 200 |
+
// Ask the LM worker to load its model. Spawns the worker on first use.
// Resolves when the worker posts "loaded"; rejects on "error".
function loadLMWorker() {
  return new Promise((resolve, reject) => {
    if (!lmWorker) {
      lmWorker = spawnLMWorker();
    }
    const handleMessage = (event) => {
      const { type } = event.data;
      if (type !== "loaded" && type !== "error") return;
      // One-shot listener: detach as soon as a terminal message arrives.
      lmWorker.removeEventListener("message", handleMessage);
      if (type === "loaded") {
        lmLoaded = true;
        resolve();
      } else {
        reject(new Error(event.data.message));
      }
    };
    lmWorker.addEventListener("message", handleMessage);
    lmWorker.postMessage({ type: "load" });
  });
}
|
| 217 |
+
|
| 218 |
+
// Request autoregressive audio-code generation from the LM worker.
// Resolves with the full "audio_codes" payload ({ codes, elapsed, tokenCount, ... });
// rejects if the worker posts "error".
function generateAudioCodesViaLM({ caption, lyrics, duration, numLatentFrames }) {
  return new Promise((resolve, reject) => {
    const handleMessage = (event) => {
      const { type } = event.data;
      if (type !== "audio_codes" && type !== "error") return;
      // One-shot listener: detach on the first terminal message.
      lmWorker.removeEventListener("message", handleMessage);
      if (type === "audio_codes") {
        resolve(event.data);
      } else {
        reject(new Error(event.data.message));
      }
    };
    lmWorker.addEventListener("message", handleMessage);
    lmWorker.postMessage({ type: "generate", caption, lyrics, duration, numLatentFrames });
  });
}
|
| 233 |
+
|
| 234 |
+
/**
 * Download and initialize every model session and auxiliary asset the pipeline
 * needs. Emits "status"/"progress" messages while loading and a final "loaded"
 * message. The LM worker loads in parallel with the main-worker sessions.
 *
 * NOTE(review): the sibling helpers encodeText/encodeLyrics/encodeTimbre
 * reference sessions (textProjector, lyricEncoder, timbreEncoder) that are
 * never created here — they appear superseded by the condition-encoder path
 * used in generateAudio; confirm before calling those helpers.
 */
async function loadModels() {
  // Single-threaded, non-proxied WASM keeps the ORT setup simple in a worker.
  ort.env.wasm.numThreads = 1;
  ort.env.wasm.simd = true;
  ort.env.wasm.proxy = false;

  console.log(`[models] ONNX revision ${MODEL_REVISION}`);
  post("status", { message: `Using ONNX revision ${MODEL_REVISION.slice(0, 7)}` });

  post("status", { message: "Spawning LM worker..." });
  // Kick off LM loading in parallel with main-worker model loads
  const lmLoadPromise = loadLMWorker();

  post("status", { message: "Loading text tokenizer..." });
  textTokenizer = await AutoTokenizer.from_pretrained(TEXT_TOKENIZER_REPO);

  sessions.embedTokens = await loadSession("Embed Tokens", "text_embed_tokens_fp16.onnx");
  sessions.detokenizer = await loadSession("Detokenizer", "detokenizer.onnx");
  // VAE on WASM — WebGPU produces constant output past ~1.5s for conv1d upsample chain
  sessions.vaeDecoder = await loadSession("VAE Decoder (CPU)", "vae_decoder_fp16.onnx", false, ["wasm"]);
  sessions.textEncoder = await loadSession("Text Encoder", "text_encoder_fp16.onnx", true);
  // FP32 condition_encoder — q4v2 had max_diff=13.92 vs PyTorch with real inputs,
  // degrading conditioning so badly that DiT output was garbled. FP32 is 2.4GB via URL.
  sessions.conditionEncoder = await loadSession("Condition Encoder (fp32)", "condition_encoder.onnx", true);
  // DEBUG: dit_decoder_fp16_v2 is the quality baseline (max_diff=0.021 per step).
  // dit_cached trades quality for speed (max_diff=0.074). Reverting while we diagnose
  // the ONNX-vs-MLX spectral gap — compounded drift over 8 steps matters here.
  sessions.ditDecoder = await loadSession("DiT Decoder (uncached)", "dit_decoder_fp16_v2.onnx", true);

  post("status", { message: "Loading auxiliary data..." });
  // FSQ tables + silence latent are small binary blobs fetched in parallel.
  const [cbBuf, scBuf, powBuf, pobBuf, silBuf] = await Promise.all([
    fetchBuffer(`${ONNX_BASE}/fsq_codebooks.bin`, "codebooks"),
    fetchBuffer(`${ONNX_BASE}/fsq_scales.bin`, "scales"),
    fetchBuffer(`${ONNX_BASE}/fsq_project_out_weight.bin`, "proj_out_w"),
    fetchBuffer(`${ONNX_BASE}/fsq_project_out_bias.bin`, "proj_out_b"),
    fetchBuffer("/silence_latent.bin", "silence latent"),
  ]);
  fsqCodebooks = new Float32Array(cbBuf);
  fsqScales = new Float32Array(scBuf);
  fsqProjectOutW = new Float32Array(powBuf);
  fsqProjectOutB = new Float32Array(pobBuf);
  silenceLatent = new Float32Array(silBuf);

  post("status", { message: "Waiting for LM worker..." });
  await lmLoadPromise;

  post("status", { message: "All models loaded!" });
  post("loaded");
}
|
| 282 |
+
|
| 283 |
+
// Assemble the SFT-format conditioning prompt consumed by the text encoder:
// instruction, caption, and metas sections separated by blank lines, ending
// with the EOS marker.
function buildSFTPrompt(caption, metas) {
  const sections = [
    "# Instruction\nFill the audio semantic mask based on the given conditions:",
    `# Caption\n${caption}`,
    `# Metas\n${metas}<|endoftext|>`,
  ];
  return sections.join("\n\n");
}
|
| 287 |
+
|
| 288 |
+
/**
 * Encode a caption + metas prompt into projected text hidden states.
 *
 * NOTE(review): uses sessions.textProjector, which loadModels() never creates
 * — this helper appears superseded by the inline encoding in generateAudio;
 * confirm before calling.
 *
 * @param {string} caption
 * @param {string} metas
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeText(caption, metas) {
  const prompt = buildSFTPrompt(caption, metas);
  // Fixed-length tokenization: pad/truncate to exactly 256 tokens.
  const encoded = textTokenizer(prompt, { padding: "max_length", max_length: 256, truncation: true });
  const idsRaw = encoded.input_ids.data;
  // ORT int64 input requires BigInt64Array; convert if the tokenizer returned
  // another numeric array type.
  const inputIds = idsRaw instanceof BigInt64Array ? idsRaw : new BigInt64Array(Array.from(idsRaw, BigInt));

  const result = await sessions.textEncoder.run({ input_ids: tensor(inputIds, [1, 256], "int64") });
  const projected = await sessions.textProjector.run({ text_hidden_states: result.hidden_states });

  // Convert the (possibly BigInt) attention mask into a Float32Array.
  const maskRaw = encoded.attention_mask.data;
  const attentionMask = new Float32Array(maskRaw.length);
  for (let i = 0; i < maskRaw.length; i++) attentionMask[i] = Number(maskRaw[i]);
  return { hidden: projected.projected.data, mask: attentionMask, seqLen: 256 };
}
|
| 302 |
+
|
| 303 |
+
/**
 * Encode lyrics text into lyric-encoder hidden states.
 *
 * NOTE(review): uses sessions.lyricEncoder, which loadModels() never creates —
 * this helper appears superseded by the condition-encoder path in
 * generateAudio; confirm before calling.
 *
 * @param {string} lyrics
 * @param {string} [language="en"] - language tag embedded in the prompt header
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 */
async function encodeLyrics(lyrics, language = "en") {
  const fullText = `# Languages\n${language}\n\n# Lyric\n${lyrics}`;
  // max_length=2048 matches the original handler (conditioning_text.py)
  const encoded = textTokenizer(fullText, { padding: "max_length", max_length: 2048, truncation: true });
  const idsRaw = encoded.input_ids.data;
  // ORT int64 input requires BigInt64Array; convert if needed.
  const inputIds = idsRaw instanceof BigInt64Array ? idsRaw : new BigInt64Array(Array.from(idsRaw, BigInt));
  const seqLen = inputIds.length;

  // Token ids -> embeddings, then embeddings + mask -> lyric hidden states.
  const embedResult = await sessions.embedTokens.run({ input_ids: tensor(inputIds, [1, seqLen], "int64") });
  const maskRaw = encoded.attention_mask.data;
  const attentionMask = new Float32Array(maskRaw.length);
  for (let i = 0; i < maskRaw.length; i++) attentionMask[i] = Number(maskRaw[i]);

  const lyricResult = await sessions.lyricEncoder.run({
    inputs_embeds: embedResult.hidden_states,
    attention_mask: tensor(attentionMask, [1, seqLen]),
  });
  return { hidden: lyricResult.hidden_states.data, mask: attentionMask, seqLen };
}
|
| 322 |
+
|
| 323 |
+
/**
 * Produce a single timbre embedding from the first 750 frames of the silence
 * reference latent.
 *
 * NOTE(review): uses sessions.timbreEncoder, which loadModels() never creates
 * — this helper appears superseded by the condition-encoder path in
 * generateAudio; confirm before calling.
 *
 * @returns {Promise<{hidden: Float32Array, mask: Float32Array, seqLen: number}>}
 *          a single HIDDEN_SIZE-dim embedding with a length-1 mask
 */
async function encodeTimbre() {
  const silenceRef = silenceLatent.slice(0, 750 * LATENT_CHANNELS);
  const result = await sessions.timbreEncoder.run({
    refer_audio: tensor(silenceRef, [1, 750, LATENT_CHANNELS]),
  });
  // Copy the model output into a fresh HIDDEN_SIZE-length buffer.
  const timbreHidden = new Float32Array(HIDDEN_SIZE);
  timbreHidden.set(result.timbre_embedding.data);
  return { hidden: timbreHidden, mask: new Float32Array([1.0]), seqLen: 1 };
}
|
| 332 |
+
|
| 333 |
+
/**
 * Full LM-hint pipeline: LM worker -> 5Hz FSQ codes -> CPU codebook lookup ->
 * detokenizer (5Hz -> 25Hz latents) -> pad/truncate to numLatentFrames.
 *
 * @param {string} caption
 * @param {string} lyrics
 * @param {number} numLatentFrames - target number of 25Hz latent frames
 * @param {number} duration - requested audio duration in seconds (passed to LM)
 * @returns {Promise<Float32Array>} [numLatentFrames, LATENT_CHANNELS] hints,
 *          all-zero if the LM produced no codes
 */
async function generateLMHints(caption, lyrics, numLatentFrames, duration) {
  const { codes, elapsed, tokenCount } = await generateAudioCodesViaLM({ caption, lyrics, duration, numLatentFrames });
  post("status", { message: `LM: ${codes.length} codes from ${tokenCount} tokens in ${elapsed}s` });

  // Graceful fallback: no codes -> silence hints (all zeros).
  if (codes.length === 0) {
    console.warn("[lm] No audio codes generated, returning silence");
    return new Float32Array(numLatentFrames * LATENT_CHANNELS);
  }

  const numCodes5Hz = codes.length;
  post("status", { message: "FSQ codebook lookup..." });
  const lmHints5Hz = fsqLookup(codes, 1, numCodes5Hz);
  tensorStats("lm_hints_5hz", lmHints5Hz);

  // The detokenizer upsamples 5Hz token embeddings into 25Hz latent frames.
  post("status", { message: "Detokenizing 5Hz → 25Hz..." });
  const detokResult = await sessions.detokenizer.run({
    quantized: tensor(lmHints5Hz, [1, numCodes5Hz, HIDDEN_SIZE]),
  });
  const lmHints25HzRaw = detokResult.lm_hints_25hz.data;
  const rawLen = lmHints25HzRaw.length / LATENT_CHANNELS;
  tensorStats("lm_hints_25hz_raw", lmHints25HzRaw);

  // Pad with last frame (MLX port behavior) or truncate
  const lmHints25Hz = new Float32Array(numLatentFrames * LATENT_CHANNELS);
  if (rawLen >= numLatentFrames) {
    // Too many frames: truncate to the target length.
    lmHints25Hz.set(lmHints25HzRaw.slice(0, numLatentFrames * LATENT_CHANNELS));
  } else {
    lmHints25Hz.set(lmHints25HzRaw);
    // Repeat last frame to fill remaining
    const lastFrameStart = (rawLen - 1) * LATENT_CHANNELS;
    const lastFrame = lmHints25HzRaw.slice(lastFrameStart, lastFrameStart + LATENT_CHANNELS);
    for (let t = rawLen; t < numLatentFrames; t++) {
      lmHints25Hz.set(lastFrame, t * LATENT_CHANNELS);
    }
    console.log(`[hints] padded ${rawLen} → ${numLatentFrames} frames with last-frame replication`);
  }
  tensorStats("lm_hints_25hz_final", lmHints25Hz);
  return lmHints25Hz;
}
|
| 372 |
+
|
| 373 |
+
/**
 * End-to-end generation pipeline: text/lyric encoding -> LM hints ->
 * condition encoder -> flow-matching denoising with the DiT -> VAE decode ->
 * mastering -> WAV, posted to the main thread as an "audio" message.
 *
 * NOTE(review): masterWaveform and float32ToWav are not visible in this chunk
 * — presumably defined later in this file; confirm their contracts there.
 *
 * @param {object} opts
 * @param {string} opts.caption  - text prompt / style description
 * @param {string} opts.lyrics   - lyric text
 * @param {number} opts.duration - target duration in seconds
 * @param {number} opts.shift    - flow-matching shift factor for the schedule
 * @param {number} [opts.numSteps=8] - denoising step count
 */
async function generateAudio({ caption, lyrics, duration, shift, numSteps = 8 }) {
  const totalStartTime = performance.now();
  const filenameStamp = Date.now();
  const batchSize = 1;
  const numLatentFrames = Math.round(duration * LATENT_RATE);
  const tSchedule = buildSchedule(numSteps, shift);
  const metas = `duration: ${duration}s`;

  // 1. Text → Qwen3 embedding (1024-dim hidden states, BEFORE projection)
  post("status", { message: "Encoding text..." });
  const sftPrompt = buildSFTPrompt(caption, metas);
  const textEnc = textTokenizer(sftPrompt, { padding: "max_length", max_length: 256, truncation: true });
  const textIdsRaw = textEnc.input_ids.data;
  // ORT int64 input requires BigInt64Array; convert if needed.
  const textIds = textIdsRaw instanceof BigInt64Array ? textIdsRaw : new BigInt64Array(Array.from(textIdsRaw, BigInt));
  const textHiddenRes = await sessions.textEncoder.run({ input_ids: tensor(textIds, [1, 256], "int64") });
  const textHidden = textHiddenRes.hidden_states;
  const textMaskRaw = textEnc.attention_mask.data;
  const textMask = new Float32Array(textMaskRaw.length);
  for (let i = 0; i < textMaskRaw.length; i++) textMask[i] = Number(textMaskRaw[i]);

  // 2. Lyric tokens → embed_tokens (1024-dim, passed into condition_encoder's lyric_encoder)
  post("status", { message: "Embedding lyrics..." });
  const lyricFullText = `# Languages\nen\n\n# Lyric\n${lyrics}`;
  const lyricEnc = textTokenizer(lyricFullText, { padding: "max_length", max_length: 2048, truncation: true });
  const lyricIdsRaw = lyricEnc.input_ids.data;
  const lyricIds = lyricIdsRaw instanceof BigInt64Array ? lyricIdsRaw : new BigInt64Array(Array.from(lyricIdsRaw, BigInt));
  const lyricEmbRes = await sessions.embedTokens.run({ input_ids: tensor(lyricIds, [1, 2048], "int64") });
  const lyricEmb = lyricEmbRes.hidden_states;
  const lyricMaskRaw = lyricEnc.attention_mask.data;
  const lyricMask = new Float32Array(lyricMaskRaw.length);
  for (let i = 0; i < lyricMaskRaw.length; i++) lyricMask[i] = Number(lyricMaskRaw[i]);

  // 3. LM hints (mandatory for turbo model)
  const lmHints25Hz = await generateLMHints(caption, lyrics, numLatentFrames, duration);

  // 4. Silence for ref audio (timbre) and src_latents
  const silenceRef = silenceLatent.slice(0, 750 * LATENT_CHANNELS);
  const srcLatents = new Float32Array(numLatentFrames * LATENT_CHANNELS);
  const chunkMasks = new Float32Array(numLatentFrames * LATENT_CHANNELS).fill(1.0);
  const isCovers = new Float32Array([1.0]); // force use of LM hints

  // 5. condition_encoder: does text_projector + lyric_encoder + timbre_encoder + pack_sequences + context_latents
  post("status", { message: "Running condition encoder..." });
  const condResult = await sessions.conditionEncoder.run({
    text_hidden_states: textHidden,
    text_attention_mask: tensor(textMask, [1, 256]),
    lyric_hidden_states: lyricEmb,
    lyric_attention_mask: tensor(lyricMask, [1, 2048]),
    refer_audio_acoustic_hidden_states_packed: tensor(silenceRef, [1, 750, LATENT_CHANNELS]),
    refer_audio_order_mask: tensor(new BigInt64Array([0n]), [1], "int64"),
    src_latents: tensor(srcLatents, [1, numLatentFrames, LATENT_CHANNELS]),
    chunk_masks: tensor(chunkMasks, [1, numLatentFrames, LATENT_CHANNELS]),
    is_covers: tensor(isCovers, [1]),
    precomputed_lm_hints_25hz: tensor(lmHints25Hz, [1, numLatentFrames, LATENT_CHANNELS]),
  });
  const encoderHiddenStates = condResult.encoder_hidden_states;
  const contextLatentsTensor = condResult.context_latents;
  tensorStats("encoder_hidden_states", encoderHiddenStates.data);
  tensorStats("context_latents", contextLatentsTensor.data);

  // 6. Flow-matching Euler denoising: x_{t+1} = x_t - v_t * dt, with the last
  // step integrating all the way to t=0 (dt = tCurr).
  post("status", { message: "Starting denoising..." });
  let xt = randn([batchSize, numLatentFrames, LATENT_CHANNELS]);
  const startTime = performance.now();

  for (let step = 0; step < tSchedule.length; step++) {
    const tCurr = tSchedule[step];
    post("status", { message: `Denoising step ${step + 1}/${tSchedule.length}...` });

    const timestepData = new Float32Array(batchSize).fill(tCurr);
    const result = await sessions.ditDecoder.run({
      hidden_states: tensor(xt, [batchSize, numLatentFrames, LATENT_CHANNELS]),
      timestep: tensor(timestepData, [batchSize]),
      encoder_hidden_states: encoderHiddenStates,
      context_latents: contextLatentsTensor,
    });

    const vt = result.velocity.data;
    if (step === tSchedule.length - 1) {
      // Final step: integrate the remaining tCurr all the way to zero.
      for (let i = 0; i < xt.length; i++) xt[i] = xt[i] - vt[i] * tCurr;
    } else {
      const dt = tCurr - tSchedule[step + 1];
      for (let i = 0; i < xt.length; i++) xt[i] = xt[i] - vt[i] * dt;
    }
  }

  const diffusionTime = ((performance.now() - startTime) / 1000).toFixed(2);
  tensorStats("final_latent", xt);

  // Per-frame variance check — detects if later frames are constant
  const perFrameVariance = new Float32Array(numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    let mean = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) mean += xt[t * LATENT_CHANNELS + c];
    mean /= LATENT_CHANNELS;
    let varSum = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      const d = xt[t * LATENT_CHANNELS + c] - mean;
      varSum += d * d;
    }
    perFrameVariance[t] = varSum / LATENT_CHANNELS;
  }
  console.log("[perframe] variance samples:", Array.from(perFrameVariance.filter((_, i) => i % 25 === 0)).map(v => v.toFixed(3)));

  // Also check LM hints per-frame variance
  const hintsVar = new Float32Array(numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    let mean = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) mean += lmHints25Hz[t * LATENT_CHANNELS + c];
    mean /= LATENT_CHANNELS;
    let varSum = 0;
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      const d = lmHints25Hz[t * LATENT_CHANNELS + c] - mean;
      varSum += d * d;
    }
    hintsVar[t] = varSum / LATENT_CHANNELS;
  }
  console.log("[hints var] samples:", Array.from(hintsVar.filter((_, i) => i % 25 === 0)).map(v => v.toFixed(3)));

  // 7. VAE decode: transpose [T, C] -> [C, T] to match the decoder's
  // channels-first input layout.
  post("status", { message: "Decoding audio..." });
  const latentsForVae = new Float32Array(batchSize * LATENT_CHANNELS * numLatentFrames);
  for (let t = 0; t < numLatentFrames; t++) {
    for (let c = 0; c < LATENT_CHANNELS; c++) {
      latentsForVae[c * numLatentFrames + t] = xt[t * LATENT_CHANNELS + c];
    }
  }

  const vaeResult = await sessions.vaeDecoder.run({
    latents: tensor(latentsForVae, [batchSize, LATENT_CHANNELS, numLatentFrames]),
  });

  const waveform = vaeResult.waveform.data;
  tensorStats("waveform", waveform);

  // 8. Mastering (in-place) and WAV packaging, then ship to the main thread.
  masterWaveform(waveform, SAMPLE_RATE, 2);

  const wavBuffer = float32ToWav(waveform, SAMPLE_RATE, 2);
  // totalTime measures the whole pipeline (LM + encoders + diffusion + VAE),
  // not just the diffusion loop. diffusionTime is reported separately below.
  const totalTime = ((performance.now() - totalStartTime) / 1000).toFixed(2);

  post("audio", { wavBuffer, duration, diffusionTime, totalTime, filenameStamp }, [wavBuffer]);
}
|
| 515 |
+
|
| 516 |
+
// Compute the peak absolute amplitude and RMS level of a sample buffer.
// An empty buffer yields { peak: 0, rms: 0 } (the divisor is clamped to 1).
function measureAudio(samples) {
  let peak = 0;
  let sumOfSquares = 0;
  for (const sample of samples) {
    const magnitude = Math.abs(sample);
    if (magnitude > peak) peak = magnitude;
    sumOfSquares += sample * sample;
  }
  const divisor = Math.max(1, samples.length);
  return { peak, rms: Math.sqrt(sumOfSquares / divisor) };
}
|
| 527 |
+
|
| 528 |
+
// Goertzel algorithm: power of a single frequency bin of `data` without
// computing a full FFT. Returns |X(freq)|^2 (unnormalized).
function goertzelPower(data, sampleRate, freq) {
  const coefficient = 2 * Math.cos((2 * Math.PI * freq) / sampleRate);
  let prev = 0;  // s[n-1]
  let prev2 = 0; // s[n-2]
  for (const x of data) {
    const current = x + coefficient * prev - prev2;
    prev2 = prev;
    prev = current;
  }
  return prev * prev + prev2 * prev2 - coefficient * prev * prev2;
}
|
| 539 |
+
|
| 540 |
+
/**
 * Detect up to two dominant narrow-band "drone" tones between 250 and 950 Hz.
 * The planar multi-channel buffer (samples[ch * numSamples + i]) is averaged
 * to mono, decimated to ~4 kHz (plain sample-dropping — no anti-alias filter
 * is applied before decimation), mean-removed, then scanned with Goertzel at
 * 12.5 Hz spacing. Bins scoring >= 12x the median bin power count as peaks;
 * peaks closer than 50 Hz to an already-accepted peak are skipped.
 *
 * @param {Float32Array} samples    - planar interleaving: channel-major layout
 * @param {number} sampleRate       - input sample rate (Hz)
 * @param {number} channels         - channel count
 * @returns {{freq: number, score: number}[]} 0-2 detected peaks, strongest first
 */
function detectDronePeaks(samples, sampleRate, channels) {
  const numSamples = samples.length / channels;
  // Decimation step targeting a ~4 kHz analysis rate.
  const step = Math.max(1, Math.floor(sampleRate / 4000));
  const downsampleRate = sampleRate / step;
  const downsampledLength = Math.floor(numSamples / step);
  // Too little audio for a meaningful spectral estimate: report no peaks.
  if (downsampledLength < 1024) return [];

  // Downmix to mono while accumulating the mean for DC removal.
  const mono = new Float32Array(downsampledLength);
  let mean = 0;
  for (let i = 0; i < downsampledLength; i++) {
    const src = i * step;
    let v = 0;
    for (let ch = 0; ch < channels; ch++) v += samples[ch * numSamples + src];
    v /= channels;
    mono[i] = v;
    mean += v;
  }
  mean /= downsampledLength;
  // Remove DC so the 0 Hz component doesn't leak into low bins.
  for (let i = 0; i < mono.length; i++) mono[i] -= mean;

  // Scan 250-950 Hz at 12.5 Hz resolution.
  const bins = [];
  for (let freq = 250; freq <= 950; freq += 12.5) {
    bins.push({ freq, power: goertzelPower(mono, downsampleRate, freq) });
  }
  // Median bin power as the noise floor estimate (epsilon avoids div-by-zero).
  const sortedPowers = bins.map((bin) => bin.power).sort((a, b) => a - b);
  const median = sortedPowers[Math.floor(sortedPowers.length / 2)] + 1e-12;
  bins.sort((a, b) => b.power - a.power);

  // Accept up to two peaks, strongest first, with 50 Hz minimum separation.
  const peaks = [];
  for (const bin of bins) {
    const score = bin.power / median;
    // Bins are sorted by power, so the first sub-threshold score ends the scan.
    if (score < 12) break;
    if (peaks.every((peak) => Math.abs(peak.freq - bin.freq) >= 50)) {
      peaks.push({ freq: bin.freq, score });
      if (peaks.length >= 2) break;
    }
  }
  return peaks;
}
|
| 579 |
+
|
| 580 |
+
/**
 * Apply a partial notch filter in place at `freq` to every channel.
 * Biquad coefficients follow the standard audio-EQ notch form; the
 * filtered signal is blended with the dry signal so the cut is gentle
 * rather than a full rejection.
 * @param {Float32Array} samples - Planar audio, mutated in place.
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} channels - Channel count (planar layout).
 * @param {number} freq - Center frequency of the notch in Hz.
 * @param {number} [q=20] - Quality factor (narrowness of the notch).
 * @param {number} [depth=0.45] - Wet/dry mix: 0 = bypass, 1 = full notch.
 */
function applyNotch(samples, sampleRate, channels, freq, q = 20, depth = 0.45) {
  const frameCount = samples.length / channels;
  const w0 = (2 * Math.PI * freq) / sampleRate;
  const cosW0 = Math.cos(w0);
  const alpha = Math.sin(w0) / (2 * q);
  // Normalize all coefficients by a0 = 1 + alpha.
  const norm = 1 + alpha;
  const b0 = 1 / norm;
  const b1 = (-2 * cosW0) / norm;
  const b2 = 1 / norm;
  const a1 = (-2 * cosW0) / norm;
  const a2 = (1 - alpha) / norm;

  for (let ch = 0; ch < channels; ch++) {
    const base = ch * frameCount;
    // Direct-form-I state, reset per channel.
    let xPrev = 0, xPrev2 = 0, yPrev = 0, yPrev2 = 0;
    for (let i = 0; i < frameCount; i++) {
      const x = samples[base + i];
      const y = b0 * x + b1 * xPrev + b2 * xPrev2 - a1 * yPrev - a2 * yPrev2;
      // Partial cut: crossfade between dry input and notched output.
      samples[base + i] = x * (1 - depth) + y * depth;
      xPrev2 = xPrev; xPrev = x;
      yPrev2 = yPrev; yPrev = y;
    }
  }
}
|
| 604 |
+
|
| 605 |
+
/**
 * In-place "mastering" pass: notch out detected drone tones, then apply
 * a single makeup gain to reach a target loudness without clipping.
 * Silent buffers (peak <= 0.001) are left untouched.
 * @param {Float32Array} samples - Planar audio, mutated in place.
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} channels - Channel count (planar layout).
 */
function masterWaveform(samples, sampleRate, channels) {
  const raw = measureAudio(samples);
  if (raw.peak <= 0.001) return;

  const drones = detectDronePeaks(samples, sampleRate, channels);
  for (const drone of drones) applyNotch(samples, sampleRate, channels, drone.freq);

  const post = measureAudio(samples);
  const targetRms = 0.085; // desired loudness after gain
  const maxPeak = 0.891;   // ceiling (~ -1 dBFS of headroom)
  const maxGain = 12.0;    // never amplify beyond this
  // Gain is the most conservative of: loudness target, peak ceiling,
  // and the hard amplification cap. Epsilons guard division by zero.
  const gain = Math.min(
    maxGain,
    targetRms / Math.max(post.rms, 1e-6),
    maxPeak / Math.max(post.peak, 1e-6),
  );
  for (let i = 0; i < samples.length; i++) samples[i] *= gain;

  const final = measureAudio(samples);
  const peakText = drones.length
    ? drones.map((peak) => `${peak.freq.toFixed(1)}Hz/${peak.score.toFixed(0)}x`).join(", ")
    : "none";
  console.log(
    `[master] rawPeak=${raw.peak.toFixed(4)} rawRms=${raw.rms.toFixed(4)} ` +
    `dronePeaks=${peakText} gain=${gain.toFixed(2)}x peak=${final.peak.toFixed(4)} rms=${final.rms.toFixed(4)}`,
  );
}
|
| 630 |
+
|
| 631 |
+
/**
 * Encode planar float samples ([ch0…, ch1…]) as an interleaved 16-bit
 * PCM WAV file.
 * @param {Float32Array} samples - Planar audio; values clamped to [-1, 1].
 * @param {number} sampleRate - Sample rate in Hz.
 * @param {number} [channels=2] - Channel count.
 * @returns {ArrayBuffer} Complete WAV file (44-byte header + PCM data).
 */
function float32ToWav(samples, sampleRate, channels = 2) {
  const frameCount = samples.length / channels;
  const bytesPerSample = 2; // 16-bit PCM
  const blockAlign = channels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = frameCount * blockAlign;

  const buffer = new ArrayBuffer(44 + dataSize);
  const view = new DataView(buffer);
  const writeAscii = (pos, text) => {
    for (let i = 0; i < text.length; i++) view.setUint8(pos + i, text.charCodeAt(i));
  };

  // RIFF container header.
  writeAscii(0, "RIFF");
  view.setUint32(4, 36 + dataSize, true);
  writeAscii(8, "WAVE");
  // "fmt " chunk describing uncompressed PCM.
  writeAscii(12, "fmt ");
  view.setUint32(16, 16, true);  // fmt chunk size
  view.setUint16(20, 1, true);   // audio format: PCM
  view.setUint16(22, channels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, 16, true);  // bits per sample
  // "data" chunk: interleave channels frame by frame.
  writeAscii(36, "data");
  view.setUint32(40, dataSize, true);
  let cursor = 44;
  for (let frame = 0; frame < frameCount; frame++) {
    for (let ch = 0; ch < channels; ch++) {
      const clamped = Math.max(-1, Math.min(1, samples[ch * frameCount + frame]));
      view.setInt16(cursor, clamped * 32767, true);
      cursor += 2;
    }
  }
  return buffer;
}
|
| 656 |
+
|
| 657 |
+
// Worker entry point: the main thread drives us with typed messages
// ("load" to initialize models, "generate" to synthesize audio). Any
// failure is forwarded back as an "error" message rather than thrown.
self.onmessage = async (event) => {
  const { type, ...payload } = event.data;
  try {
    switch (type) {
      case "load":
        await loadModels();
        break;
      case "generate":
        await generateAudio(payload);
        break;
      default:
        // Unknown message types are silently ignored, as before.
        break;
    }
  } catch (err) {
    post("error", { message: err.message, stack: err.stack });
  }
};
|
_source/vite.config.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Vite build configuration for the app (React + Tailwind).
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
import tailwindcss from '@tailwindcss/vite'

export default defineConfig({
  plugins: [react(), tailwindcss()],
  optimizeDeps: {
    // Keep onnxruntime-web out of Vite's dependency pre-bundling;
    // presumably its WASM asset URLs break when pre-bundled — confirm
    // before removing this exclusion.
    exclude: ['onnxruntime-web'],
  },
  worker: {
    // Emit web workers as ES modules so they can use `import`.
    format: 'es',
  },
})
|
assets/index-C7vMACvi.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/index-CccuoAYh.css
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*! tailwindcss v4.2.2 | MIT License | https://tailwindcss.com */
|
| 2 |
+
@layer properties{@supports (((-webkit-hyphens:none)) and (not (margin-trim:inline))) or ((-moz-orient:inline) and (not (color:rgb(from red r g b)))){*,:before,:after,::backdrop{--tw-rotate-x:initial;--tw-rotate-y:initial;--tw-rotate-z:initial;--tw-skew-x:initial;--tw-skew-y:initial;--tw-space-y-reverse:0;--tw-border-style:solid;--tw-leading:initial;--tw-font-weight:initial;--tw-tracking:initial;--tw-blur:initial;--tw-brightness:initial;--tw-contrast:initial;--tw-grayscale:initial;--tw-hue-rotate:initial;--tw-invert:initial;--tw-opacity:initial;--tw-saturate:initial;--tw-sepia:initial;--tw-drop-shadow:initial;--tw-drop-shadow-color:initial;--tw-drop-shadow-alpha:100%;--tw-drop-shadow-size:initial;--tw-duration:initial;--tw-scale-x:1;--tw-scale-y:1;--tw-scale-z:1}}}@layer theme{:root,:host{--font-sans:ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";--font-mono:ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;--spacing:.25rem;--container-sm:24rem;--container-md:28rem;--container-2xl:42rem;--text-xs:.75rem;--text-xs--line-height:calc(1 / .75);--text-sm:.875rem;--text-sm--line-height:calc(1.25 / .875);--text-base:1rem;--text-base--line-height:calc(1.5 / 1);--text-lg:1.125rem;--text-lg--line-height:calc(1.75 / 1.125);--text-xl:1.25rem;--text-xl--line-height:calc(1.75 / 1.25);--text-2xl:1.5rem;--text-2xl--line-height:calc(2 / 1.5);--text-4xl:2.25rem;--text-4xl--line-height:calc(2.5 / 2.25);--text-5xl:3rem;--text-5xl--line-height:1;--font-weight-medium:500;--font-weight-semibold:600;--tracking-wider:.05em;--tracking-widest:.1em;--leading-relaxed:1.625;--radius-md:.375rem;--radius-lg:.5rem;--radius-xl:.75rem;--radius-2xl:1rem;--default-transition-duration:.15s;--default-transition-timing-function:cubic-bezier(.4, 0, .2, 1);--default-font-family:var(--font-sans);--default-mono-font-family:var(--font-mono)}}@layer 
base{*,:after,:before,::backdrop{box-sizing:border-box;border:0 solid;margin:0;padding:0}::file-selector-button{box-sizing:border-box;border:0 solid;margin:0;padding:0}html,:host{-webkit-text-size-adjust:100%;tab-size:4;line-height:1.5;font-family:var(--default-font-family,ui-sans-serif, system-ui, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji");font-feature-settings:var(--default-font-feature-settings,normal);font-variation-settings:var(--default-font-variation-settings,normal);-webkit-tap-highlight-color:transparent}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;-webkit-text-decoration:inherit;-webkit-text-decoration:inherit;-webkit-text-decoration:inherit;-webkit-text-decoration:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:var(--default-mono-font-family,ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", 
monospace);font-feature-settings:var(--default-mono-font-feature-settings,normal);font-variation-settings:var(--default-mono-font-variation-settings,normal);font-size:1em}small{font-size:80%}sub,sup{vertical-align:baseline;font-size:75%;line-height:0;position:relative}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}:-moz-focusring{outline:auto}progress{vertical-align:baseline}summary{display:list-item}ol,ul,menu{list-style:none}img,svg,video,canvas,audio,iframe,embed,object{vertical-align:middle;display:block}img,video{max-width:100%;height:auto}button,input,select,optgroup,textarea{font:inherit;font-feature-settings:inherit;font-variation-settings:inherit;letter-spacing:inherit;color:inherit;opacity:1;background-color:#0000;border-radius:0}::file-selector-button{font:inherit;font-feature-settings:inherit;font-variation-settings:inherit;letter-spacing:inherit;color:inherit;opacity:1;background-color:#0000;border-radius:0}:where(select:is([multiple],[size])) optgroup{font-weight:bolder}:where(select:is([multiple],[size])) optgroup option{padding-inline-start:20px}::file-selector-button{margin-inline-end:4px}::placeholder{opacity:1}@supports (not ((-webkit-appearance:-apple-pay-button))) or (contain-intrinsic-size:1px){::placeholder{color:currentColor}@supports (color:color-mix(in lab, red, red)){::placeholder{color:color-mix(in oklab, currentcolor 50%, 
transparent)}}}textarea{resize:vertical}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-date-and-time-value{min-height:1lh;text-align:inherit}::-webkit-datetime-edit{display:inline-flex}::-webkit-datetime-edit-fields-wrapper{padding:0}::-webkit-datetime-edit{padding-block:0}::-webkit-datetime-edit-year-field{padding-block:0}::-webkit-datetime-edit-month-field{padding-block:0}::-webkit-datetime-edit-day-field{padding-block:0}::-webkit-datetime-edit-hour-field{padding-block:0}::-webkit-datetime-edit-minute-field{padding-block:0}::-webkit-datetime-edit-second-field{padding-block:0}::-webkit-datetime-edit-millisecond-field{padding-block:0}::-webkit-datetime-edit-meridiem-field{padding-block:0}::-webkit-calendar-picker-indicator{line-height:1}:-moz-ui-invalid{box-shadow:none}button,input:where([type=button],[type=reset],[type=submit]){appearance:button}::file-selector-button{appearance:button}::-webkit-inner-spin-button{height:auto}::-webkit-outer-spin-button{height:auto}[hidden]:where(:not([hidden=until-found])){display:none!important}}@layer components;@layer utilities{.fixed{position:fixed}.relative{position:relative}.static{position:static}.inset-0{inset:calc(var(--spacing) * 0)}.start{inset-inline-start:var(--spacing)}.end{inset-inline-end:var(--spacing)}.z-50{z-index:50}.mx-2{margin-inline:calc(var(--spacing) * 2)}.mt-0\.5{margin-top:calc(var(--spacing) * .5)}.mt-3{margin-top:calc(var(--spacing) * 3)}.mt-4{margin-top:calc(var(--spacing) * 4)}.mt-12{margin-top:calc(var(--spacing) * 12)}.mb-1{margin-bottom:calc(var(--spacing) * 1)}.mb-1\.5{margin-bottom:calc(var(--spacing) * 1.5)}.mb-2{margin-bottom:calc(var(--spacing) * 2)}.mb-3{margin-bottom:calc(var(--spacing) * 3)}.mb-4{margin-bottom:calc(var(--spacing) * 4)}.mb-5{margin-bottom:calc(var(--spacing) * 5)}.mb-6{margin-bottom:calc(var(--spacing) * 6)}.mb-10{margin-bottom:calc(var(--spacing) * 
10)}.block{display:block}.contents{display:contents}.flex{display:flex}.hidden{display:none}.table{display:table}.h-1{height:calc(var(--spacing) * 1)}.h-10{height:calc(var(--spacing) * 10)}.h-14{height:calc(var(--spacing) * 14)}.h-full{height:100%}.min-h-screen{min-height:100vh}.w-8{width:calc(var(--spacing) * 8)}.w-10{width:calc(var(--spacing) * 10)}.w-24{width:calc(var(--spacing) * 24)}.w-full{width:100%}.max-w-2xl{max-width:var(--container-2xl)}.max-w-md{max-width:var(--container-md)}.max-w-sm{max-width:var(--container-sm)}.min-w-0{min-width:calc(var(--spacing) * 0)}.flex-1{flex:1}.flex-shrink-0{flex-shrink:0}.transform{transform:var(--tw-rotate-x,) var(--tw-rotate-y,) var(--tw-rotate-z,) var(--tw-skew-x,) var(--tw-skew-y,)}.cursor-pointer{cursor:pointer}.resize-none{resize:none}.list-inside{list-style-position:inside}.list-decimal{list-style-type:decimal}.list-disc{list-style-type:disc}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.items-baseline{align-items:baseline}.items-center{align-items:center}.items-end{align-items:flex-end}.justify-between{justify-content:space-between}.justify-center{justify-content:center}.gap-2{gap:calc(var(--spacing) * 2)}.gap-3{gap:calc(var(--spacing) * 3)}.gap-\[2px\]{gap:2px}:where(.space-y-1>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 1) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 1) * calc(1 - var(--tw-space-y-reverse)))}:where(.space-y-2>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 2) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 2) * calc(1 - var(--tw-space-y-reverse)))}:where(.space-y-3>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 3) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 3) * calc(1 - 
var(--tw-space-y-reverse)))}:where(.space-y-4>:not(:last-child)){--tw-space-y-reverse:0;margin-block-start:calc(calc(var(--spacing) * 4) * var(--tw-space-y-reverse));margin-block-end:calc(calc(var(--spacing) * 4) * calc(1 - var(--tw-space-y-reverse)))}.truncate{text-overflow:ellipsis;white-space:nowrap;overflow:hidden}.overflow-hidden{overflow:hidden}.rounded{border-radius:.25rem}.rounded-2xl{border-radius:var(--radius-2xl)}.rounded-\[2px\]{border-radius:2px}.rounded-full{border-radius:3.40282e38px}.rounded-lg{border-radius:var(--radius-lg)}.rounded-md{border-radius:var(--radius-md)}.rounded-xl{border-radius:var(--radius-xl)}.border{border-style:var(--tw-border-style);border-width:1px}.bg-transparent{background-color:#0000}.p-3{padding:calc(var(--spacing) * 3)}.p-4{padding:calc(var(--spacing) * 4)}.p-5{padding:calc(var(--spacing) * 5)}.p-8{padding:calc(var(--spacing) * 8)}.px-3{padding-inline:calc(var(--spacing) * 3)}.px-4{padding-inline:calc(var(--spacing) * 4)}.px-6{padding-inline:calc(var(--spacing) * 6)}.px-8{padding-inline:calc(var(--spacing) * 8)}.py-1\.5{padding-block:calc(var(--spacing) * 1.5)}.py-2\.5{padding-block:calc(var(--spacing) * 2.5)}.py-3{padding-block:calc(var(--spacing) * 3)}.py-3\.5{padding-block:calc(var(--spacing) * 3.5)}.py-10{padding-block:calc(var(--spacing) * 10)}.pt-2{padding-top:calc(var(--spacing) * 
2)}.text-center{text-align:center}.text-left{text-align:left}.text-right{text-align:right}.font-mono{font-family:var(--font-mono)}.text-2xl{font-size:var(--text-2xl);line-height:var(--tw-leading,var(--text-2xl--line-height))}.text-4xl{font-size:var(--text-4xl);line-height:var(--tw-leading,var(--text-4xl--line-height))}.text-5xl{font-size:var(--text-5xl);line-height:var(--tw-leading,var(--text-5xl--line-height))}.text-base{font-size:var(--text-base);line-height:var(--tw-leading,var(--text-base--line-height))}.text-lg{font-size:var(--text-lg);line-height:var(--tw-leading,var(--text-lg--line-height))}.text-sm{font-size:var(--text-sm);line-height:var(--tw-leading,var(--text-sm--line-height))}.text-xl{font-size:var(--text-xl);line-height:var(--tw-leading,var(--text-xl--line-height))}.text-xs{font-size:var(--text-xs);line-height:var(--tw-leading,var(--text-xs--line-height))}.text-\[10px\]{font-size:10px}.text-\[11px\]{font-size:11px}.text-\[13px\]{font-size:13px}.leading-none{--tw-leading:1;line-height:1}.leading-relaxed{--tw-leading:var(--leading-relaxed);line-height:var(--leading-relaxed)}.font-medium{--tw-font-weight:var(--font-weight-medium);font-weight:var(--font-weight-medium)}.font-semibold{--tw-font-weight:var(--font-weight-semibold);font-weight:var(--font-weight-semibold)}.tracking-wider{--tw-tracking:var(--tracking-wider);letter-spacing:var(--tracking-wider)}.tracking-widest{--tw-tracking:var(--tracking-widest);letter-spacing:var(--tracking-widest)}.uppercase{text-transform:uppercase}.underline{text-decoration-line:underline}.filter{filter:var(--tw-blur,) var(--tw-brightness,) var(--tw-contrast,) var(--tw-grayscale,) var(--tw-hue-rotate,) var(--tw-invert,) var(--tw-saturate,) var(--tw-sepia,) 
var(--tw-drop-shadow,)}.transition{transition-property:color,background-color,border-color,outline-color,text-decoration-color,fill,stroke,--tw-gradient-from,--tw-gradient-via,--tw-gradient-to,opacity,box-shadow,transform,translate,scale,rotate,filter,-webkit-backdrop-filter,backdrop-filter,display,content-visibility,overlay,pointer-events;transition-timing-function:var(--tw-ease,var(--default-transition-timing-function));transition-duration:var(--tw-duration,var(--default-transition-duration))}.transition-all{transition-property:all;transition-timing-function:var(--tw-ease,var(--default-transition-timing-function));transition-duration:var(--tw-duration,var(--default-transition-duration))}.transition-colors{transition-property:color,background-color,border-color,outline-color,text-decoration-color,fill,stroke,--tw-gradient-from,--tw-gradient-via,--tw-gradient-to;transition-timing-function:var(--tw-ease,var(--default-transition-timing-function));transition-duration:var(--tw-duration,var(--default-transition-duration))}.duration-300{--tw-duration:.3s;transition-duration:.3s}.outline-none{--tw-outline-style:none;outline-style:none}.select-none{-webkit-user-select:none;user-select:none}@media (hover:hover){.hover\:scale-105:hover{--tw-scale-x:105%;--tw-scale-y:105%;--tw-scale-z:105%;scale:var(--tw-scale-x) var(--tw-scale-y)}.hover\:scale-\[1\.01\]:hover{scale:1.01}.hover\:scale-\[1\.02\]:hover{scale:1.02}.hover\:opacity-80:hover{opacity:.8}}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}}:root{--bg:oklch(13% .006 260);--bg-elev:oklch(17% .008 260);--surface:oklch(22% .01 260);--border:oklch(28% .01 260);--text:oklch(95% .005 260);--text-muted:oklch(65% .01 260);--text-dim:oklch(45% .008 260);--accent:oklch(72% .17 305);--accent-glow:oklch(80% .18 305);--accent-soft:oklch(72% .17 305/.15);--success:oklch(72% .14 155);--danger:oklch(65% .2 
22)}html,body,#root{min-height:100vh;margin:0}body{background:var(--bg);color:var(--text);letter-spacing:-.005em;-webkit-font-smoothing:antialiased;font-family:Hanken Grotesk,system-ui,-apple-system,sans-serif;font-weight:400}code,pre,.font-mono{font-family:JetBrains Mono,ui-monospace,Consolas,monospace}@keyframes wave-pulse{0%,to{opacity:.2;transform:scaleY(.3)}50%{opacity:.6;transform:scaleY(1)}}@keyframes fade-in{0%{opacity:0;transform:translateY(4px)}to{opacity:1;transform:translateY(0)}}@keyframes soft-glow{0%,to{box-shadow:0 0 20px oklch(72% .17 305/.2)}50%{box-shadow:0 0 40px oklch(72% .17 305/.5)}}.pulse-bar{transform-origin:bottom;animation:1.2s ease-in-out infinite wave-pulse}.fade-in{animation:.3s ease-out fade-in}.glow{animation:2s ease-in-out infinite soft-glow}input[type=range]{appearance:none;cursor:pointer;background:0 0}input[type=range]::-webkit-slider-runnable-track{background:var(--border);border-radius:4px;height:4px}input[type=range]::-webkit-slider-thumb{appearance:none;background:var(--accent);border-radius:50%;width:16px;height:16px;margin-top:-6px;box-shadow:0 0 12px oklch(72% .17 305/.4)}input[type=range]:focus{outline:none}textarea::-webkit-scrollbar{width:6px}textarea::-webkit-scrollbar-track{background:0 0}textarea::-webkit-scrollbar-thumb{background:var(--border);border-radius:3px}textarea::-webkit-scrollbar-thumb:hover{background:var(--text-dim)}@property --tw-rotate-x{syntax:"*";inherits:false}@property --tw-rotate-y{syntax:"*";inherits:false}@property --tw-rotate-z{syntax:"*";inherits:false}@property --tw-skew-x{syntax:"*";inherits:false}@property --tw-skew-y{syntax:"*";inherits:false}@property --tw-space-y-reverse{syntax:"*";inherits:false;initial-value:0}@property --tw-border-style{syntax:"*";inherits:false;initial-value:solid}@property --tw-leading{syntax:"*";inherits:false}@property --tw-font-weight{syntax:"*";inherits:false}@property --tw-tracking{syntax:"*";inherits:false}@property 
--tw-blur{syntax:"*";inherits:false}@property --tw-brightness{syntax:"*";inherits:false}@property --tw-contrast{syntax:"*";inherits:false}@property --tw-grayscale{syntax:"*";inherits:false}@property --tw-hue-rotate{syntax:"*";inherits:false}@property --tw-invert{syntax:"*";inherits:false}@property --tw-opacity{syntax:"*";inherits:false}@property --tw-saturate{syntax:"*";inherits:false}@property --tw-sepia{syntax:"*";inherits:false}@property --tw-drop-shadow{syntax:"*";inherits:false}@property --tw-drop-shadow-color{syntax:"*";inherits:false}@property --tw-drop-shadow-alpha{syntax:"<percentage>";inherits:false;initial-value:100%}@property --tw-drop-shadow-size{syntax:"*";inherits:false}@property --tw-duration{syntax:"*";inherits:false}@property --tw-scale-x{syntax:"*";inherits:false;initial-value:1}@property --tw-scale-y{syntax:"*";inherits:false;initial-value:1}@property --tw-scale-z{syntax:"*";inherits:false;initial-value:1}
|
assets/lm-worker-CMbQRLr6.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/ort-wasm-simd-threaded.asyncify-9GUf3Unn.wasm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f33595b9f7ea51aa6f646dd5a2bde6fbb1c7bcde0b9d2b5f240011a09c1830d0
|
| 3 |
+
size 27190919
|
assets/ort-wasm-simd-threaded.asyncify-CtKKja6V.wasm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d6ee4ff60d7f0e6b6efa34469157d3bdcbe2f3b0dbcea2a645bb41361a85973
|
| 3 |
+
size 23543806
|
assets/worker-retwKpvq.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
favicon.svg
ADDED
|
|
icons.svg
ADDED
|
|
index.html
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
</body>
|
| 19 |
</html>
|
|
|
|
| 1 |
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<link rel="icon" type="image/svg+xml" href="/favicon.svg" />
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
+
<meta name="description" content="ACE-Step 1.5 text-to-music generation running entirely in your browser via WebGPU" />
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com" />
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Hanken+Grotesk:wght@300;400;500;600;700&family=JetBrains+Mono:wght@400;500&family=Dancing+Script:wght@500;600;700&display=swap" rel="stylesheet" />
|
| 11 |
+
<title>ACE-Step WebGPU — Text to Music</title>
|
| 12 |
+
<script type="module" crossorigin src="/assets/index-C7vMACvi.js"></script>
|
| 13 |
+
<link rel="stylesheet" crossorigin href="/assets/index-CccuoAYh.css">
|
| 14 |
+
</head>
|
| 15 |
+
<body>
|
| 16 |
+
<div id="root"></div>
|
| 17 |
+
</body>
|
|
|
|
| 18 |
</html>
|
silence_latent.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7ee13d8902f0c02def49249f05a3e5dd99550ae8aed263299be43329b330e23
|
| 3 |
+
size 3840000
|
silence_latent_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 15000, 64], "dtype": "float32"}
|
silence_roundtripped.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e22b8c9e8a687c7ebfe57dc3bee42b5c330d35ca350f04575a79bca6045dfcd
|
| 3 |
+
size 192000
|
silence_roundtripped_meta.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"shape": [1, 750, 64], "dtype": "float32"}
|