shreyask Claude Opus 4.6 (1M context) committed on
Commit
9b1aef8
·
verified ·
1 Parent(s): f2081ca

feat: KittenTTS WebGPU browser demo

Browse files

Browser TTS using KittenML models with ONNX Runtime Web (WebGPU/WASM).
Phonemization via xenova/phonemizer.js. No server needed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

README.md CHANGED
@@ -1,10 +1,37 @@
1
  ---
2
  title: KittenTTS WebGPU
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: blue
6
  sdk: static
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: KittenTTS WebGPU
3
+ emoji: 🐱🔥
4
+ colorFrom: purple
5
+ colorTo: indigo
6
  sdk: static
7
  pinned: false
8
+ license: apache-2.0
9
  ---
10
 
11
+ # KittenTTS WebGPU
12
+
13
+ Text-to-speech running entirely in your browser via WebGPU/WASM. No server needed.
14
+
15
+ ## Credits
16
+
17
+ - **Models**: [KittenML](https://huggingface.co/KittenML) — ultra-lightweight TTS models based on StyleTTS 2
18
+ - **Original Demo**: [KittenTTS-Demo](https://huggingface.co/spaces/KittenML/KittenTTS-Demo)
19
+ - **Transformers.js v4**: [huggingface/transformers.js](https://github.com/huggingface/transformers.js) — ML inference in the browser
20
+ - **phonemizer.js**: [xenova/phonemizer.js](https://github.com/xenova/phonemizer.js) — eSpeak-NG phonemization for the web by [Xenova](https://github.com/xenova)
21
+ - **ONNX Runtime Web**: [onnxruntime](https://onnxruntime.ai) — cross-platform ML inference
22
+ - **Kokoro Web**: [xenova/kokoro-web](https://github.com/xenova/kokoro-web) — reference implementation for browser TTS
23
+
24
+ ## Development
25
+
26
+ ```bash
27
+ npm install
28
+ npm run dev
29
+ ```
30
+
31
+ ## Build
32
+
33
+ ```bash
34
+ npm run build
35
+ ```
36
+
37
+ Output goes to `dist/` — deploy as a static HuggingFace Space or any static host.
assets/index-CcptzfjJ.js ADDED
The diff for this file is too large to render. See raw diff
 
assets/index-Ck4Qq8gn.css ADDED
@@ -0,0 +1 @@
 
 
1
+ *,:before,:after{box-sizing:border-box;margin:0;padding:0}:root{--bg:#111;--surface:#1a1a1a;--surface-2:#222;--border:#333;--text:#e5e5e5;--text-muted:#888;--accent:#c084fc;--accent-dim:#c084fc26;--radius:8px;--font:system-ui, -apple-system, "Segoe UI", sans-serif;--mono:ui-monospace, "SF Mono", Consolas, monospace}html{--lightningcss-light: ;--lightningcss-dark:initial;color-scheme:dark}body{font-family:var(--font);background:var(--bg);color:var(--text);-webkit-font-smoothing:antialiased;line-height:1.5}.container{flex-direction:column;max-width:720px;min-height:100vh;margin:0 auto;padding:2rem 1.5rem;display:flex}header{text-align:center;margin-bottom:2rem;position:relative}header h1{letter-spacing:-.02em;margin-bottom:.25rem;font-size:1.75rem;font-weight:600}.logo{font-size:1.5rem}.subtitle{color:var(--text-muted);font-size:.9rem}.badge{font-size:.7rem;font-weight:600;font-family:var(--mono);letter-spacing:.05em;border-radius:999px;margin-top:.5rem;padding:.15rem .6rem;display:inline-block}.badge-gpu{color:#4ade80;background:#4ade8026;border:1px solid #4ade804d}.badge-wasm{color:#fbbf24;background:#fbbf2426;border:1px solid #fbbf244d}main{flex-direction:column;flex:1;gap:1.5rem;display:flex}label{color:var(--text-muted);text-transform:uppercase;letter-spacing:.05em;margin-bottom:.4rem;font-size:.8rem;font-weight:500;display:block}textarea{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);width:100%;color:var(--text);font-family:var(--font);resize:vertical;outline:none;padding:.75rem;font-size:.95rem;transition:border-color .15s}textarea:focus{border-color:var(--accent)}textarea::placeholder{color:#555}.controls-row{grid-template-columns:1fr 1fr;gap:.75rem;margin-top:.75rem;display:grid}select{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);width:100%;color:var(--text);font-family:var(--font);cursor:pointer;appearance:none;background-image:url("data:image/svg+xml,%3Csvg width='10' 
height='6' viewBox='0 0 10 6' fill='none' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1 1l4 4 4-4' stroke='%23888' stroke-width='1.5' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");background-position:right .75rem center;background-repeat:no-repeat;outline:none;padding:.5rem 2rem .5rem .75rem;font-size:.9rem}select:focus{border-color:var(--accent)}select:disabled{opacity:.5;cursor:not-allowed}.speed-row{margin-top:.75rem}.speed-row label{font-family:var(--mono);font-size:.75rem}input[type=range]{width:100%;accent-color:var(--accent);cursor:pointer;height:4px}.generate-btn{background:var(--accent);color:#111;border-radius:var(--radius);width:100%;font-family:var(--font);cursor:pointer;border:none;margin-top:1rem;padding:.7rem 1.5rem;font-size:.9rem;font-weight:600;transition:opacity .15s}.generate-btn:hover:not(:disabled){opacity:.9}.generate-btn:disabled{opacity:.4;cursor:not-allowed}.output-section{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1rem}.audio-result{flex-direction:column;gap:.5rem;display:flex}.audio-player{border-radius:var(--radius);width:100%}.duration{color:var(--text-muted);font-size:.75rem;font-family:var(--mono)}.audio-placeholder{text-align:center;color:#555;padding:2rem 1rem;font-size:.85rem}.examples-grid{flex-direction:column;gap:.5rem;display:flex}.example-btn{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);width:100%;color:var(--text);font-family:var(--font);cursor:pointer;text-align:left;align-items:baseline;gap:.75rem;padding:.6rem .75rem;font-size:.85rem;transition:border-color .15s;display:flex}.example-btn:hover:not(:disabled){border-color:var(--accent)}.example-btn:disabled{opacity:.4;cursor:not-allowed}.example-voice{font-family:var(--mono);color:var(--accent);background:var(--accent-dim);border-radius:4px;flex-shrink:0;padding:.1rem 
.4rem;font-size:.75rem;font-weight:600}.example-text{color:var(--text-muted)}.error-msg{color:#f87171;border-radius:var(--radius);background:#ef44441a;border:1px solid #ef44444d;padding:.75rem 1rem;font-size:.85rem}footer{border-top:1px solid var(--border);text-align:center;color:var(--text-muted);flex-direction:column;gap:.25rem;margin-top:2.5rem;padding-top:1.5rem;font-size:.75rem;display:flex}footer a{color:var(--text-muted);text-underline-offset:2px;text-decoration:underline;transition:color .15s}footer a:hover{color:var(--text)}@media (width<=480px){.container{padding:1.25rem 1rem}.controls-row{grid-template-columns:1fr}header h1{font-size:1.5rem}}
assets/ort-wasm-simd-threaded.jsep-Bzonanhp.wasm ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66dd6edabc43c9ec1df860978baa403c6610de2f3b3bbfdfcfcbbfadf7677132
3
+ size 25096522
assets/ort.bundle.min-DL658BJE.js ADDED
The diff for this file is too large to render. See raw diff
 
assets/phonemizer-BgK0uh4o.js ADDED
The diff for this file is too large to render. See raw diff
 
assets/worker-Br1yesTn.js ADDED
@@ -0,0 +1 @@
 
 
1
// Minified Web Worker bundle for the TTS pipeline. NOTE(review): this looks like
// generated build output (App.tsx creates its worker from "./worker.ts"), so real
// changes presumably belong in the TypeScript source — confirm before hand-editing.
//
// Map of the minified top-level names, in order of appearance:
//   e, t   symbol list (`$`, punctuation, ASCII letters, IPA characters) and the
//          lookup table symbol -> index built from it.
//   n(e)   maps each symbol of an iterable to its index; unknown symbols are dropped.
//   r(e)   n(e) wrapped with a leading 0 and a trailing 10, 0.
//   i(e)   parses a .npy header: checks the \x93NUMPY magic, reads a 2-byte header
//          length at offset 8 for major version 1 (4-byte otherwise), and extracts
//          'descr' and 'shape' with regexes. Throws on bad magic / missing descr.
//   a(e)   decodes a .npy payload into a Float32Array; accepts `<f4`/`float32` and
//          `<f8`/`float64` (converted element by element), throws on other dtypes.
//          The payload is copied into a fresh ArrayBuffer before being viewed.
//   o(e)   minimal zip reader: scans backwards for the End of Central Directory
//          signature (101010256 = 0x06054b50), walks central-directory records
//          (33639248 = 0x02014b50) and returns Map<fileName, Uint8Array>.
//          Method 0 (stored) is sliced, method 8 is inflated through
//          DecompressionStream(`deflate-raw`); other methods are skipped with a warning.
//   s(e)   fetches the voices archive and returns { name: { data, shape } } for
//          every `.npy` entry (name = file name without the extension).
//   c, l   phonemize function and ONNX Runtime module, assigned in _().
//   u      model-name substrings that force the WASM backend ([`int8`]).
//   d,f,p,m  inference session, loaded voices, parsed config.json, active device.
//   h(e,t) builds https://huggingface.co/<repo>/resolve/main/<file>.
//   g()    resolves to true when navigator.gpu yields an adapter, false otherwise.
//   _(e)   "load": imports the runtime and phonemizer chunks, fetches config.json,
//          picks `webgpu` unless unavailable or the model name matches u, downloads
//          the model (posting progress `status` messages) and the voices in
//          parallel, creates the session and posts `ready`.
//   v(e)   trims a chunk and appends `,` when it does not end in one of .!?,;:
//   y(e,t) splits text into sentence chunks of at most t (default 400) characters,
//          falling back to word-wise packing for longer sentences.
//   b(e)   splits a string into runs of letters/digits/underscore and single
//          non-space punctuation characters.
//   x(e,t,n)  synthesizes one chunk: resolves voice aliases and speed priors from the
//          config, phonemizes with `en-us`, selects the style row
//          min(text length, rows - 1), runs the session with input_ids/style/speed
//          and drops the final 5000 samples of the first output.
//   S(e,t,n)  "generate": runs x() per chunk, posts `progress`, concatenates the
//          results and posts `audio` (sampleRate 24000) with the buffer transferred;
//          failures are posted as `error`.
// The trailing listeners dispatch incoming `load` / `generate` actions and forward
// uncaught errors and unhandled rejections to the page as `error` messages.
+ const e=[`$`,...`;:,.!?¡¿—…"«»"" `,...`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz`,...`ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ`],t={};for(let n=0;n<e.length;n++)t[e[n]]=n;function n(e){let n=[];for(let r of e){let e=t[r];e!==void 0&&n.push(e)}return n}function r(e){let t=n(e);return t.unshift(0),t.push(10),t.push(0),t}function i(e){if(e[0]!==147||String.fromCharCode(e[1],e[2],e[3],e[4],e[5])!==`NUMPY`)throw Error(`Not a valid .npy file`);let t=e[6],n=new DataView(e.buffer,e.byteOffset,e.byteLength),r,i;t===1?(r=n.getUint16(8,!0),i=10):(r=n.getUint32(8,!0),i=12);let a=new TextDecoder().decode(e.slice(i,i+r)),o=a.match(/'descr'\s*:\s*'([^']+)'/),s=a.match(/'shape'\s*:\s*\(([^)]*)\)/);if(!o)throw Error(`Could not parse dtype from .npy header: `+a);return{descr:o[1],shape:s?s[1].split(`,`).map(e=>parseInt(e.trim(),10)).filter(e=>!isNaN(e)):[],dataOffset:i+r}}function a(e){let{descr:t,shape:n,dataOffset:r}=i(e),a=e.slice(r),o=new ArrayBuffer(a.length);new Uint8Array(o).set(a);let s;if(t===`<f4`||t===`float32`)s=new Float32Array(o);else if(t===`<f8`||t===`float64`){let e=new Float64Array(o);s=new Float32Array(e.length);for(let t=0;t<e.length;t++)s[t]=e[t]}else throw Error(`Unsupported npy dtype: `+t);return{data:s,shape:n}}async function o(e){let t=new Uint8Array(e),n=new DataView(e),r=new Map,i=-1;for(let e=t.length-22;e>=0;e--)if(n.getUint32(e,!0)===101010256){i=e;break}if(i===-1)throw Error(`Could not find End of Central Directory`);let a=n.getUint32(i+16,!0),o=n.getUint16(i+10,!0),s=[],c=a;for(let e=0;e<o&&n.getUint32(c,!0)===33639248;e++){let e=n.getUint16(c+10,!0),r=n.getUint32(c+20,!0),i=n.getUint32(c+24,!0),a=n.getUint16(c+28,!0),o=n.getUint16(c+30,!0),l=n.getUint16(c+32,!0),u=n.getUint32(c+42,!0),d=new TextDecoder().decode(t.slice(c+46,c+46+a));s.push({fileName:d,compressedSize:r,uncompressedSize:i,localHeaderOffset:u,compressionMethod:e}),c+=46+a+o+l}for(let e of s){let 
i=e.localHeaderOffset,a=n.getUint16(i+26,!0),o=n.getUint16(i+28,!0),s=i+30+a+o,c;if(e.compressionMethod===0)c=t.slice(s,s+e.uncompressedSize);else if(e.compressionMethod===8){let n=t.slice(s,s+e.compressedSize),r=new DecompressionStream(`deflate-raw`),i=r.writable.getWriter();i.write(n),i.close();let a=r.readable.getReader(),o=[],l=0;for(;;){let{done:e,value:t}=await a.read();if(e)break;o.push(t),l+=t.length}c=new Uint8Array(l);let u=0;for(let e of o)c.set(e,u),u+=e.length}else{console.warn(`Skipping ${e.fileName}: unsupported compression ${e.compressionMethod}`);continue}r.set(e.fileName,c)}return r}async function s(e){let t=await fetch(e);if(!t.ok)throw Error(`Failed to fetch voices: ${t.status}`);let n=await o(await t.arrayBuffer()),r={};for(let[e,t]of n){if(!e.endsWith(`.npy`))continue;let n=e.replace(/\.npy$/,``),{data:i,shape:o}=a(t);r[n]={data:i,shape:[o[0]||1,o[1]||i.length]}}return r}let c,l;const u=[`int8`];let d=null,f={},p=null,m=`wasm`;function h(e,t){return`https://huggingface.co/${e}/resolve/main/${t}`}async function g(){try{return`gpu`in navigator?!!await navigator.gpu.requestAdapter():!1}catch{return!1}}async function _(e){self.postMessage({type:`status`,message:`Detecting hardware...`});let t=await g();self.postMessage({type:`status`,message:`Loading runtime...`});let[n,r]=await Promise.all([import(`./ort.bundle.min-DL658BJE.js`),import(`./phonemizer-BgK0uh4o.js`)]);l=n,c=r.phonemize,self.postMessage({type:`status`,message:`Loading config...`});let i=h(e,`config.json`);p=await(await fetch(i)).json();let a=p.model||e.split(`/`).pop()||``,o=u.some(e=>a.includes(e));m=t&&!o?`webgpu`:`wasm`,t&&o&&console.log(`[KittenTTS] Using WASM for "${a}" (int8 models produce NaN on WebGPU)`),self.postMessage({type:`device`,device:m}),self.postMessage({type:`status`,message:`Downloading model & voices...`});let _=h(e,p.model_file),v=(async()=>{let e=await fetch(_);if(!e.ok)throw Error(`Failed to fetch model: ${e.status}`);let 
t=parseInt(e.headers.get(`content-length`)||`0`,10),n=e.body.getReader(),r=[],i=0;for(;;){let{done:e,value:a}=await n.read();if(e)break;if(r.push(a),i+=a.length,t>0){let e=Math.round(i/t*100),n=(i/1024/1024).toFixed(1);self.postMessage({type:`status`,message:`Downloading model... ${e}% (${n} MB)`})}}let a=new Uint8Array(i),o=0;for(let e of r)a.set(e,o),o+=e.length;return a.buffer})(),y=s(h(e,p.voices)),[b,x]=await Promise.all([v,y]);f=x,self.postMessage({type:`status`,message:`Initializing ${m.toUpperCase()} session...`});let S={executionProviders:m===`webgpu`?[`webgpu`]:[`wasm`]};m===`wasm`&&(l.env.wasm.numThreads=1),d=await l.InferenceSession.create(b,S);let C=p.voice_aliases?Object.keys(p.voice_aliases):Object.keys(f);self.postMessage({type:`ready`,voices:C,device:m,modelName:p.name})}function v(e){return e=e.trim(),e&&(`.!?,;:`.includes(e[e.length-1])||(e+=`,`),e)}function y(e,t=400){let n=e.match(/[^.!?]*[.!?]+|[^.!?]+$/g)||[e],r=[];for(let e of n)if(e=e.trim(),e)if(e.length<=t)r.push(v(e));else{let n=e.split(/\s+/),i=``;for(let e of n)i.length+e.length+1<=t?i+=(i?` `:``)+e:(i&&r.push(v(i)),i=e);i&&r.push(v(i))}return r}function b(e){return e.match(/[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s]/gu)||[]}async function x(e,t,n){if(!d||!p)throw Error(`Model not loaded`);let i=t;p.voice_aliases?.[t]&&(i=p.voice_aliases[t]);let a=f[i];if(!a)throw Error(`Voice "${t}" not found`);p.speed_priors?.[i]&&(n*=p.speed_priors[i]);let o=r(b((await c(e,`en-us`))[0]||``).join(` `)),s=Math.min(e.length,a.shape[0]-1),u=a.shape[1],m=a.data.slice(s*u,(s+1)*u),h=new l.Tensor(`int64`,BigInt64Array.from(o.map(BigInt)),[1,o.length]),g=new l.Tensor(`float32`,m,[1,u]),_=new l.Tensor(`float32`,new Float32Array([n]),[1]),v=(await d.run({input_ids:h,style:g,speed:_}))[d.outputNames[0]].data;return v.slice(0,Math.max(0,v.length-5e3))}async function S(e,t,n){try{let r=y(e);self.postMessage({type:`status`,message:`Generating (${r.length} chunk${r.length>1?`s`:``})...`});let i=[];for(let 
e=0;e<r.length;e++){self.postMessage({type:`progress`,current:e+1,total:r.length});let a=await x(r[e],t,n);i.push(a)}let a=i.reduce((e,t)=>e+t.length,0),o=new Float32Array(a),s=0;for(let e of i)o.set(e,s),s+=e.length;self.postMessage({type:`audio`,audio:o.buffer,sampleRate:24e3},{transfer:[o.buffer]})}catch(e){self.postMessage({type:`error`,error:e.message||String(e)})}}self.addEventListener(`message`,async e=>{let{action:t,...n}=e.data;switch(t){case`load`:try{await _(n.repoId)}catch(e){console.error(`[KittenTTS Worker] Load error:`,e),self.postMessage({type:`error`,error:e.message||String(e)})}break;case`generate`:await S(n.text,n.voice,n.speed);break}}),self.addEventListener(`error`,e=>{self.postMessage({type:`error`,error:e.message||`Unknown worker error`})}),self.addEventListener(`unhandledrejection`,e=>{self.postMessage({type:`error`,error:e.reason?.message||String(e.reason)})});
index.html CHANGED
@@ -1,19 +1,14 @@
1
  <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
  </html>
 
1
  <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
+ <title>KittenTTS Browser TTS with WebGPU</title>
7
+ <meta name="description" content="Text-to-speech running entirely in your browser via WebGPU. Powered by KittenML models and Transformers.js v4." />
8
+ <script type="module" crossorigin src="/assets/index-CcptzfjJ.js"></script>
9
+ <link rel="stylesheet" crossorigin href="/assets/index-Ck4Qq8gn.css">
10
+ </head>
11
+ <body>
12
+ <div id="root"></div>
13
+ </body>
 
 
 
 
 
14
  </html>
package.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "kitten-tts-web",
3
+ "private": true,
4
+ "version": "0.0.0",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite",
8
+ "build": "tsc -b && vite build",
9
+ "lint": "eslint .",
10
+ "preview": "vite preview"
11
+ },
12
+ "dependencies": {
13
+ "@huggingface/transformers": "^4.0.0-next.8",
14
+ "onnxruntime-web": "^1.25.0-dev.20260307-d626b568e0",
15
+ "phonemizer": "^1.2.1",
16
+ "react": "^19.2.4",
17
+ "react-dom": "^19.2.4"
18
+ },
19
+ "devDependencies": {
20
+ "@eslint/js": "^9.39.4",
21
+ "@types/node": "^24.12.0",
22
+ "@types/react": "^19.2.14",
23
+ "@types/react-dom": "^19.2.3",
24
+ "@vitejs/plugin-react": "^6.0.1",
25
+ "@webgpu/types": "^0.1.69",
26
+ "eslint": "^9.39.4",
27
+ "eslint-plugin-react-hooks": "^7.0.1",
28
+ "eslint-plugin-react-refresh": "^0.5.2",
29
+ "globals": "^17.4.0",
30
+ "typescript": "~5.9.3",
31
+ "typescript-eslint": "^8.57.0",
32
+ "vite": "^8.0.1"
33
+ }
34
+ }
src/App.tsx ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useRef, useCallback, useEffect } from "react";
2
+
3
/** UI lifecycle states; the Generate button is only enabled in "ready". */
type Status = "idle" | "loading" | "ready" | "generating" | "error";

/** Model picker entries: display label -> repo id sent to the worker's "load" action. */
const MODELS: Record<string, string> = {
  "Nano Int8 (15M · Fastest)": "KittenML/kitten-tts-nano-0.8-int8",
  "Nano FP32 (15M)": "KittenML/kitten-tts-nano-0.8-fp32",
  "Micro (40M · Balanced)": "KittenML/kitten-tts-micro-0.8",
  "Mini (80M · Best Quality)": "KittenML/kitten-tts-mini-0.8",
};

/** Label (a key of MODELS) that is loaded when the page first mounts. */
const DEFAULT_MODEL = "Nano FP32 (15M)";

/** One-click samples: each fills the text box and selects the paired voice. */
const EXAMPLES = [
  { text: "Space is a three-dimensional continuum containing positions and directions.", voice: "Jasper" },
  { text: "She picked up her coffee and walked toward the window.", voice: "Luna" },
  { text: "The sun set slowly over the calm, quiet lake.", voice: "Bella" },
];
28
+
29
/**
 * Root component of the KittenTTS demo.
 *
 * Owns all UI state plus a single Web Worker. The worker receives
 * `{ action: "load", repoId }` and `{ action: "generate", text, voice, speed }`
 * and answers with `status`, `device`, `ready`, `progress`, `audio` and `error`
 * messages, which are mirrored into React state by the listener in `initWorker`.
 */
export default function App() {
  const [text, setText] = useState("");
  const [model, setModel] = useState(DEFAULT_MODEL);
  const [voice, setVoice] = useState("Jasper");
  const [speed, setSpeed] = useState(1.0);
  const [voices, setVoices] = useState<string[]>([]);
  const [status, setStatus] = useState<Status>("idle");
  const [statusMsg, setStatusMsg] = useState("");
  const [device, setDevice] = useState("");
  const [progress, setProgress] = useState({ current: 0, total: 0 });
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [duration, setDuration] = useState<number | null>(null);

  const workerRef = useRef<Worker | null>(null);
  const genStartRef = useRef<number>(0);
  // Tracks the live object URL outside of React state so it can be revoked
  // from event handlers and from the unmount cleanup.
  const audioUrlRef = useRef<string | null>(null);

  /** Swap the audio object URL, revoking the one it replaces so blobs are not leaked. */
  const replaceAudioUrl = useCallback((next: string | null) => {
    if (audioUrlRef.current) URL.revokeObjectURL(audioUrlRef.current);
    audioUrlRef.current = next;
    setAudioUrl(next);
  }, []);

  /** Create the worker (terminating any previous one) and wire its events to state. */
  const initWorker = useCallback(() => {
    if (workerRef.current) workerRef.current.terminate();

    const worker = new Worker(new URL("./worker.ts", import.meta.url), {
      type: "module",
    });
    workerRef.current = worker;

    worker.addEventListener("error", (e) => {
      console.error("Worker error:", e);
      setError(`Worker failed: ${e.message}`);
      setStatus("error");
      setStatusMsg("");
    });

    worker.addEventListener("message", (e) => {
      const msg = e.data;
      switch (msg.type) {
        case "status":
          setStatusMsg(msg.message);
          break;
        case "device":
          setDevice(msg.device);
          break;
        case "ready": {
          const available: string[] = msg.voices;
          setStatus("ready");
          setVoices(available);
          // The <select> shows its first option when `voice` is not in the list,
          // so keep the state in sync with what the user actually sees.
          setVoice((prev) =>
            available.length === 0 || available.includes(prev) ? prev : available[0]
          );
          setStatusMsg(`${msg.modelName} loaded`);
          break;
        }
        case "progress":
          setProgress({ current: msg.current, total: msg.total });
          break;
        case "audio": {
          const audioData = new Float32Array(msg.audio);
          const blob = float32ToWav(audioData, msg.sampleRate);
          replaceAudioUrl(URL.createObjectURL(blob));
          setDuration(Math.round(performance.now() - genStartRef.current));
          setStatus("ready");
          setStatusMsg("Done!");
          break;
        }
        case "error":
          setError(msg.error);
          setStatus("error");
          setStatusMsg("");
          break;
      }
    });

    return worker;
  }, [replaceAudioUrl]);

  /** Reset the output area and ask the worker to load the model behind `modelKey`. */
  const loadModel = useCallback(
    (modelKey: string) => {
      const worker = workerRef.current || initWorker();
      setStatus("loading");
      setError(null);
      replaceAudioUrl(null);
      setDuration(null);
      setStatusMsg("Starting...");
      worker.postMessage({ action: "load", repoId: MODELS[modelKey] });
    },
    [initWorker, replaceAudioUrl]
  );

  useEffect(() => {
    loadModel(model);
    return () => {
      workerRef.current?.terminate();
      // Drop the reference: when the effect runs again (e.g. StrictMode's
      // mount → unmount → mount in development) loadModel must create a fresh
      // worker instead of posting to the terminated one.
      workerRef.current = null;
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
        audioUrlRef.current = null;
      }
    };
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, []);

  const handleModelChange = (newModel: string) => {
    setModel(newModel);
    loadModel(newModel);
  };

  const handleGenerate = () => {
    const worker = workerRef.current;
    // Without a worker nothing could ever answer, so do not enter "generating".
    if (!worker || !text.trim() || status !== "ready") return;
    setStatus("generating");
    setError(null);
    setDuration(null);
    setProgress({ current: 0, total: 0 });
    genStartRef.current = performance.now();
    worker.postMessage({ action: "generate", text, voice, speed });
  };

  const handleExample = (ex: (typeof EXAMPLES)[0]) => {
    setText(ex.text);
    // Only switch to the example's voice when the loaded model offers it.
    if (voices.includes(ex.voice)) setVoice(ex.voice);
  };

  const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
    if ((e.metaKey || e.ctrlKey) && e.key === "Enter") {
      e.preventDefault();
      handleGenerate();
    }
  };

  return (
    <div className="container">
      <header>
        <h1>
          <span className="logo">🐱</span> KittenTTS
        </h1>
        <p className="subtitle">
          Text-to-speech running entirely in your browser
        </p>
        {device && (
          <span
            className={`badge ${device === "webgpu" ? "badge-gpu" : "badge-wasm"}`}
          >
            {device.toUpperCase()}
          </span>
        )}
      </header>

      <main>
        <div className="input-section">
          <label htmlFor="text-input">Text</label>
          <textarea
            id="text-input"
            value={text}
            onChange={(e) => setText(e.target.value)}
            onKeyDown={handleKeyDown}
            placeholder="Enter text to synthesize…"
            rows={5}
          />

          <div className="controls-row">
            <div className="control">
              <label htmlFor="model-select">Model</label>
              <select
                id="model-select"
                value={model}
                onChange={(e) => handleModelChange(e.target.value)}
                disabled={status === "loading" || status === "generating"}
              >
                {Object.keys(MODELS).map((m) => (
                  <option key={m} value={m}>
                    {m}
                  </option>
                ))}
              </select>
            </div>
            <div className="control">
              <label htmlFor="voice-select">Voice</label>
              <select
                id="voice-select"
                value={voice}
                onChange={(e) => setVoice(e.target.value)}
                disabled={voices.length === 0}
              >
                {voices.map((v) => (
                  <option key={v} value={v}>
                    {v}
                  </option>
                ))}
              </select>
            </div>
          </div>

          <div className="speed-row">
            <label htmlFor="speed-slider">Speed: {speed.toFixed(2)}x</label>
            <input
              id="speed-slider"
              type="range"
              min={0.5}
              max={2.0}
              step={0.05}
              value={speed}
              onChange={(e) => setSpeed(parseFloat(e.target.value))}
            />
          </div>

          <button
            className="generate-btn"
            onClick={handleGenerate}
            disabled={status !== "ready" || !text.trim()}
          >
            {status === "generating"
              ? progress.total > 0
                ? `Generating ${progress.current}/${progress.total}…`
                : "Generating…"
              : status === "loading"
                ? "Loading model…"
                : "Generate Speech"}
          </button>
        </div>

        <div className="output-section">
          <label>Output</label>
          {audioUrl ? (
            <div className="audio-result">
              <audio controls src={audioUrl} className="audio-player" />
              {duration !== null && (
                <span className="duration">
                  Generated in {(duration / 1000).toFixed(1)}s
                </span>
              )}
            </div>
          ) : (
            <div className="audio-placeholder">
              {status === "loading" || status === "generating"
                ? statusMsg
                : "Audio will appear here"}
            </div>
          )}
        </div>

        <div className="examples">
          <label>Examples</label>
          <div className="examples-grid">
            {EXAMPLES.map((ex, i) => (
              <button
                key={i}
                className="example-btn"
                onClick={() => handleExample(ex)}
                disabled={status !== "ready"}
              >
                <span className="example-voice">{ex.voice}</span>
                <span className="example-text">{ex.text}</span>
              </button>
            ))}
          </div>
        </div>

        {error && <div className="error-msg">{error}</div>}
      </main>

      <footer>
        <p>
          Models by{" "}
          <a
            href="https://huggingface.co/KittenML"
            target="_blank"
            rel="noopener"
          >
            KittenML
          </a>
          {" · "}
          Original demo:{" "}
          <a
            href="https://huggingface.co/spaces/KittenML/KittenTTS-Demo"
            target="_blank"
            rel="noopener"
          >
            KittenTTS-Demo
          </a>
        </p>
        <p>
          Powered by{" "}
          <a
            href="https://github.com/huggingface/transformers.js"
            target="_blank"
            rel="noopener"
          >
            Transformers.js v4
          </a>
          {" · "}
          <a
            href="https://github.com/xenova/phonemizer.js"
            target="_blank"
            rel="noopener"
          >
            phonemizer.js
          </a>{" "}
          by{" "}
          <a
            href="https://github.com/xenova"
            target="_blank"
            rel="noopener"
          >
            Xenova
          </a>
          {" · "}
          <a href="https://onnxruntime.ai" target="_blank" rel="noopener">
            ONNX Runtime Web
          </a>
        </p>
      </footer>
    </div>
  );
}
337
+
338
/**
 * Encode mono PCM samples as a WAV file in 32-bit IEEE-float format (format tag 3).
 *
 * If the peak magnitude exceeds 1.0 the whole clip is scaled so the peak lands
 * at 0.95 (a little headroom); quieter clips are written unchanged. The input
 * array is left untouched. Non-finite samples (NaN / ±Infinity) are ignored
 * when measuring the peak and written as silence, so a single bad value cannot
 * zero out or corrupt the rest of the clip.
 *
 * @param samples    Mono PCM samples.
 * @param sampleRate Sample rate in Hz, written to the header.
 * @returns Blob of type "audio/wav": 44-byte header followed by little-endian float32 data.
 */
function float32ToWav(samples: Float32Array, sampleRate: number): Blob {
  const headerSize = 44;
  const bytesPerSample = 4;

  // Peak over finite samples only: Infinity would force the gain to 0.
  let peak = 0;
  for (let i = 0; i < samples.length; i++) {
    const magnitude = Math.abs(samples[i]);
    if (Number.isFinite(magnitude) && magnitude > peak) peak = magnitude;
  }
  const gain = peak > 1 ? 0.95 / peak : 1;

  const dataSize = samples.length * bytesPerSample;
  const buffer = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(buffer);

  const writeStr = (offset: number, str: string) => {
    for (let i = 0; i < str.length; i++) {
      view.setUint8(offset + i, str.charCodeAt(i));
    }
  };

  // RIFF container header
  writeStr(0, "RIFF");
  view.setUint32(4, 36 + dataSize, true); // file size minus the 8-byte RIFF preamble
  writeStr(8, "WAVE");

  // "fmt " chunk
  writeStr(12, "fmt ");
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 3, true); // format tag: IEEE float
  view.setUint16(22, 1, true); // channels: mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * bytesPerSample, true); // byte rate
  view.setUint16(32, bytesPerSample, true); // block align (one mono frame)
  view.setUint16(34, 32, true); // bits per sample

  // "data" chunk
  writeStr(36, "data");
  view.setUint32(40, dataSize, true);

  // Apply the gain while writing instead of mutating the caller's array.
  for (let i = 0; i < samples.length; i++) {
    const sample = samples[i];
    view.setFloat32(
      headerSize + i * bytesPerSample,
      Number.isFinite(sample) ? sample * gain : 0,
      true
    );
  }

  return new Blob([buffer], { type: "audio/wav" });
}
src/index.css ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *,
2
+ *::before,
3
+ *::after {
4
+ box-sizing: border-box;
5
+ margin: 0;
6
+ padding: 0;
7
+ }
8
+
9
+ :root {
10
+ --bg: #111;
11
+ --surface: #1a1a1a;
12
+ --surface-2: #222;
13
+ --border: #333;
14
+ --text: #e5e5e5;
15
+ --text-muted: #888;
16
+ --accent: #c084fc;
17
+ --accent-dim: rgba(192, 132, 252, 0.15);
18
+ --radius: 8px;
19
+ --font: system-ui, -apple-system, "Segoe UI", sans-serif;
20
+ --mono: ui-monospace, "SF Mono", Consolas, monospace;
21
+ }
22
+
23
+ html {
24
+ color-scheme: dark;
25
+ }
26
+
27
+ body {
28
+ font-family: var(--font);
29
+ background: var(--bg);
30
+ color: var(--text);
31
+ line-height: 1.5;
32
+ -webkit-font-smoothing: antialiased;
33
+ }
34
+
35
+ .container {
36
+ max-width: 720px;
37
+ margin: 0 auto;
38
+ padding: 2rem 1.5rem;
39
+ min-height: 100vh;
40
+ display: flex;
41
+ flex-direction: column;
42
+ }
43
+
44
+ /* Header */
45
+
46
+ header {
47
+ text-align: center;
48
+ margin-bottom: 2rem;
49
+ position: relative;
50
+ }
51
+
52
+ header h1 {
53
+ font-size: 1.75rem;
54
+ font-weight: 600;
55
+ letter-spacing: -0.02em;
56
+ margin-bottom: 0.25rem;
57
+ }
58
+
59
+ .logo {
60
+ font-size: 1.5rem;
61
+ }
62
+
63
+ .subtitle {
64
+ color: var(--text-muted);
65
+ font-size: 0.9rem;
66
+ }
67
+
68
+ .badge {
69
+ display: inline-block;
70
+ margin-top: 0.5rem;
71
+ padding: 0.15rem 0.6rem;
72
+ border-radius: 999px;
73
+ font-size: 0.7rem;
74
+ font-weight: 600;
75
+ font-family: var(--mono);
76
+ letter-spacing: 0.05em;
77
+ }
78
+
79
+ .badge-gpu {
80
+ background: rgba(74, 222, 128, 0.15);
81
+ color: #4ade80;
82
+ border: 1px solid rgba(74, 222, 128, 0.3);
83
+ }
84
+
85
+ .badge-wasm {
86
+ background: rgba(251, 191, 36, 0.15);
87
+ color: #fbbf24;
88
+ border: 1px solid rgba(251, 191, 36, 0.3);
89
+ }
90
+
91
+ /* Main */
92
+
93
+ main {
94
+ flex: 1;
95
+ display: flex;
96
+ flex-direction: column;
97
+ gap: 1.5rem;
98
+ }
99
+
100
+ label {
101
+ display: block;
102
+ font-size: 0.8rem;
103
+ font-weight: 500;
104
+ color: var(--text-muted);
105
+ margin-bottom: 0.4rem;
106
+ text-transform: uppercase;
107
+ letter-spacing: 0.05em;
108
+ }
109
+
110
+ textarea {
111
+ width: 100%;
112
+ background: var(--surface);
113
+ border: 1px solid var(--border);
114
+ border-radius: var(--radius);
115
+ color: var(--text);
116
+ font-family: var(--font);
117
+ font-size: 0.95rem;
118
+ padding: 0.75rem;
119
+ resize: vertical;
120
+ outline: none;
121
+ transition: border-color 0.15s;
122
+ }
123
+
124
+ textarea:focus {
125
+ border-color: var(--accent);
126
+ }
127
+
128
+ textarea::placeholder {
129
+ color: #555;
130
+ }
131
+
132
+ /* Controls */
133
+
134
+ .controls-row {
135
+ display: grid;
136
+ grid-template-columns: 1fr 1fr;
137
+ gap: 0.75rem;
138
+ margin-top: 0.75rem;
139
+ }
140
+
141
+ select {
142
+ width: 100%;
143
+ background: var(--surface);
144
+ border: 1px solid var(--border);
145
+ border-radius: var(--radius);
146
+ color: var(--text);
147
+ font-family: var(--font);
148
+ font-size: 0.9rem;
149
+ padding: 0.5rem 0.75rem;
150
+ outline: none;
151
+ cursor: pointer;
152
+ appearance: none;
153
+ background-image: url("data:image/svg+xml,%3Csvg width='10' height='6' viewBox='0 0 10 6' fill='none' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1 1l4 4 4-4' stroke='%23888' stroke-width='1.5' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");
154
+ background-repeat: no-repeat;
155
+ background-position: right 0.75rem center;
156
+ padding-right: 2rem;
157
+ }
158
+
159
+ select:focus {
160
+ border-color: var(--accent);
161
+ }
162
+
163
+ select:disabled {
164
+ opacity: 0.5;
165
+ cursor: not-allowed;
166
+ }
167
+
168
+ .speed-row {
169
+ margin-top: 0.75rem;
170
+ }
171
+
172
+ .speed-row label {
173
+ font-family: var(--mono);
174
+ font-size: 0.75rem;
175
+ }
176
+
177
+ input[type="range"] {
178
+ width: 100%;
179
+ accent-color: var(--accent);
180
+ height: 4px;
181
+ cursor: pointer;
182
+ }
183
+
184
+ /* Generate button */
185
+
186
+ .generate-btn {
187
+ margin-top: 1rem;
188
+ width: 100%;
189
+ padding: 0.7rem 1.5rem;
190
+ background: var(--accent);
191
+ color: #111;
192
+ border: none;
193
+ border-radius: var(--radius);
194
+ font-family: var(--font);
195
+ font-size: 0.9rem;
196
+ font-weight: 600;
197
+ cursor: pointer;
198
+ transition: opacity 0.15s;
199
+ }
200
+
201
+ .generate-btn:hover:not(:disabled) {
202
+ opacity: 0.9;
203
+ }
204
+
205
+ .generate-btn:disabled {
206
+ opacity: 0.4;
207
+ cursor: not-allowed;
208
+ }
209
+
210
+ /* Output */
211
+
212
+ .output-section {
213
+ background: var(--surface);
214
+ border: 1px solid var(--border);
215
+ border-radius: var(--radius);
216
+ padding: 1rem;
217
+ }
218
+
219
+ .audio-result {
220
+ display: flex;
221
+ flex-direction: column;
222
+ gap: 0.5rem;
223
+ }
224
+
225
+ .audio-player {
226
+ width: 100%;
227
+ border-radius: var(--radius);
228
+ }
229
+
230
+ .duration {
231
+ font-size: 0.75rem;
232
+ color: var(--text-muted);
233
+ font-family: var(--mono);
234
+ }
235
+
236
+ .audio-placeholder {
237
+ padding: 2rem 1rem;
238
+ text-align: center;
239
+ color: #555;
240
+ font-size: 0.85rem;
241
+ }
242
+
243
+ /* Examples */
244
+
245
+ .examples-grid {
246
+ display: flex;
247
+ flex-direction: column;
248
+ gap: 0.5rem;
249
+ }
250
+
251
+ .example-btn {
252
+ display: flex;
253
+ align-items: baseline;
254
+ gap: 0.75rem;
255
+ width: 100%;
256
+ background: var(--surface);
257
+ border: 1px solid var(--border);
258
+ border-radius: var(--radius);
259
+ padding: 0.6rem 0.75rem;
260
+ color: var(--text);
261
+ font-family: var(--font);
262
+ font-size: 0.85rem;
263
+ cursor: pointer;
264
+ text-align: left;
265
+ transition: border-color 0.15s;
266
+ }
267
+
268
+ .example-btn:hover:not(:disabled) {
269
+ border-color: var(--accent);
270
+ }
271
+
272
+ .example-btn:disabled {
273
+ opacity: 0.4;
274
+ cursor: not-allowed;
275
+ }
276
+
277
+ .example-voice {
278
+ flex-shrink: 0;
279
+ font-family: var(--mono);
280
+ font-size: 0.75rem;
281
+ font-weight: 600;
282
+ color: var(--accent);
283
+ padding: 0.1rem 0.4rem;
284
+ background: var(--accent-dim);
285
+ border-radius: 4px;
286
+ }
287
+
288
+ .example-text {
289
+ color: var(--text-muted);
290
+ }
291
+
292
+ /* Error */
293
+
294
+ .error-msg {
295
+ background: rgba(239, 68, 68, 0.1);
296
+ border: 1px solid rgba(239, 68, 68, 0.3);
297
+ color: #f87171;
298
+ border-radius: var(--radius);
299
+ padding: 0.75rem 1rem;
300
+ font-size: 0.85rem;
301
+ }
302
+
303
+ /* Footer */
304
+
305
+ footer {
306
+ margin-top: 2.5rem;
307
+ padding-top: 1.5rem;
308
+ border-top: 1px solid var(--border);
309
+ text-align: center;
310
+ font-size: 0.75rem;
311
+ color: var(--text-muted);
312
+ display: flex;
313
+ flex-direction: column;
314
+ gap: 0.25rem;
315
+ }
316
+
317
+ footer a {
318
+ color: var(--text-muted);
319
+ text-decoration: underline;
320
+ text-underline-offset: 2px;
321
+ transition: color 0.15s;
322
+ }
323
+
324
+ footer a:hover {
325
+ color: var(--text);
326
+ }
327
+
328
+ /* Responsive */
329
+
330
+ @media (max-width: 480px) {
331
+ .container {
332
+ padding: 1.25rem 1rem;
333
+ }
334
+ .controls-row {
335
+ grid-template-columns: 1fr;
336
+ }
337
+ header h1 {
338
+ font-size: 1.5rem;
339
+ }
340
+ }
src/lib/npz-reader.ts ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Voice data loader — loads KittenTTS voice embeddings.
3
+ *
4
+ * The .npz archive (a zip of .npy files) is downloaded and decoded here with a
5
+ * small built-in zip + npy parser that copies data into aligned buffers.
6
+ */
7
+
8
/** Embedding table for one voice, decoded from a single .npy entry of the archive. */
export interface VoiceInfo {
  /** All embedding values as one flat array; `shape` says how to index it. */
  data: Float32Array;
  shape: [number, number]; // [numStyles, styleDim]
}
12
+
13
/**
 * Read the textual header of a NumPy `.npy` payload.
 *
 * @param bytes - Complete contents of one `.npy` file.
 * @returns The dtype descriptor string, the parsed shape (empty when no shape
 *          tuple is found or it has no numeric entries) and the byte offset at
 *          which the array data begins.
 * @throws Error when the magic prefix is wrong or no `descr` field is present.
 */
function parseNpyHeader(bytes: Uint8Array) {
  // Every .npy file starts with the byte 0x93 followed by the ASCII text "NUMPY".
  const magicText = String.fromCharCode(...bytes.subarray(1, 6));
  if (bytes[0] !== 0x93 || magicText !== "NUMPY") {
    throw new Error("Not a valid .npy file");
  }

  // Format version 1 stores the header length as a uint16 at offset 8; every
  // other version is read as a uint32 at the same offset.
  const dv = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const isVersionOne = bytes[6] === 1;
  const headerLen = isVersionOne ? dv.getUint16(8, true) : dv.getUint32(8, true);
  const headerOffset = isVersionOne ? 10 : 12;
  const dataOffset = headerOffset + headerLen;

  // The header is a Python dict literal; only 'descr' and 'shape' are extracted.
  const headerStr = new TextDecoder().decode(bytes.slice(headerOffset, dataOffset));

  const descrMatch = /'descr'\s*:\s*'([^']+)'/.exec(headerStr);
  const shapeMatch = /'shape'\s*:\s*\(([^)]*)\)/.exec(headerStr);

  if (!descrMatch) throw new Error("Could not parse dtype from .npy header: " + headerStr);

  const shape: number[] = [];
  if (shapeMatch) {
    for (const piece of shapeMatch[1].split(",")) {
      const dim = parseInt(piece.trim(), 10);
      // "(256,)" leaves an empty trailing piece that must be dropped.
      if (!isNaN(dim)) shape.push(dim);
    }
  }

  return { descr: descrMatch[1], shape, dataOffset };
}
50
+
51
/**
 * Decode a `.npy` payload into a Float32Array.
 *
 * Supports little-endian float32 (`<f4`) and float64 (`<f8`) arrays; float64
 * values are narrowed to float32.
 *
 * @param bytes - Complete contents of one `.npy` file.
 * @returns The decoded values and the shape reported by the header.
 * @throws Error when the dtype is unsupported or the payload is shorter than
 *         the header's shape requires.
 */
function npyToFloat32(bytes: Uint8Array): { data: Float32Array; shape: number[] } {
  const { descr, shape, dataOffset } = parseNpyHeader(bytes);

  let bytesPerElement: number;
  if (descr === "<f4" || descr === "float32") {
    bytesPerElement = 4;
  } else if (descr === "<f8" || descr === "float64") {
    bytesPerElement = 8;
  } else {
    throw new Error("Unsupported npy dtype: " + descr);
  }

  // Use the element count declared by the header when there is one, so that
  // trailing bytes are ignored and a truncated payload gives a clear error
  // instead of a typed-array RangeError.
  const available = Math.max(0, Math.floor((bytes.length - dataOffset) / bytesPerElement));
  const declared = shape.length > 0 ? shape.reduce((total, dim) => total * dim, 1) : available;
  if (declared > available) {
    throw new Error(
      `Truncated .npy payload: header declares ${declared} elements, found ${available}`
    );
  }

  // slice() copies into a fresh ArrayBuffer at offset 0, which satisfies the
  // alignment that Float32Array/Float64Array views require.
  const raw = bytes.slice(dataOffset, dataOffset + declared * bytesPerElement);
  const data =
    bytesPerElement === 4
      ? new Float32Array(raw.buffer)
      : Float32Array.from(new Float64Array(raw.buffer));

  return { data, shape };
}
72
+
73
/**
 * Extract the entries of a zip archive held in memory.
 *
 * Sizes, offsets and compression methods are taken from the Central Directory
 * rather than the local headers, so entries written with data descriptors
 * (flag bit 3) or placeholder local sizes are still read correctly.
 * Stored (method 0) and deflated (method 8) entries are supported; any other
 * method is skipped with a console warning.
 *
 * @param buffer - The whole archive.
 * @returns Map from entry name to its uncompressed bytes.
 * @throws Error when no End of Central Directory record is found, or when a
 *         deflated entry cannot be decompressed.
 */
async function extractZipEntries(
  buffer: ArrayBuffer
): Promise<Map<string, Uint8Array>> {
  const bytes = new Uint8Array(buffer);
  const view = new DataView(buffer);
  const entries = new Map<string, Uint8Array>();
  const nameDecoder = new TextDecoder();

  // Locate the End of Central Directory record (signature 0x06054b50) by
  // scanning backwards; it is at least 22 bytes long.
  let eocdOffset = -1;
  for (let i = bytes.length - 22; i >= 0; i--) {
    if (view.getUint32(i, true) === 0x06054b50) {
      eocdOffset = i;
      break;
    }
  }

  if (eocdOffset === -1) {
    throw new Error("Could not find End of Central Directory");
  }

  const cdOffset = view.getUint32(eocdOffset + 16, true);
  const cdEntries = view.getUint16(eocdOffset + 10, true);

  // Parse Central Directory entries to get accurate sizes and offsets
  interface CDEntry {
    fileName: string;
    compressedSize: number;
    uncompressedSize: number;
    localHeaderOffset: number;
    compressionMethod: number;
  }

  const cdList: CDEntry[] = [];
  let cdPos = cdOffset;

  for (let i = 0; i < cdEntries; i++) {
    // Stop at the first record that is not a Central Directory file header.
    if (view.getUint32(cdPos, true) !== 0x02014b50) break;

    const fileNameLen = view.getUint16(cdPos + 28, true);
    const extraLen = view.getUint16(cdPos + 30, true);
    const commentLen = view.getUint16(cdPos + 32, true);

    cdList.push({
      fileName: nameDecoder.decode(bytes.slice(cdPos + 46, cdPos + 46 + fileNameLen)),
      compressedSize: view.getUint32(cdPos + 20, true),
      uncompressedSize: view.getUint32(cdPos + 24, true),
      localHeaderOffset: view.getUint32(cdPos + 42, true),
      compressionMethod: view.getUint16(cdPos + 10, true),
    });

    cdPos += 46 + fileNameLen + extraLen + commentLen;
  }

  // Extract each entry: the local header only tells us where the data starts.
  for (const cd of cdList) {
    const lhOffset = cd.localHeaderOffset;
    const lhFileNameLen = view.getUint16(lhOffset + 26, true);
    const lhExtraLen = view.getUint16(lhOffset + 28, true);
    const dataStart = lhOffset + 30 + lhFileNameLen + lhExtraLen;

    let fileData: Uint8Array;

    if (cd.compressionMethod === 0) {
      // Stored
      fileData = bytes.slice(dataStart, dataStart + cd.uncompressedSize);
    } else if (cd.compressionMethod === 8) {
      // Deflate. The write side and the read side are awaited together: the
      // writer must not be awaited first (backpressure could stall it), and
      // awaiting both means a corrupt stream rejects exactly once instead of
      // leaving a floating rejected promise behind.
      const compressed = bytes.slice(dataStart, dataStart + cd.compressedSize);
      const ds = new DecompressionStream("deflate-raw");
      const feed = async () => {
        const writer = ds.writable.getWriter();
        await writer.write(compressed);
        await writer.close();
      };
      const [inflated] = await Promise.all([
        new Response(ds.readable).arrayBuffer(),
        feed(),
      ]);
      fileData = new Uint8Array(inflated);
    } else {
      console.warn(`Skipping ${cd.fileName}: unsupported compression ${cd.compressionMethod}`);
      continue;
    }

    entries.set(cd.fileName, fileData);
  }

  return entries;
}
187
+
188
/**
 * Download a `.npz` archive and decode every `.npy` entry into a voice table.
 *
 * The result is keyed by entry name without the `.npy` extension. Each table
 * is described as `[numStyles, styleDim]`:
 *   - a 2-D array keeps its own shape,
 *   - a 0-D/1-D array is treated as a single style vector,
 *   - higher-rank arrays keep the first axis and flatten the rest.
 *
 * @param url - Location of the `.npz` file.
 * @throws Error when the download fails or an entry cannot be decoded.
 */
export async function loadVoices(
  url: string
): Promise<Record<string, VoiceInfo>> {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`Failed to fetch voices: ${response.status}`);
  const arrayBuffer = await response.arrayBuffer();

  const entries = await extractZipEntries(arrayBuffer);
  const voices: Record<string, VoiceInfo> = {};

  for (const [fileName, fileData] of entries) {
    if (!fileName.endsWith(".npy")) continue;

    const voiceName = fileName.replace(/\.npy$/, "");
    const { data, shape } = npyToFloat32(fileData);

    let normalized: [number, number];
    if (shape.length < 2) {
      // A bare vector is one style, not a square table of its own length.
      normalized = [1, data.length];
    } else if (shape.length === 2) {
      normalized = [shape[0] || 1, shape[1] || data.length];
    } else {
      // e.g. (N, 1, D): keep N rows and fold the remaining axes together.
      const rest = shape.slice(1).reduce((total, dim) => total * dim, 1);
      normalized = [shape[0] || 1, rest || data.length];
    }

    voices[voiceName] = { data, shape: normalized };
  }

  return voices;
}
src/lib/preprocess.ts ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Text preprocessor — converts numbers, currencies, ordinals, etc. to words.
3
+ * Port of KittenTTS preprocess.py.
4
+ * https://github.com/KittenML/KittenTTS
5
+ */
6
+
7
+ // ── Number → Words ──
8
+
9
// Words for 0-19, indexed by value. Index 0 is empty because "zero" is only
// ever produced explicitly by the callers.
const ONES = [
  "", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
  "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
  "seventeen", "eighteen", "nineteen",
];
// Words for multiples of ten, indexed by the tens digit (0 and 1 are unused).
const TENS = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"];
// Name of each successive group of three digits, starting with the lowest.
const SCALE = ["", "thousand", "million", "billion", "trillion"];

// Cardinal words whose ordinal is not formed by simply appending "th".
const ORDINAL_EXCEPTIONS: Record<string, string> = {
  one: "first", two: "second", three: "third", four: "fourth",
  five: "fifth", six: "sixth", seven: "seventh", eight: "eighth",
  nine: "ninth", twelve: "twelfth",
};

// Singular spoken name for each currency symbol recognised by RE_CURRENCY.
const CURRENCY_SYMBOLS: Record<string, string> = {
  "$": "dollar", "€": "euro", "£": "pound", "¥": "yen",
  "₹": "rupee", "₩": "won", "₿": "bitcoin",
};
27
+
28
/**
 * Spell out one group of up to three digits (0-999).
 *
 * @param n - Value of the group.
 * @returns e.g. "three hundred forty-two"; the empty string for 0.
 */
function threeDigitsToWords(n: number): string {
  if (n === 0) return "";

  const words: string[] = [];

  const hundredsDigit = Math.floor(n / 100);
  if (hundredsDigit) words.push(`${ONES[hundredsDigit]} hundred`);

  const belowHundred = n % 100;
  if (belowHundred >= 20) {
    // Compound tens are hyphenated: "forty-two".
    const tens = TENS[Math.floor(belowHundred / 10)];
    const units = ONES[belowHundred % 10];
    words.push(units ? `${tens}-${units}` : tens);
  } else if (belowHundred) {
    words.push(ONES[belowHundred]);
  }

  return words.join(" ");
}
43
+
44
/**
 * Spell out an integer in English words.
 *
 * - Non-integers are truncated toward zero (matching Python's `int()`), so
 *   -0.5 reads as "zero" rather than "negative one".
 * - Round hundreds between 1100 and 1900 are read as "twelve hundred" etc.
 * - Values too large for the scale table keep their high-order part on the
 *   top scale word (1e15 → "one thousand trillion") instead of losing it.
 *
 * @param n - Number to convert.
 * @returns The spoken form; the empty string for NaN or Infinity.
 */
export function numberToWords(n: number): string {
  if (!Number.isInteger(n)) n = Math.trunc(n);
  if (n === 0) return "zero";
  if (n < 0) return `negative ${numberToWords(-n)}`;
  // NaN and Infinity have no digits to read.
  if (!Number.isFinite(n)) return "";

  if (n >= 100 && n <= 9999 && n % 100 === 0 && n % 1000 !== 0) {
    const hundreds = Math.floor(n / 100);
    if (hundreds < 20) return `${ONES[hundreds]} hundred`;
  }

  const parts: string[] = [];
  let remaining = n;
  for (let i = 0; i < SCALE.length && remaining > 0; i++) {
    // The last scale word absorbs everything that is left, however large.
    const isTopScale = i === SCALE.length - 1;
    const chunk = isTopScale ? remaining : remaining % 1000;
    if (chunk) {
      const w = chunk < 1000 ? threeDigitsToWords(chunk) : numberToWords(chunk);
      parts.push(SCALE[i] ? `${w} ${SCALE[i]}` : w);
    }
    remaining = isTopScale ? 0 : Math.floor(remaining / 1000);
  }
  return parts.reverse().join(" ");
}
65
+
66
/**
 * Spell out a decimal number, reading the fractional part digit by digit.
 *
 * @param value - Number, or its decimal text (text keeps trailing zeros).
 * @param sep - Word spoken for the decimal point.
 * @returns e.g. "three point one four"; without a "." the integer reading.
 */
export function floatToWords(value: string | number, sep = "point"): string {
  const raw = typeof value === "string" ? value : `${value}`;
  const isNegative = raw.startsWith("-");
  const unsigned = isNegative ? raw.slice(1) : raw;
  const sign = isNegative ? "negative " : "";

  if (!unsigned.includes(".")) {
    return sign + numberToWords(parseInt(unsigned, 10));
  }

  const [whole, fraction] = unsigned.split(".");
  // ".5" has no integer part and is read as "zero point five".
  const wholeWords = whole ? numberToWords(parseInt(whole, 10)) : "zero";

  const spokenDigits: string[] = [];
  for (const digit of fraction) {
    const v = parseInt(digit, 10);
    spokenDigits.push(v === 0 ? "zero" : ONES[v]);
  }

  return `${sign}${wholeWords} ${sep} ${spokenDigits.join(" ")}`;
}
82
+
83
/**
 * Spell out an integer as an ordinal ("twenty-first", "one hundredth").
 *
 * Only the final word of the cardinal reading is converted. It is found by
 * splitting at whichever separator (space or hyphen) occurs last, so
 * "twenty-one thousand five" becomes "twenty-one thousand fifth".
 *
 * @param n - Number to convert.
 */
function ordinalSuffix(n: number): string {
  const word = numberToWords(n);

  const cut = Math.max(word.lastIndexOf(" "), word.lastIndexOf("-"));
  const prefix = cut >= 0 ? word.slice(0, cut) : "";
  const joiner = cut >= 0 ? word[cut] : "";
  const last = cut >= 0 ? word.slice(cut + 1) : word;

  let lastOrd: string;
  if (ORDINAL_EXCEPTIONS[last]) {
    lastOrd = ORDINAL_EXCEPTIONS[last];
  } else if (last.endsWith("y")) {
    // twenty → twentieth, ninety → ninetieth
    lastOrd = last.slice(0, -1) + "ieth";
  } else if (last.endsWith("t")) {
    lastOrd = last + "h";
  } else if (last.endsWith("e")) {
    lastOrd = last.slice(0, -1) + "th";
  } else {
    lastOrd = last + "th";
  }
  return prefix ? `${prefix}${joiner}${lastOrd}` : lastOrd;
}
115
+
116
+ // ── Regex patterns ──
117
+
118
// Optionally negative run of digits/commas with an optional decimal part, not
// directly preceded by a letter.
const RE_NUMBER = /(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?/g;
// Digits followed by st/nd/rd/th, e.g. "21st".
const RE_ORDINAL = /\b(\d+)(st|nd|rd|th)\b/gi;
// Number followed by "%", e.g. "12.5 %".
const RE_PERCENT = /(-?[\d,]+(?:\.\d+)?)\s*%/g;
// Currency symbol, amount, and an optional K/M/B/T scale letter, e.g. "$3.5M".
const RE_CURRENCY = /([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])/g;
// Clock time h:mm or h:mm:ss with an optional am/pm.
const RE_TIME = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\b/gi;
// Two integers joined by a hyphen, e.g. "10-20".
const RE_RANGE = /(?<!\w)(\d+)-(\d+)(?!\w)/g;
// Name-hyphen-version, e.g. "gpt-4" or "v-1.5".
const RE_MODEL_VER = /\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)/g;
// Number followed by one of the unit abbreviations listed in UNIT_MAP.
const RE_UNIT = /(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b/gi;
// Number followed by an upper-case K/M/B/T scale letter, e.g. "7B".
const RE_SCALE = /(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])/g;
// Scientific notation, e.g. "1.5e-3".
const RE_SCI = /(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])/g;
// Integer / integer, e.g. "3/4".
const RE_FRACTION = /\b(\d+)\s*\/\s*(\d+)\b/g;
// Decade such as "80s" or "1990s"; the captured group excludes the final "0s".
const RE_DECADE = /\b(\d{1,3})0s\b/g;

// Spoken form of each unit abbreviation, keyed by the lower-cased abbreviation.
const UNIT_MAP: Record<string, string> = {
  km: "kilometers", kg: "kilograms", mg: "milligrams", ml: "milliliters",
  gb: "gigabytes", mb: "megabytes", kb: "kilobytes", tb: "terabytes",
  hz: "hertz", khz: "kilohertz", mhz: "megahertz", ghz: "gigahertz",
  mph: "miles per hour", kph: "kilometers per hour",
  ms: "milliseconds", ns: "nanoseconds", "µs": "microseconds",
  "°c": "degrees Celsius", "c°": "degrees Celsius",
  "°f": "degrees Fahrenheit", "f°": "degrees Fahrenheit",
};

// Spoken form of the K/M/B/T scale letters.
const SCALE_MAP: Record<string, string> = {
  K: "thousand", M: "million", B: "billion", T: "trillion",
};

// Plural decade name for each tens digit.
const DECADE_MAP: Record<number, string> = {
  0: "hundreds", 1: "tens", 2: "twenties", 3: "thirties", 4: "forties",
  5: "fifties", 6: "sixties", 7: "seventies", 8: "eighties", 9: "nineties",
};
149
+
150
+ // ── Expansion functions ──
151
+
152
/** Replace numeric ordinals such as "3rd" with their spoken form ("third"). */
function expandOrdinals(text: string): string {
  // Only the digits matter; the written suffix is discarded.
  const toSpoken = (_match: string, digits: string) => ordinalSuffix(parseInt(digits, 10));
  return text.replace(RE_ORDINAL, toSpoken);
}
155
+
156
/** Replace "12%" / "1.5 %" with "twelve percent" / "one point five percent". */
function expandPercentages(text: string): string {
  return text.replace(RE_PERCENT, (_match, amount: string) => {
    const digits = amount.replace(/,/g, "");
    if (digits.includes(".")) {
      // Goes through parseFloat, so trailing zeros are not read out.
      return `${floatToWords(parseFloat(digits))} percent`;
    }
    return `${numberToWords(parseInt(digits, 10))} percent`;
  });
}
163
+
164
/**
 * Replace currency amounts with their spoken form.
 *
 *   "$3.5M" → "three point five million dollars"
 *   "$1.50" → "one dollar and fifty cents"
 *   "€20"   → "twenty euros"
 *
 * The unit is pluralised unless the whole-number amount is exactly one.
 * Fractions are read as cents using the first two decimal digits.
 */
function expandCurrency(text: string): string {
  return text.replace(RE_CURRENCY, (_, symbol, raw, scaleSuffix) => {
    const clean = raw.replace(/,/g, "");
    const unit = CURRENCY_SYMBOLS[symbol] || "";
    // "one dollar", "two dollars"; nothing is appended when the unit is unknown.
    const unitFor = (amount: number) => (unit && amount !== 1 ? `${unit}s` : unit);

    if (scaleSuffix) {
      const scaleWord = SCALE_MAP[scaleSuffix];
      const num = clean.includes(".") ? floatToWords(clean) : numberToWords(parseInt(clean, 10));
      return `${num} ${scaleWord} ${unit}s`.trim();
    }

    if (clean.includes(".")) {
      const [intPart, decPart] = clean.split(".");
      const intVal = parseInt(intPart, 10);
      const decVal = parseInt(decPart.slice(0, 2).padEnd(2, "0"), 10);
      let result = `${numberToWords(intVal)} ${unitFor(intVal)}`;
      if (decVal) result += ` and ${numberToWords(decVal)} cent${decVal !== 1 ? "s" : ""}`;
      return result;
    }

    const val = parseInt(clean, 10);
    return `${numberToWords(val)} ${unitFor(val)}`;
  });
}
184
+
185
/**
 * Replace clock times with their spoken form; seconds are ignored.
 *
 *   "3:00 pm" → "three pm"      "14:00" → "fourteen hundred"
 *   "9:05"    → "nine oh five"  "9:30"  → "nine thirty"
 */
function expandTime(text: string): string {
  return text.replace(RE_TIME, (_match, hh: string, mm: string, _ss, meridiem?: string) => {
    const hourWords = numberToWords(parseInt(hh, 10));
    const minutes = parseInt(mm, 10);
    const tail = meridiem ? ` ${meridiem.toLowerCase()}` : "";

    if (minutes === 0) {
      // On the hour: "three pm" with a suffix, "fourteen hundred" without.
      return meridiem ? `${hourWords}${tail}` : `${hourWords} hundred${tail}`;
    }

    const minuteWords = numberToWords(minutes);
    return minutes < 10
      ? `${hourWords} oh ${minuteWords}${tail}`
      : `${hourWords} ${minuteWords}${tail}`;
  });
}
196
+
197
/** Replace numeric ranges such as "10-20" with "ten to twenty". */
function expandRanges(text: string): string {
  const spell = (digits: string) => numberToWords(parseInt(digits, 10));
  return text.replace(RE_RANGE, (_match, from: string, to: string) => `${spell(from)} to ${spell(to)}`);
}
202
+
203
/**
 * Turn the hyphen in names like "gpt-4" into a space so the version number is
 * later read as a number rather than as a negative or a range.
 */
function expandModelNames(text: string): string {
  return text.replace(RE_MODEL_VER, "$1 $2");
}
206
+
207
/** Replace "5km" / "2.5 GHz" with "five kilometers" / "two point five gigahertz". */
function expandUnits(text: string): string {
  return text.replace(RE_UNIT, (_match, amount: string, symbol: string) => {
    // An abbreviation missing from the table is kept as written.
    const spokenUnit = UNIT_MAP[symbol.toLowerCase()] || symbol;
    let spokenAmount: string;
    if (amount.includes(".")) {
      spokenAmount = floatToWords(parseFloat(amount));
    } else {
      spokenAmount = numberToWords(parseInt(amount, 10));
    }
    return `${spokenAmount} ${spokenUnit}`;
  });
}
214
+
215
/** Replace "7B" / "1.5M" with "seven billion" / "one point five million". */
function expandScaleSuffixes(text: string): string {
  return text.replace(RE_SCALE, (_match, amount: string, letter: string) => {
    const spokenAmount = amount.includes(".")
      ? floatToWords(amount)
      : numberToWords(parseInt(amount, 10));
    const spokenScale = SCALE_MAP[letter] || letter;
    return `${spokenAmount} ${spokenScale}`;
  });
}
222
+
223
/** Replace "1.5e-3" with "one point five times ten to the negative three". */
function expandScientific(text: string): string {
  return text.replace(RE_SCI, (_match, mantissa: string, exponent: string) => {
    const mantissaWords = mantissa.includes(".")
      ? floatToWords(mantissa)
      : numberToWords(parseInt(mantissa, 10));
    const power = parseInt(exponent, 10);
    const powerWords = numberToWords(Math.abs(power));
    if (power < 0) {
      return `${mantissaWords} times ten to the negative ${powerWords}`;
    }
    return `${mantissaWords} times ten to the ${powerWords}`;
  });
}
231
+
232
/**
 * Replace simple fractions with their spoken form:
 * "1/2" → "one half", "3/4" → "three quarters", "2/3" → "two thirds".
 * A zero denominator is left as written.
 */
function expandFractions(text: string): string {
  return text.replace(RE_FRACTION, (whole, numer, denom) => {
    const top = parseInt(numer, 10);
    const bottom = parseInt(denom, 10);
    if (bottom === 0) return whole;

    const isPlural = top !== 1;
    let bottomWord: string;
    switch (bottom) {
      case 2:
        bottomWord = isPlural ? "halves" : "half";
        break;
      case 4:
        bottomWord = isPlural ? "quarters" : "quarter";
        break;
      default:
        // Every other denominator is its ordinal, pluralised with "s".
        bottomWord = ordinalSuffix(bottom) + (isPlural ? "s" : "");
    }
    return `${numberToWords(top)} ${bottomWord}`;
  });
}
248
+
249
/** Replace decades: "80s" → "eighties", "1990s" → "nineteen nineties". */
function expandDecades(text: string): string {
  return text.replace(RE_DECADE, (_match, stem: string) => {
    // `stem` is the decade without its trailing "0s", e.g. "199" for "1990s".
    const value = parseInt(stem, 10);
    const decadeName = DECADE_MAP[value % 10] || "";
    if (value < 10) return decadeName;
    const centuryWords = numberToWords(Math.floor(value / 10));
    return `${centuryWords} ${decadeName}`;
  });
}
258
+
259
/**
 * Replace every remaining number with its spoken form.
 *
 * RE_NUMBER treats commas as part of a number, so it also matches lone commas
 * and commas that merely sit next to a number ("1, 2"). Those are punctuation
 * that carries prosody, so they are kept: only commas inside the digits are
 * removed as thousands separators.
 */
function replaceNumbers(text: string): string {
  return text.replace(RE_NUMBER, (m) => {
    // A match without any digit (",", "-,") is not a number at all.
    if (!/\d/.test(m)) return m;

    const lead = /^,+/.exec(m)?.[0] ?? "";
    const trail = /,+$/.exec(m)?.[0] ?? "";
    const clean = m.slice(lead.length, m.length - trail.length).replace(/,/g, "");

    const words = clean.includes(".")
      ? floatToWords(clean)
      : numberToWords(parseInt(clean, 10));
    return `${lead}${words}${trail}`;
  });
}
266
+
267
/**
 * Give decimals written without an integer part a leading zero, so that
 * ".5" becomes "0.5" and "-.5" becomes "-0.5".
 */
function normalizeLeadingDecimals(text: string): string {
  // Negative form first, so the minus sign stays in front of the inserted zero.
  const withNegatives = text.replace(
    /(?<!\d)(-)\.([\d])/g,
    (_match, sign: string, digit: string) => `${sign}0.${digit}`
  );
  return withNegatives.replace(
    /(?<!\d)\.([\d])/g,
    (_match, digit: string) => `0.${digit}`
  );
}
272
+
273
// http(s):// or www. followed by any run of non-whitespace.
const RE_URL = /https?:\/\/\S+|www\.\S+/g;
// name@host.tld
const RE_EMAIL = /\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b/gi;
// Anything between "<" and the next ">".
const RE_HTML = /<[^>]+>/g;
// Every character that is not a word character, whitespace, or one of
// . , ? ! ; : - — (U+2014) – (U+2013) … (U+2026).
const RE_PUNCT = /[^\w\s.,?!;:\-\u2014\u2013\u2026]/g;
// Any run of whitespace.
const RE_SPACES = /\s+/g;
278
+
279
/**
 * Normalise free text for speech synthesis.
 *
 * Steps, in order: drop URLs and e-mail addresses, replace HTML tags with a
 * space, add leading zeros to bare decimals, expand every special numeric
 * form into words, replace remaining non-prosodic punctuation with spaces,
 * lower-case, and collapse whitespace.
 *
 * @param text - Raw input text.
 * @returns Lower-cased text with numbers spelled out.
 */
export function preprocessText(text: string): string {
  const stripped = text
    .replace(RE_URL, "")
    .replace(RE_EMAIL, "")
    .replace(RE_HTML, " ");

  // Order matters: specific forms must be expanded before the generic number
  // replacement at the end gets to see their digits.
  const stages: Array<(input: string) => string> = [
    normalizeLeadingDecimals,
    expandCurrency,
    expandPercentages,
    expandScientific,
    expandTime,
    expandOrdinals,
    expandUnits,
    expandScaleSuffixes,
    expandFractions,
    expandDecades,
    expandRanges,
    expandModelNames,
    replaceNumbers,
  ];
  const expanded = stages.reduce((current, stage) => stage(current), stripped);

  return expanded
    .replace(RE_PUNCT, " ")
    .toLowerCase()
    .replace(RE_SPACES, " ")
    .trim();
}
src/lib/text-cleaner.ts ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * TextCleaner — maps IPA phoneme characters to integer token IDs.
3
+ * Direct port of KittenTTS Python TextCleaner class.
4
+ * https://github.com/KittenML/KittenTTS
5
+ */
6
+
7
// Symbol table: a token ID is the position of a character in `symbols`, so the
// order of these strings must not change.
const _pad = "$";
// NOTE(review): the two adjacent straight quotes after "»" may originally have
// been typographic quotes (U+201C / U+201D) — confirm against the upstream
// Python table. As written, the duplicated '"' resolves to its last position.
const _punctuation = ';:,.!?¡¿—…"«»"" ';
const _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
const _letters_ipa =
  "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ";

// Spreading a string iterates by code point, so each combining mark gets its
// own slot.
const symbols = [_pad, ..._punctuation, ..._letters, ..._letters_ipa];

// Reverse lookup: character → index. When a character occurs more than once
// in `symbols`, the later index overwrites the earlier one.
const charToIndex: Record<string, number> = {};
for (let i = 0; i < symbols.length; i++) {
  charToIndex[symbols[i]] = i;
}
19
+
20
/**
 * Convert a phoneme string into token IDs.
 *
 * The text is read one code point at a time; characters that are not in the
 * symbol table are silently dropped.
 */
export function cleanText(text: string): number[] {
  return Array.from(text, (symbol) => charToIndex[symbol]).filter(
    (id) => id !== undefined
  );
}
30
+
31
/**
 * Convert a phoneme string into the model's input sequence: the phoneme IDs
 * wrapped as `0, …ids, 10, 0`. ID 0 is the pad symbol "$" and ID 10 is "…",
 * per the symbol table order; the framing follows the Python reference.
 */
export function tokenize(phonemes: string): number[] {
  return [0, ...cleanText(phonemes), 10, 0];
}
src/main.tsx ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import { StrictMode } from "react";
2
+ import { createRoot } from "react-dom/client";
3
+ import "./index.css";
4
+ import App from "./App";
5
+
6
// Mount the app into the element with id "root". The non-null assertion
// assumes the host page provides that element — TODO confirm in index.html.
createRoot(document.getElementById("root")!).render(
  <StrictMode>
    <App />
  </StrictMode>
);
src/vite-env.d.ts ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ /// <reference types="vite/client" />
2
+
3
// Ambient typing for imports whose specifier ends in "?worker": the default
// export is declared as a zero-argument constructor that produces a Worker.
declare module "*?worker" {
  const workerConstructor: {
    new (): Worker;
  };
  export default workerConstructor;
}
src/worker.ts ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Web Worker — KittenTTS inference via ONNX Runtime Web (WebGPU/WASM).
3
+ *
4
+ * Models: https://huggingface.co/KittenML
5
+ * Phonemizer: https://github.com/xenova/phonemizer.js (Xenova)
6
+ * ONNX Runtime Web: https://onnxruntime.ai
7
+ */
8
+
9
+ import { tokenize } from "./lib/text-cleaner";
10
+ import { loadVoices, type VoiceInfo } from "./lib/npz-reader";
11
+
12
// Dynamic imports — resolved at runtime to avoid Vite dev server transform issues
// Both are assigned by loadModel() and must not be used before it has run.
let phonemize: (text: string, lang: string) => Promise<string[]>;
let ort: any;

const HF_BASE = "https://huggingface.co";
// Sample rate reported alongside every generated audio buffer.
const SAMPLE_RATE = 24000;

// Int8 quantized models produce NaN on WebGPU; all fp32 models should be fine
// A model whose name contains any of these substrings is run on WASM instead.
const WEBGPU_BLOCKED_PATTERNS = ["int8"];

// Shape of the repository's config.json as this worker reads it.
interface ModelConfig {
  name: string;
  version: string;
  type: string;
  model: string;
  // Repository-relative path of the ONNX model file.
  model_file: string;
  // Repository-relative path of the voices archive.
  voices: string;
  // Per-voice multiplier applied to the requested speed, keyed by voice ID.
  speed_priors: Record<string, number>;
  // Display name → voice ID, where a voice ID is a key of the voices archive.
  voice_aliases: Record<string, string>;
}

// Worker state, filled in by loadModel().
let session: any = null;
let voices: Record<string, VoiceInfo> = {};
let config: ModelConfig | null = null;
let currentDevice: "webgpu" | "wasm" = "wasm";
37
+
38
/** Build the download URL of `filename` on the `main` revision of a Hub repo. */
function resolveUrl(repoId: string, filename: string): string {
  return [HF_BASE, repoId, "resolve", "main", filename].join("/");
}
41
+
42
/**
 * Report whether a WebGPU adapter can be obtained in this context.
 * Any failure along the way (missing API, rejected request) counts as "no".
 */
async function detectWebGPU(): Promise<boolean> {
  try {
    const gpu = "gpu" in navigator ? (navigator as any).gpu : null;
    if (!gpu) return false;
    const adapter = await gpu.requestAdapter();
    return Boolean(adapter);
  } catch {
    return false;
  }
}
51
+
52
/**
 * Load everything needed for synthesis from a Hub repository and announce
 * readiness to the main thread.
 *
 * Steps: detect WebGPU, import the runtime libraries, fetch `config.json`,
 * choose the execution device, download the model (with progress messages)
 * and the voices in parallel, then create the ONNX inference session.
 *
 * Messages posted: `status` (progress text), `device`, and finally `ready`
 * with the selectable voice names.
 *
 * @param repoId - Hub repository, e.g. "owner/name".
 * @throws Error when the config, model or voices cannot be downloaded, or the
 *         session cannot be created.
 */
async function loadModel(repoId: string) {
  self.postMessage({ type: "status", message: "Detecting hardware..." });

  const hasWebGPU = await detectWebGPU();

  // Load runtime dependencies
  self.postMessage({ type: "status", message: "Loading runtime..." });
  const [ortModule, phonemizerModule] = await Promise.all([
    import("onnxruntime-web"),
    import("phonemizer"),
  ]);
  ort = ortModule;
  phonemize = phonemizerModule.phonemize;

  // Load config
  self.postMessage({ type: "status", message: "Loading config..." });
  const configUrl = resolveUrl(repoId, "config.json");
  const configResp = await fetch(configUrl);
  // Without this check a 404 page surfaces as an obscure JSON parse error.
  if (!configResp.ok) throw new Error(`Failed to fetch config: ${configResp.status}`);
  const loadedConfig = (await configResp.json()) as ModelConfig;
  config = loadedConfig;

  // Int8 quantized models produce NaN on WebGPU — only block those
  const modelName = loadedConfig.model || repoId.split("/").pop() || "";
  const isBlocked = WEBGPU_BLOCKED_PATTERNS.some((p) => modelName.includes(p));
  currentDevice = hasWebGPU && !isBlocked ? "webgpu" : "wasm";

  if (hasWebGPU && isBlocked) {
    console.log(`[KittenTTS] Using WASM for "${modelName}" (int8 models produce NaN on WebGPU)`);
  }

  self.postMessage({ type: "device", device: currentDevice });

  // Load voices (.npz) and ONNX model in parallel
  self.postMessage({ type: "status", message: "Downloading model & voices..." });

  const modelUrl = resolveUrl(repoId, loadedConfig.model_file);

  const modelPromise = (async () => {
    const resp = await fetch(modelUrl);
    if (!resp.ok) throw new Error(`Failed to fetch model: ${resp.status}`);

    // No streaming body available: download in one piece, without progress.
    if (!resp.body) return await resp.arrayBuffer();

    const contentLength = parseInt(resp.headers.get("content-length") || "0", 10);
    const reader = resp.body.getReader();
    const chunks: Uint8Array[] = [];
    let loaded = 0;

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      chunks.push(value);
      loaded += value.length;
      // A percentage can only be shown when the server reports a length.
      if (contentLength > 0) {
        const pct = Math.round((loaded / contentLength) * 100);
        const mb = (loaded / 1024 / 1024).toFixed(1);
        self.postMessage({
          type: "status",
          message: `Downloading model... ${pct}% (${mb} MB)`,
        });
      }
    }

    const modelData = new Uint8Array(loaded);
    let offset = 0;
    for (const chunk of chunks) {
      modelData.set(chunk, offset);
      offset += chunk.length;
    }
    return modelData.buffer;
  })();

  const voicesUrl = resolveUrl(repoId, loadedConfig.voices);
  const voicesPromise = loadVoices(voicesUrl);

  const [modelBuffer, loadedVoices] = await Promise.all([modelPromise, voicesPromise]);
  voices = loadedVoices;

  // Create ONNX inference session
  self.postMessage({
    type: "status",
    message: `Initializing ${currentDevice.toUpperCase()} session...`,
  });

  const sessionOptions: any = {
    executionProviders: currentDevice === "webgpu" ? ["webgpu"] : ["wasm"],
  };

  if (currentDevice === "wasm") {
    ort.env.wasm.numThreads = 1;
  }

  session = await ort.InferenceSession.create(modelBuffer, sessionOptions);

  // Offer the friendly alias names when the config defines them.
  const voiceNames = loadedConfig.voice_aliases
    ? Object.keys(loadedConfig.voice_aliases)
    : Object.keys(voices);

  self.postMessage({
    type: "ready",
    voices: voiceNames,
    device: currentDevice,
    modelName: loadedConfig.name,
  });
}
154
+
155
/**
 * Trim the text and make sure it ends with one of . ! ? , ; :
 * A comma is appended when it does not. Blank input yields the empty string.
 */
function ensurePunctuation(text: string): string {
  const trimmed = text.trim();
  if (trimmed === "") return trimmed;

  const lastChar = trimmed[trimmed.length - 1];
  return ".!?,;:".includes(lastChar) ? trimmed : `${trimmed},`;
}
163
+
164
/**
 * Split text into pieces of at most `maxLen` characters for synthesis.
 *
 * The text is first cut after each run of . ! ? — a sentence that fits is
 * emitted whole. A longer sentence is re-packed word by word into pieces
 * that fit; a single word longer than `maxLen` is emitted as is. Every piece
 * is passed through ensurePunctuation().
 */
function chunkText(text: string, maxLen = 400): string[] {
  // Split on sentence boundaries but keep the punctuation
  const sentences = text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [text];
  const pieces: string[] = [];

  for (const rawSentence of sentences) {
    const sentence = rawSentence.trim();
    if (!sentence) continue;

    if (sentence.length <= maxLen) {
      pieces.push(ensurePunctuation(sentence));
      continue;
    }

    // Too long: accumulate words until the next one would overflow.
    let line = "";
    for (const word of sentence.split(/\s+/)) {
      if (line.length + word.length + 1 <= maxLen) {
        line = line ? `${line} ${word}` : word;
      } else {
        if (line) pieces.push(ensurePunctuation(line));
        line = word;
      }
    }
    if (line) pieces.push(ensurePunctuation(line));
  }

  return pieces;
}
189
+
190
/**
 * Split text into word tokens and single punctuation tokens, dropping
 * whitespace. A word is a run of Unicode letters, digits or "_".
 */
function basicTokenize(text: string): string[] {
  // Python's \w matches Unicode word chars (including IPA symbols).
  // JS \w only matches [a-zA-Z0-9_], so we use the Unicode-aware flag.
  const tokenPattern = /[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s]/gu;
  return Array.from(text.matchAll(tokenPattern), (found) => found[0]);
}
195
+
196
/**
 * Synthesize audio for one chunk of text.
 *
 * @param text - Text of the chunk.
 * @param voiceKey - Voice alias or voice ID.
 * @param speed - Requested speed; multiplied by the voice's speed prior when
 *                the config defines one.
 * @returns Audio samples with the last 5000 samples removed.
 * @throws Error when the model is not loaded or the voice is unknown.
 */
async function generateChunk(
  text: string,
  voiceKey: string,
  speed: number
): Promise<Float32Array> {
  if (!session || !config) throw new Error("Model not loaded");

  // Aliases map display names to the IDs used as keys of the voices archive.
  const voiceId = config.voice_aliases?.[voiceKey] || voiceKey;

  const voiceData = voices[voiceId];
  if (!voiceData) throw new Error(`Voice "${voiceKey}" not found`);

  if (config.speed_priors?.[voiceId]) {
    speed = speed * config.speed_priors[voiceId];
  }

  // Phonemize text using espeak-ng WASM
  // phonemize() resolves to an array of strings — presumably one per line or
  // sentence of output (TODO confirm against phonemizer.js). All entries are
  // joined so that no part of the chunk is dropped.
  const phonemesList = await phonemize(text, "en-us");
  const phonemesRaw = phonemesList.join(" ");
  const phonemeTokens = basicTokenize(phonemesRaw);
  const phonemesJoined = phonemeTokens.join(" ");
  const inputIds = tokenize(phonemesJoined);

  // Select voice style reference based on text length (matches Python logic)
  const refId = Math.min(text.length, voiceData.shape[0] - 1);
  const styleDim = voiceData.shape[1];
  const refStyle = voiceData.data.slice(refId * styleDim, (refId + 1) * styleDim);

  // Create ONNX tensors
  const inputIdsTensor = new ort.Tensor(
    "int64",
    BigInt64Array.from(inputIds.map(BigInt)),
    [1, inputIds.length]
  );
  const styleTensor = new ort.Tensor("float32", refStyle, [1, styleDim]);
  const speedTensor = new ort.Tensor("float32", new Float32Array([speed]), [1]);

  // Run inference
  const results = await session.run({
    input_ids: inputIdsTensor,
    style: styleTensor,
    speed: speedTensor,
  });

  // Get output audio
  const outputKey = session.outputNames[0];
  const audioData = results[outputKey].data as Float32Array;

  // Trim trailing silence (matching Python: audio[..., :-5000])
  return audioData.slice(0, Math.max(0, audioData.length - 5000));
}
250
+
251
/**
 * Synthesize a whole text and post the result to the main thread.
 *
 * The text is split into chunks that are synthesized one after another; a
 * `progress` message precedes each chunk. The concatenated samples are sent
 * as a single transferred `audio` message. Any failure is reported as an
 * `error` message instead of being thrown.
 */
async function generate(text: string, voice: string, speed: number) {
  try {
    const pieces = chunkText(text);
    const plural = pieces.length > 1 ? "s" : "";

    self.postMessage({
      type: "status",
      message: `Generating (${pieces.length} chunk${plural})...`,
    });

    const rendered: Float32Array[] = [];
    let totalSamples = 0;
    for (const [index, piece] of pieces.entries()) {
      self.postMessage({
        type: "progress",
        current: index + 1,
        total: pieces.length,
      });
      const samples = await generateChunk(piece, voice, speed);
      rendered.push(samples);
      totalSamples += samples.length;
    }

    // Stitch the chunks together back to back.
    const fullAudio = new Float32Array(totalSamples);
    let writeAt = 0;
    for (const samples of rendered) {
      fullAudio.set(samples, writeAt);
      writeAt += samples.length;
    }

    // Transfer the buffer rather than copying it across the thread boundary.
    const { buffer } = fullAudio;
    self.postMessage(
      {
        type: "audio",
        audio: buffer,
        sampleRate: SAMPLE_RATE,
      },
      { transfer: [buffer] }
    );
  } catch (err: any) {
    self.postMessage({ type: "error", error: err.message || String(err) });
  }
}
291
+
292
// Message handler
// Incoming messages carry an `action` plus its arguments:
//   { action: "load", repoId }                  → loadModel
//   { action: "generate", text, voice, speed }  → generate
// Any other action is ignored.
self.addEventListener("message", async (e) => {
  const { action, ...data } = e.data;
  switch (action) {
    case "load":
      // loadModel throws on failure; report it to the main thread.
      try {
        await loadModel(data.repoId);
      } catch (err: any) {
        console.error("[KittenTTS Worker] Load error:", err);
        self.postMessage({ type: "error", error: err.message || String(err) });
      }
      break;
    case "generate":
      // generate() catches its own errors and posts them as "error" messages.
      await generate(data.text, data.voice, data.speed);
      break;
  }
});

// Forward uncaught errors to the main thread as "error" messages.
self.addEventListener("error", (e) => {
  self.postMessage({ type: "error", error: e.message || "Unknown worker error" });
});

// Forward unhandled promise rejections the same way.
self.addEventListener("unhandledrejection", (e: any) => {
  self.postMessage({ type: "error", error: e.reason?.message || String(e.reason) });
});
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tsconfig.app.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
4
+ "target": "ES2023",
5
+ "useDefineForClassFields": true,
6
+ "lib": ["ES2023", "DOM", "DOM.Iterable"],
7
+ "module": "ESNext",
8
+ "types": ["vite/client"],
9
+ "skipLibCheck": true,
10
+
11
+ /* Bundler mode */
12
+ "moduleResolution": "bundler",
13
+ "allowImportingTsExtensions": true,
14
+ "verbatimModuleSyntax": true,
15
+ "moduleDetection": "force",
16
+ "noEmit": true,
17
+ "jsx": "react-jsx",
18
+
19
+ /* Linting */
20
+ "strict": true,
21
+ "noUnusedLocals": true,
22
+ "noUnusedParameters": true,
23
+ "erasableSyntaxOnly": true,
24
+ "noFallthroughCasesInSwitch": true,
25
+ "noUncheckedSideEffectImports": true
26
+ },
27
+ "include": ["src"]
28
+ }
tsconfig.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "files": [],
3
+ "references": [
4
+ { "path": "./tsconfig.app.json" },
5
+ { "path": "./tsconfig.node.json" }
6
+ ]
7
+ }
tsconfig.node.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
4
+ "target": "ES2023",
5
+ "lib": ["ES2023"],
6
+ "module": "ESNext",
7
+ "types": ["node"],
8
+ "skipLibCheck": true,
9
+
10
+ /* Bundler mode */
11
+ "moduleResolution": "bundler",
12
+ "allowImportingTsExtensions": true,
13
+ "verbatimModuleSyntax": true,
14
+ "moduleDetection": "force",
15
+ "noEmit": true,
16
+
17
+ /* Linting */
18
+ "strict": true,
19
+ "noUnusedLocals": true,
20
+ "noUnusedParameters": true,
21
+ "erasableSyntaxOnly": true,
22
+ "noFallthroughCasesInSwitch": true,
23
+ "noUncheckedSideEffectImports": true
24
+ },
25
+ "include": ["vite.config.ts"]
26
+ }
vite.config.ts ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import { defineConfig } from "vite";
2
+ import react from "@vitejs/plugin-react";
3
+
4
// Vite configuration: enables the React plugin and sets the worker bundle
// output format to ES modules.
export default defineConfig({
  plugins: [react()],
  worker: { format: "es" },
});