Spaces:
Running
Running
feat: KittenTTS WebGPU browser demo
Browse files
Browser TTS using KittenML models with ONNX Runtime Web (WebGPU/WASM).
Phonemization via xenova/phonemizer.js. No server needed.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
- README.md +31 -4
- assets/index-CcptzfjJ.js +0 -0
- assets/index-Ck4Qq8gn.css +1 -0
- assets/ort-wasm-simd-threaded.jsep-Bzonanhp.wasm +3 -0
- assets/ort.bundle.min-DL658BJE.js +0 -0
- assets/phonemizer-BgK0uh4o.js +0 -0
- assets/worker-Br1yesTn.js +1 -0
- index.html +12 -17
- package.json +34 -0
- src/App.tsx +387 -0
- src/index.css +340 -0
- src/lib/npz-reader.ts +214 -0
- src/lib/preprocess.ts +310 -0
- src/lib/text-cleaner.ts +38 -0
- src/main.tsx +10 -0
- src/vite-env.d.ts +8 -0
- src/worker.ts +316 -0
- style.css +0 -28
- tsconfig.app.json +28 -0
- tsconfig.json +7 -0
- tsconfig.node.json +26 -0
- vite.config.ts +9 -0
README.md
CHANGED
|
@@ -1,10 +1,37 @@
|
|
| 1 |
---
|
| 2 |
title: KittenTTS WebGPU
|
| 3 |
-
emoji: 🔥
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: KittenTTS WebGPU
|
| 3 |
+
emoji: 🐱🔥
|
| 4 |
+
colorFrom: purple
|
| 5 |
+
colorTo: indigo
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
| 8 |
+
license: apache-2.0
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# KittenTTS WebGPU
|
| 12 |
+
|
| 13 |
+
Text-to-speech running entirely in your browser via WebGPU/WASM. No server needed.
|
| 14 |
+
|
| 15 |
+
## Credits
|
| 16 |
+
|
| 17 |
+
- **Models**: [KittenML](https://huggingface.co/KittenML) — ultra-lightweight TTS models based on StyleTTS 2
|
| 18 |
+
- **Original Demo**: [KittenTTS-Demo](https://huggingface.co/spaces/KittenML/KittenTTS-Demo)
|
| 19 |
+
- **Transformers.js v4**: [huggingface/transformers.js](https://github.com/huggingface/transformers.js) — ML inference in the browser
|
| 20 |
+
- **phonemizer.js**: [xenova/phonemizer.js](https://github.com/xenova/phonemizer.js) — eSpeak-NG phonemization for the web by [Xenova](https://github.com/xenova)
|
| 21 |
+
- **ONNX Runtime Web**: [onnxruntime](https://onnxruntime.ai) — cross-platform ML inference
|
| 22 |
+
- **Kokoro Web**: [xenova/kokoro-web](https://github.com/xenova/kokoro-web) — reference implementation for browser TTS
|
| 23 |
+
|
| 24 |
+
## Development
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
npm install
|
| 28 |
+
npm run dev
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## Build
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
npm run build
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
Output goes to `dist/` — deploy as a static HuggingFace Space or any static host.
|
assets/index-CcptzfjJ.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/index-Ck4Qq8gn.css
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*,:before,:after{box-sizing:border-box;margin:0;padding:0}:root{--bg:#111;--surface:#1a1a1a;--surface-2:#222;--border:#333;--text:#e5e5e5;--text-muted:#888;--accent:#c084fc;--accent-dim:#c084fc26;--radius:8px;--font:system-ui, -apple-system, "Segoe UI", sans-serif;--mono:ui-monospace, "SF Mono", Consolas, monospace}html{--lightningcss-light: ;--lightningcss-dark:initial;color-scheme:dark}body{font-family:var(--font);background:var(--bg);color:var(--text);-webkit-font-smoothing:antialiased;line-height:1.5}.container{flex-direction:column;max-width:720px;min-height:100vh;margin:0 auto;padding:2rem 1.5rem;display:flex}header{text-align:center;margin-bottom:2rem;position:relative}header h1{letter-spacing:-.02em;margin-bottom:.25rem;font-size:1.75rem;font-weight:600}.logo{font-size:1.5rem}.subtitle{color:var(--text-muted);font-size:.9rem}.badge{font-size:.7rem;font-weight:600;font-family:var(--mono);letter-spacing:.05em;border-radius:999px;margin-top:.5rem;padding:.15rem .6rem;display:inline-block}.badge-gpu{color:#4ade80;background:#4ade8026;border:1px solid #4ade804d}.badge-wasm{color:#fbbf24;background:#fbbf2426;border:1px solid #fbbf244d}main{flex-direction:column;flex:1;gap:1.5rem;display:flex}label{color:var(--text-muted);text-transform:uppercase;letter-spacing:.05em;margin-bottom:.4rem;font-size:.8rem;font-weight:500;display:block}textarea{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);width:100%;color:var(--text);font-family:var(--font);resize:vertical;outline:none;padding:.75rem;font-size:.95rem;transition:border-color .15s}textarea:focus{border-color:var(--accent)}textarea::placeholder{color:#555}.controls-row{grid-template-columns:1fr 1fr;gap:.75rem;margin-top:.75rem;display:grid}select{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);width:100%;color:var(--text);font-family:var(--font);cursor:pointer;appearance:none;background-image:url("data:image/svg+xml,%3Csvg width='10' height='6' 
viewBox='0 0 10 6' fill='none' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1 1l4 4 4-4' stroke='%23888' stroke-width='1.5' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");background-position:right .75rem center;background-repeat:no-repeat;outline:none;padding:.5rem 2rem .5rem .75rem;font-size:.9rem}select:focus{border-color:var(--accent)}select:disabled{opacity:.5;cursor:not-allowed}.speed-row{margin-top:.75rem}.speed-row label{font-family:var(--mono);font-size:.75rem}input[type=range]{width:100%;accent-color:var(--accent);cursor:pointer;height:4px}.generate-btn{background:var(--accent);color:#111;border-radius:var(--radius);width:100%;font-family:var(--font);cursor:pointer;border:none;margin-top:1rem;padding:.7rem 1.5rem;font-size:.9rem;font-weight:600;transition:opacity .15s}.generate-btn:hover:not(:disabled){opacity:.9}.generate-btn:disabled{opacity:.4;cursor:not-allowed}.output-section{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1rem}.audio-result{flex-direction:column;gap:.5rem;display:flex}.audio-player{border-radius:var(--radius);width:100%}.duration{color:var(--text-muted);font-size:.75rem;font-family:var(--mono)}.audio-placeholder{text-align:center;color:#555;padding:2rem 1rem;font-size:.85rem}.examples-grid{flex-direction:column;gap:.5rem;display:flex}.example-btn{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);width:100%;color:var(--text);font-family:var(--font);cursor:pointer;text-align:left;align-items:baseline;gap:.75rem;padding:.6rem .75rem;font-size:.85rem;transition:border-color .15s;display:flex}.example-btn:hover:not(:disabled){border-color:var(--accent)}.example-btn:disabled{opacity:.4;cursor:not-allowed}.example-voice{font-family:var(--mono);color:var(--accent);background:var(--accent-dim);border-radius:4px;flex-shrink:0;padding:.1rem 
.4rem;font-size:.75rem;font-weight:600}.example-text{color:var(--text-muted)}.error-msg{color:#f87171;border-radius:var(--radius);background:#ef44441a;border:1px solid #ef44444d;padding:.75rem 1rem;font-size:.85rem}footer{border-top:1px solid var(--border);text-align:center;color:var(--text-muted);flex-direction:column;gap:.25rem;margin-top:2.5rem;padding-top:1.5rem;font-size:.75rem;display:flex}footer a{color:var(--text-muted);text-underline-offset:2px;text-decoration:underline;transition:color .15s}footer a:hover{color:var(--text)}@media (width<=480px){.container{padding:1.25rem 1rem}.controls-row{grid-template-columns:1fr}header h1{font-size:1.5rem}}
|
assets/ort-wasm-simd-threaded.jsep-Bzonanhp.wasm
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66dd6edabc43c9ec1df860978baa403c6610de2f3b3bbfdfcfcbbfadf7677132
|
| 3 |
+
size 25096522
|
assets/ort.bundle.min-DL658BJE.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/phonemizer-BgK0uh4o.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/worker-Br1yesTn.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
/**
 * KittenTTS inference worker (readable form of the bundled worker script).
 *
 * Message protocol (unchanged):
 *   in:  { action: "load", repoId }
 *        { action: "generate", text, voice, speed }
 *   out: { type: "status", message }
 *        { type: "device", device }
 *        { type: "ready", voices, device, modelName }
 *        { type: "progress", current, total }
 *        { type: "audio", audio: ArrayBuffer, sampleRate }
 *        { type: "error", error }
 */

// ---------------------------------------------------------------------------
// Text -> token ids
// ---------------------------------------------------------------------------

/**
 * Symbol table: a symbol's position in this list is its token id.
 * The strings must not be edited or reordered; a few symbols occur more than
 * once, in which case the LAST occurrence provides the id.
 */
const SYMBOLS = [
  `$`,
  ...`;:,.!?¡¿—…"«»"" `,
  ...`ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz`,
  ...`ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ`,
];

/** symbol -> token id (later duplicates overwrite earlier ones). */
const SYMBOL_TO_ID = new Map(SYMBOLS.map((symbol, index) => [symbol, index]));

/**
 * Map every known symbol of `text` to its id; unknown symbols are dropped.
 * @param {string} text
 * @returns {number[]}
 */
function symbolsToIds(text) {
  const ids = [];
  for (const symbol of text) {
    const id = SYMBOL_TO_ID.get(symbol);
    if (id !== undefined) ids.push(id);
  }
  return ids;
}

/**
 * Build the model input sequence: 0, <symbol ids>, 10, 0.
 * @param {string} phonemes
 * @returns {number[]}
 */
function encodeForModel(phonemes) {
  return [0, ...symbolsToIds(phonemes), 10, 0];
}

// ---------------------------------------------------------------------------
// .npy / .npz reading
// ---------------------------------------------------------------------------

/**
 * Parse the header of a NumPy `.npy` file.
 * @param {Uint8Array} bytes
 * @returns {{descr: string, shape: number[], dataOffset: number}}
 * @throws {Error} if the magic is wrong or the dtype cannot be found
 */
function parseNpyHeader(bytes) {
  const magic = String.fromCharCode(bytes[1], bytes[2], bytes[3], bytes[4], bytes[5]);
  if (bytes[0] !== 0x93 || magic !== `NUMPY`) {
    throw new Error(`Not a valid .npy file`);
  }

  const majorVersion = bytes[6];
  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  // Format 1.x stores the header length as uint16, later versions as uint32.
  const headerLength = majorVersion === 1 ? view.getUint16(8, true) : view.getUint32(8, true);
  const headerStart = majorVersion === 1 ? 10 : 12;

  const header = new TextDecoder().decode(bytes.slice(headerStart, headerStart + headerLength));
  const descrMatch = header.match(/'descr'\s*:\s*'([^']+)'/);
  const shapeMatch = header.match(/'shape'\s*:\s*\(([^)]*)\)/);
  if (!descrMatch) {
    throw new Error(`Could not parse dtype from .npy header: ` + header);
  }

  const shape = shapeMatch
    ? shapeMatch[1]
        .split(`,`)
        .map((dim) => parseInt(dim.trim(), 10))
        .filter((dim) => !Number.isNaN(dim))
    : [];

  return { descr: descrMatch[1], shape, dataOffset: headerStart + headerLength };
}

/**
 * Decode a `.npy` file holding float32 or float64 data into a Float32Array.
 * @param {Uint8Array} bytes
 * @returns {{data: Float32Array, shape: number[]}}
 * @throws {Error} for any other dtype
 */
function parseNpy(bytes) {
  const { descr, shape, dataOffset } = parseNpyHeader(bytes);
  // slice() copies into a fresh, offset-0 buffer, so the typed-array views
  // below are correctly aligned regardless of where the payload started.
  const payload = bytes.slice(dataOffset).buffer;

  if (descr === `<f4` || descr === `float32`) {
    return { data: new Float32Array(payload), shape };
  }
  if (descr === `<f8` || descr === `float64`) {
    return { data: Float32Array.from(new Float64Array(payload)), shape };
  }
  throw new Error(`Unsupported npy dtype: ` + descr);
}

const ZIP_END_OF_CENTRAL_DIR = 0x06054b50;
const ZIP_CENTRAL_FILE_HEADER = 0x02014b50;
const ZIP_METHOD_STORED = 0;
const ZIP_METHOD_DEFLATE = 8;

/**
 * Inflate a raw-deflate payload.
 * The stream is consumed through `Response` so that read and write errors
 * surface as one awaited rejection instead of a floating writer promise.
 * @param {Uint8Array} compressed
 * @returns {Promise<Uint8Array>}
 */
async function inflateRaw(compressed) {
  const inflated = new Blob([compressed])
    .stream()
    .pipeThrough(new DecompressionStream(`deflate-raw`));
  return new Uint8Array(await new Response(inflated).arrayBuffer());
}

/**
 * Read every stored or deflated entry of a ZIP archive (as used by `.npz`).
 * Entries with other compression methods are skipped with a warning.
 * @param {ArrayBuffer} buffer
 * @returns {Promise<Map<string, Uint8Array>>} file name -> contents
 * @throws {Error} if no End of Central Directory record is found
 */
async function readZip(buffer) {
  const bytes = new Uint8Array(buffer);
  const view = new DataView(buffer);

  // The EOCD record is at least 22 bytes long and sits at the end of the file;
  // scan backwards for its signature.
  let eocd = -1;
  for (let pos = bytes.length - 22; pos >= 0; pos--) {
    if (view.getUint32(pos, true) === ZIP_END_OF_CENTRAL_DIR) {
      eocd = pos;
      break;
    }
  }
  if (eocd === -1) throw new Error(`Could not find End of Central Directory`);

  const entryCount = view.getUint16(eocd + 10, true);
  let cursor = view.getUint32(eocd + 16, true); // start of the central directory

  const entries = [];
  for (
    let index = 0;
    index < entryCount && view.getUint32(cursor, true) === ZIP_CENTRAL_FILE_HEADER;
    index++
  ) {
    const nameLength = view.getUint16(cursor + 28, true);
    const extraLength = view.getUint16(cursor + 30, true);
    const commentLength = view.getUint16(cursor + 32, true);
    entries.push({
      fileName: new TextDecoder().decode(bytes.slice(cursor + 46, cursor + 46 + nameLength)),
      compressedSize: view.getUint32(cursor + 20, true),
      uncompressedSize: view.getUint32(cursor + 24, true),
      localHeaderOffset: view.getUint32(cursor + 42, true),
      compressionMethod: view.getUint16(cursor + 10, true),
    });
    cursor += 46 + nameLength + extraLength + commentLength;
  }

  const files = new Map();
  for (const entry of entries) {
    // The local header carries its own name/extra lengths, which may differ
    // from the central directory's, so the data offset is derived from it.
    const local = entry.localHeaderOffset;
    const dataStart =
      local + 30 + view.getUint16(local + 26, true) + view.getUint16(local + 28, true);

    let contents;
    if (entry.compressionMethod === ZIP_METHOD_STORED) {
      contents = bytes.slice(dataStart, dataStart + entry.uncompressedSize);
    } else if (entry.compressionMethod === ZIP_METHOD_DEFLATE) {
      contents = await inflateRaw(bytes.slice(dataStart, dataStart + entry.compressedSize));
    } else {
      console.warn(
        `Skipping ${entry.fileName}: unsupported compression ${entry.compressionMethod}`
      );
      continue;
    }
    files.set(entry.fileName, contents);
  }
  return files;
}

/**
 * Download an `.npz` archive and decode every `.npy` member.
 * @param {string} url
 * @returns {Promise<Record<string, {data: Float32Array, shape: number[]}>>}
 *   keyed by member name without the `.npy` suffix
 * @throws {Error} on a non-2xx response
 */
async function loadVoices(url) {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`Failed to fetch voices: ${response.status}`);

  const archive = await readZip(await response.arrayBuffer());
  const result = {};
  for (const [fileName, bytes] of archive) {
    if (!fileName.endsWith(`.npy`)) continue;
    const { data, shape } = parseNpy(bytes);
    // Missing dimensions fall back to a single row spanning all values.
    result[fileName.replace(/\.npy$/, ``)] = {
      data,
      shape: [shape[0] || 1, shape[1] || data.length],
    };
  }
  return result;
}

// ---------------------------------------------------------------------------
// Worker state
// ---------------------------------------------------------------------------

let phonemize; // phonemizer chunk's `phonemize` export, set by loadModel()
let ort; // ONNX Runtime Web module namespace, set by loadModel()

/** Model-name fragments that are forced onto the WASM backend. */
const WASM_ONLY_MARKERS = [`int8`];

let session = null;
let voices = {};
let config = null;
let device = `wasm`;

/** Samples removed from the end of every synthesized chunk. */
const TRAILING_SAMPLES_TO_DROP = 5000;

function repoFileUrl(repoId, file) {
  return `https://huggingface.co/${repoId}/resolve/main/${file}`;
}

function postStatus(message) {
  self.postMessage({ type: `status`, message });
}

/** @returns {Promise<boolean>} true when a WebGPU adapter can be obtained */
async function detectWebGpu() {
  try {
    return `gpu` in navigator ? !!(await navigator.gpu.requestAdapter()) : false;
  } catch {
    // Adapter probing failed; the caller falls back to WASM.
    return false;
  }
}

/**
 * Download the model, reporting progress as status messages.
 * @param {string} url
 * @returns {Promise<ArrayBuffer>}
 * @throws {Error} on a non-2xx response
 */
async function downloadModel(url) {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`Failed to fetch model: ${response.status}`);

  const totalBytes = parseInt(response.headers.get(`content-length`) || `0`, 10);
  const reader = response.body.getReader();
  const parts = [];
  let received = 0;

  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    parts.push(value);
    received += value.length;
    if (totalBytes > 0) {
      const percent = Math.round((received / totalBytes) * 100);
      const megabytes = (received / 1024 / 1024).toFixed(1);
      postStatus(`Downloading model... ${percent}% (${megabytes} MB)`);
    }
  }

  const model = new Uint8Array(received);
  let offset = 0;
  for (const part of parts) {
    model.set(part, offset);
    offset += part.length;
  }
  return model.buffer;
}

/**
 * Load runtime, config, model and voices for `repoId`, then post `ready`.
 * @param {string} repoId Hugging Face repository id
 * @throws {Error} if any download fails or the session cannot be created
 */
async function loadModel(repoId) {
  postStatus(`Detecting hardware...`);
  const hasWebGpu = await detectWebGpu();

  postStatus(`Loading runtime...`);
  const [ortModule, phonemizerModule] = await Promise.all([
    import(`./ort.bundle.min-DL658BJE.js`),
    import(`./phonemizer-BgK0uh4o.js`),
  ]);
  ort = ortModule;
  phonemize = phonemizerModule.phonemize;

  // Drop the previous model first: this frees its memory and ensures a
  // `generate` arriving mid-load fails with "Model not loaded" instead of
  // running the old session against the new config.
  if (session) {
    const previous = session;
    session = null;
    try {
      await previous.release();
    } catch (releaseError) {
      console.warn(`[KittenTTS] Failed to release previous session:`, releaseError);
    }
  }

  postStatus(`Loading config...`);
  const configResponse = await fetch(repoFileUrl(repoId, `config.json`));
  if (!configResponse.ok) {
    throw new Error(`Failed to fetch config: ${configResponse.status}`);
  }
  config = await configResponse.json();

  const modelName = config.model || repoId.split(`/`).pop() || ``;
  const wasmOnly = WASM_ONLY_MARKERS.some((marker) => modelName.includes(marker));
  device = hasWebGpu && !wasmOnly ? `webgpu` : `wasm`;
  if (hasWebGpu && wasmOnly) {
    console.log(
      `[KittenTTS] Using WASM for "${modelName}" (int8 models produce NaN on WebGPU)`
    );
  }
  self.postMessage({ type: `device`, device });

  postStatus(`Downloading model & voices...`);
  const [modelBuffer, loadedVoices] = await Promise.all([
    downloadModel(repoFileUrl(repoId, config.model_file)),
    loadVoices(repoFileUrl(repoId, config.voices)),
  ]);
  voices = loadedVoices;

  postStatus(`Initializing ${device.toUpperCase()} session...`);
  const sessionOptions = {
    executionProviders: device === `webgpu` ? [`webgpu`] : [`wasm`],
  };
  if (device === `wasm`) ort.env.wasm.numThreads = 1;
  session = await ort.InferenceSession.create(modelBuffer, sessionOptions);

  const voiceNames = config.voice_aliases
    ? Object.keys(config.voice_aliases)
    : Object.keys(voices);
  self.postMessage({ type: `ready`, voices: voiceNames, device, modelName: config.name });
}

// ---------------------------------------------------------------------------
// Text chunking and synthesis
// ---------------------------------------------------------------------------

/** Trim `text` and append `,` unless it is empty or already ends in punctuation. */
function ensureTrailingPunctuation(text) {
  const trimmed = text.trim();
  if (!trimmed) return trimmed;
  return `.!?,;:`.includes(trimmed[trimmed.length - 1]) ? trimmed : trimmed + `,`;
}

/**
 * Split text into sentence chunks; a sentence longer than `maxLength` is
 * re-split on whitespace (a single over-long word is kept whole).
 * @param {string} text
 * @param {number} [maxLength=400]
 * @returns {string[]}
 */
function splitIntoChunks(text, maxLength = 400) {
  const sentences = text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [text];
  const chunks = [];

  for (const rawSentence of sentences) {
    const sentence = rawSentence.trim();
    if (!sentence) continue;

    if (sentence.length <= maxLength) {
      chunks.push(ensureTrailingPunctuation(sentence));
      continue;
    }

    let current = ``;
    for (const word of sentence.split(/\s+/)) {
      if (current.length + word.length + 1 <= maxLength) {
        current += (current ? ` ` : ``) + word;
      } else {
        if (current) chunks.push(ensureTrailingPunctuation(current));
        current = word;
      }
    }
    if (current) chunks.push(ensureTrailingPunctuation(current));
  }
  return chunks;
}

/** Split into runs of letters/digits/underscore and single punctuation marks. */
function splitWordsAndPunctuation(text) {
  return text.match(/[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s]/gu) || [];
}

/**
 * Synthesize one text chunk.
 * @param {string} text
 * @param {string} voice voice name or alias
 * @param {number} speed
 * @returns {Promise<Float32Array>} samples with the trailing tail removed
 * @throws {Error} if no model is loaded or the voice is unknown
 */
async function synthesizeChunk(text, voice, speed) {
  if (!session || !config) throw new Error(`Model not loaded`);

  let voiceId = voice;
  if (config.voice_aliases?.[voice]) voiceId = config.voice_aliases[voice];
  const embedding = voices[voiceId];
  if (!embedding) throw new Error(`Voice "${voice}" not found`);

  let effectiveSpeed = speed;
  if (config.speed_priors?.[voiceId]) effectiveSpeed *= config.speed_priors[voiceId];

  // NOTE(review): only the first element returned by phonemize() is used;
  // confirm the phonemizer never splits one chunk into several elements.
  const phonemes = (await phonemize(text, `en-us`))[0] || ``;
  const tokenIds = encodeForModel(splitWordsAndPunctuation(phonemes).join(` `));

  // The style row is chosen by the chunk's character length, clamped to the table.
  const row = Math.min(text.length, embedding.shape[0] - 1);
  const width = embedding.shape[1];
  const style = embedding.data.slice(row * width, (row + 1) * width);

  const feeds = {
    input_ids: new ort.Tensor(`int64`, BigInt64Array.from(tokenIds.map(BigInt)), [
      1,
      tokenIds.length,
    ]),
    style: new ort.Tensor(`float32`, style, [1, width]),
    speed: new ort.Tensor(`float32`, new Float32Array([effectiveSpeed]), [1]),
  };
  const outputs = await session.run(feeds);
  const samples = outputs[session.outputNames[0]].data;
  return samples.slice(0, Math.max(0, samples.length - TRAILING_SAMPLES_TO_DROP));
}

/**
 * Synthesize `text` chunk by chunk and post the concatenated audio.
 * Failures are reported as an `error` message rather than thrown.
 */
async function generate(text, voice, speed) {
  try {
    const chunks = splitIntoChunks(text);
    postStatus(`Generating (${chunks.length} chunk${chunks.length > 1 ? `s` : ``})...`);

    const pieces = [];
    for (let index = 0; index < chunks.length; index++) {
      self.postMessage({ type: `progress`, current: index + 1, total: chunks.length });
      pieces.push(await synthesizeChunk(chunks[index], voice, speed));
    }

    const totalSamples = pieces.reduce((sum, piece) => sum + piece.length, 0);
    const audio = new Float32Array(totalSamples);
    let offset = 0;
    for (const piece of pieces) {
      audio.set(piece, offset);
      offset += piece.length;
    }

    // The buffer is transferred, not copied; `audio` is unusable afterwards.
    self.postMessage(
      { type: `audio`, audio: audio.buffer, sampleRate: 24000 },
      { transfer: [audio.buffer] }
    );
  } catch (err) {
    self.postMessage({ type: `error`, error: err.message || String(err) });
  }
}

// ---------------------------------------------------------------------------
// Message wiring
// ---------------------------------------------------------------------------

self.addEventListener(`message`, async (event) => {
  const { action, ...payload } = event.data;
  switch (action) {
    case `load`:
      try {
        await loadModel(payload.repoId);
      } catch (err) {
        console.error(`[KittenTTS Worker] Load error:`, err);
        self.postMessage({ type: `error`, error: err.message || String(err) });
      }
      break;
    case `generate`:
      await generate(payload.text, payload.voice, payload.speed);
      break;
  }
});

self.addEventListener(`error`, (event) => {
  self.postMessage({ type: `error`, error: event.message || `Unknown worker error` });
});

self.addEventListener(`unhandledrejection`, (event) => {
  self.postMessage({
    type: `error`,
    error: event.reason?.message || String(event.reason),
  });
});
|
index.html
CHANGED
|
@@ -1,19 +1,14 @@
|
|
| 1 |
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
Also don't forget to check the
|
| 15 |
-
<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
|
| 16 |
-
</p>
|
| 17 |
-
</div>
|
| 18 |
-
</body>
|
| 19 |
</html>
|
|
|
|
| 1 |
<!doctype html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8" />
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 6 |
+
<title>KittenTTS — Browser TTS with WebGPU</title>
|
| 7 |
+
<meta name="description" content="Text-to-speech running entirely in your browser via WebGPU. Powered by KittenML models and Transformers.js v4." />
|
| 8 |
+
<script type="module" crossorigin src="/assets/index-CcptzfjJ.js"></script>
|
| 9 |
+
<link rel="stylesheet" crossorigin href="/assets/index-Ck4Qq8gn.css">
|
| 10 |
+
</head>
|
| 11 |
+
<body>
|
| 12 |
+
<div id="root"></div>
|
| 13 |
+
</body>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
</html>
|
package.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "kitten-tts-web",
|
| 3 |
+
"private": true,
|
| 4 |
+
"version": "0.0.0",
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "tsc -b && vite build",
|
| 9 |
+
"lint": "eslint .",
|
| 10 |
+
"preview": "vite preview"
|
| 11 |
+
},
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"@huggingface/transformers": "^4.0.0-next.8",
|
| 14 |
+
"onnxruntime-web": "^1.25.0-dev.20260307-d626b568e0",
|
| 15 |
+
"phonemizer": "^1.2.1",
|
| 16 |
+
"react": "^19.2.4",
|
| 17 |
+
"react-dom": "^19.2.4"
|
| 18 |
+
},
|
| 19 |
+
"devDependencies": {
|
| 20 |
+
"@eslint/js": "^9.39.4",
|
| 21 |
+
"@types/node": "^24.12.0",
|
| 22 |
+
"@types/react": "^19.2.14",
|
| 23 |
+
"@types/react-dom": "^19.2.3",
|
| 24 |
+
"@vitejs/plugin-react": "^6.0.1",
|
| 25 |
+
"@webgpu/types": "^0.1.69",
|
| 26 |
+
"eslint": "^9.39.4",
|
| 27 |
+
"eslint-plugin-react-hooks": "^7.0.1",
|
| 28 |
+
"eslint-plugin-react-refresh": "^0.5.2",
|
| 29 |
+
"globals": "^17.4.0",
|
| 30 |
+
"typescript": "~5.9.3",
|
| 31 |
+
"typescript-eslint": "^8.57.0",
|
| 32 |
+
"vite": "^8.0.1"
|
| 33 |
+
}
|
| 34 |
+
}
|
src/App.tsx
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useRef, useCallback, useEffect } from "react";
|
| 2 |
+
|
| 3 |
+
// Model picker entries: UI label -> repository id. The selected entry's value
// is sent to the worker as `repoId` in the "load" message.
const MODELS: Record<string, string> = {
|
| 4 |
+
"Nano Int8 (15M · Fastest)": "KittenML/kitten-tts-nano-0.8-int8",
|
| 5 |
+
"Nano FP32 (15M)": "KittenML/kitten-tts-nano-0.8-fp32",
|
| 6 |
+
"Micro (40M · Balanced)": "KittenML/kitten-tts-micro-0.8",
|
| 7 |
+
"Mini (80M · Best Quality)": "KittenML/kitten-tts-mini-0.8",
|
| 8 |
+
};
|
| 9 |
+
|
| 10 |
+
// Key of MODELS used as the initial `model` state; must match a key exactly.
const DEFAULT_MODEL = "Nano FP32 (15M)";
|
| 11 |
+
|
| 12 |
+
// One-click presets: choosing one copies `text` and `voice` into the form.
const EXAMPLES = [
|
| 13 |
+
{
|
| 14 |
+
text: "Space is a three-dimensional continuum containing positions and directions.",
|
| 15 |
+
voice: "Jasper",
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
text: "She picked up her coffee and walked toward the window.",
|
| 19 |
+
voice: "Luna",
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
text: "The sun set slowly over the calm, quiet lake.",
|
| 23 |
+
voice: "Bella",
|
| 24 |
+
},
|
| 25 |
+
];
|
| 26 |
+
|
| 27 |
+
// UI lifecycle state; the Generate button is enabled only in "ready".
type Status = "idle" | "loading" | "ready" | "generating" | "error";
|
| 28 |
+
|
| 29 |
+
/**
 * Root component of the KittenTTS demo.
 *
 * Owns a single module worker that loads the selected model and synthesizes
 * speech. The component sends `load` / `generate` messages and renders the
 * worker's `status`, `device`, `ready`, `progress`, `audio` and `error`
 * replies.
 */
export default function App() {
  const [text, setText] = useState("");
  const [model, setModel] = useState(DEFAULT_MODEL);
  const [voice, setVoice] = useState("Jasper");
  const [speed, setSpeed] = useState(1.0);
  const [voices, setVoices] = useState<string[]>([]);
  const [status, setStatus] = useState<Status>("idle");
  const [statusMsg, setStatusMsg] = useState("");
  const [device, setDevice] = useState("");
  const [progress, setProgress] = useState({ current: 0, total: 0 });
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [error, setError] = useState<string | null>(null);
  const [duration, setDuration] = useState<number | null>(null);

  const workerRef = useRef<Worker | null>(null);
  const genStartRef = useRef<number>(0);
  // Mirrors `audioUrl` so the object URL can be revoked outside of render.
  const audioUrlRef = useRef<string | null>(null);
  // True once the worker has reported "ready" for the current model.
  const modelLoadedRef = useRef(false);

  /** Swap the current audio URL, revoking the previous object URL (if any). */
  const replaceAudioUrl = useCallback((next: string | null) => {
    if (audioUrlRef.current) URL.revokeObjectURL(audioUrlRef.current);
    audioUrlRef.current = next;
    setAudioUrl(next);
  }, []);

  /** Create the worker (terminating any existing one) and wire its listeners. */
  const initWorker = useCallback(() => {
    if (workerRef.current) workerRef.current.terminate();

    const worker = new Worker(new URL("./worker.ts", import.meta.url), {
      type: "module",
    });
    workerRef.current = worker;

    worker.addEventListener("error", (e) => {
      console.error("Worker error:", e);
      setError(`Worker failed: ${e.message}`);
      setStatus("error");
      setStatusMsg("");
    });

    worker.addEventListener("message", (e) => {
      const msg = e.data;
      switch (msg.type) {
        case "status":
          setStatusMsg(msg.message);
          break;
        case "device":
          setDevice(msg.device);
          break;
        case "ready":
          modelLoadedRef.current = true;
          setStatus("ready");
          setVoices(msg.voices);
          // Keep the selection valid for the newly loaded model; otherwise the
          // <select> would display one voice while another is sent to the worker.
          setVoice((prev: string) =>
            msg.voices.includes(prev) ? prev : msg.voices[0] ?? prev
          );
          setStatusMsg(`${msg.modelName} loaded`);
          break;
        case "progress":
          setProgress({ current: msg.current, total: msg.total });
          break;
        case "audio": {
          const audioData = new Float32Array(msg.audio);
          const blob = float32ToWav(audioData, msg.sampleRate);
          replaceAudioUrl(URL.createObjectURL(blob));
          setDuration(Math.round(performance.now() - genStartRef.current));
          setStatus("ready");
          setStatusMsg("Done!");
          break;
        }
        case "error":
          setError(msg.error);
          // A failed generation leaves the loaded model usable, so return to
          // "ready"; only a failed load leaves the app in "error".
          setStatus(modelLoadedRef.current ? "ready" : "error");
          setStatusMsg("");
          break;
      }
    });

    return worker;
  }, [replaceAudioUrl]);

  /** Ask the worker to load the model registered under `modelKey`. */
  const loadModel = useCallback(
    (modelKey: string) => {
      const worker = workerRef.current || initWorker();
      modelLoadedRef.current = false;
      setStatus("loading");
      setError(null);
      replaceAudioUrl(null);
      setDuration(null);
      setStatusMsg("Starting...");
      worker.postMessage({ action: "load", repoId: MODELS[modelKey] });
    },
    [initWorker, replaceAudioUrl]
  );

  useEffect(() => {
    loadModel(model);
    return () => {
      workerRef.current?.terminate();
      // Clear the ref so a re-run of this effect (React StrictMode remounts in
      // development) creates a new worker instead of posting to a dead one.
      workerRef.current = null;
      if (audioUrlRef.current) {
        URL.revokeObjectURL(audioUrlRef.current);
        audioUrlRef.current = null;
      }
    };
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, []);

  const handleModelChange = (newModel: string) => {
    setModel(newModel);
    loadModel(newModel);
  };

  const handleGenerate = () => {
    if (!text.trim() || status !== "ready") return;
    setStatus("generating");
    setError(null);
    setDuration(null);
    setProgress({ current: 0, total: 0 });
    genStartRef.current = performance.now();
    workerRef.current?.postMessage({ action: "generate", text, voice, speed });
  };

  const handleExample = (ex: (typeof EXAMPLES)[0]) => {
    setText(ex.text);
    // Only switch voice when the loaded model actually offers it.
    if (voices.includes(ex.voice)) setVoice(ex.voice);
  };

  const handleKeyDown = (e: React.KeyboardEvent<HTMLTextAreaElement>) => {
    // Cmd/Ctrl+Enter submits from inside the textarea.
    if ((e.metaKey || e.ctrlKey) && e.key === "Enter") {
      e.preventDefault();
      handleGenerate();
    }
  };

  return (
    <div className="container">
      <header>
        <h1>
          <span className="logo">🐱</span> KittenTTS
        </h1>
        <p className="subtitle">
          Text-to-speech running entirely in your browser
        </p>
        {device && (
          <span
            className={`badge ${device === "webgpu" ? "badge-gpu" : "badge-wasm"}`}
          >
            {device.toUpperCase()}
          </span>
        )}
      </header>

      <main>
        <div className="input-section">
          <label htmlFor="text-input">Text</label>
          <textarea
            id="text-input"
            value={text}
            onChange={(e) => setText(e.target.value)}
            onKeyDown={handleKeyDown}
            placeholder="Enter text to synthesize…"
            rows={5}
          />

          <div className="controls-row">
            <div className="control">
              <label htmlFor="model-select">Model</label>
              <select
                id="model-select"
                value={model}
                onChange={(e) => handleModelChange(e.target.value)}
                disabled={status === "loading" || status === "generating"}
              >
                {Object.keys(MODELS).map((m) => (
                  <option key={m} value={m}>
                    {m}
                  </option>
                ))}
              </select>
            </div>
            <div className="control">
              <label htmlFor="voice-select">Voice</label>
              <select
                id="voice-select"
                value={voice}
                onChange={(e) => setVoice(e.target.value)}
                disabled={voices.length === 0}
              >
                {voices.map((v) => (
                  <option key={v} value={v}>
                    {v}
                  </option>
                ))}
              </select>
            </div>
          </div>

          <div className="speed-row">
            <label htmlFor="speed-slider">Speed: {speed.toFixed(2)}x</label>
            <input
              id="speed-slider"
              type="range"
              min={0.5}
              max={2.0}
              step={0.05}
              value={speed}
              onChange={(e) => setSpeed(parseFloat(e.target.value))}
            />
          </div>

          <button
            className="generate-btn"
            onClick={handleGenerate}
            disabled={status !== "ready" || !text.trim()}
          >
            {status === "generating"
              ? progress.total > 0
                ? `Generating ${progress.current}/${progress.total}…`
                : "Generating…"
              : status === "loading"
                ? "Loading model…"
                : "Generate Speech"}
          </button>
        </div>

        <div className="output-section">
          <label>Output</label>
          {audioUrl ? (
            <div className="audio-result">
              <audio controls src={audioUrl} className="audio-player" />
              {duration !== null && (
                <span className="duration">
                  Generated in {(duration / 1000).toFixed(1)}s
                </span>
              )}
            </div>
          ) : (
            <div className="audio-placeholder">
              {status === "loading" || status === "generating"
                ? statusMsg
                : "Audio will appear here"}
            </div>
          )}
        </div>

        <div className="examples">
          <label>Examples</label>
          <div className="examples-grid">
            {EXAMPLES.map((ex, i) => (
              <button
                key={i}
                className="example-btn"
                onClick={() => handleExample(ex)}
                disabled={status !== "ready"}
              >
                <span className="example-voice">{ex.voice}</span>
                <span className="example-text">{ex.text}</span>
              </button>
            ))}
          </div>
        </div>

        {error && <div className="error-msg">{error}</div>}
      </main>

      <footer>
        <p>
          Models by{" "}
          <a
            href="https://huggingface.co/KittenML"
            target="_blank"
            rel="noopener"
          >
            KittenML
          </a>
          {" · "}
          Original demo:{" "}
          <a
            href="https://huggingface.co/spaces/KittenML/KittenTTS-Demo"
            target="_blank"
            rel="noopener"
          >
            KittenTTS-Demo
          </a>
        </p>
        <p>
          Powered by{" "}
          <a
            href="https://github.com/huggingface/transformers.js"
            target="_blank"
            rel="noopener"
          >
            Transformers.js v4
          </a>
          {" · "}
          <a
            href="https://github.com/xenova/phonemizer.js"
            target="_blank"
            rel="noopener"
          >
            phonemizer.js
          </a>{" "}
          by{" "}
          <a
            href="https://github.com/xenova"
            target="_blank"
            rel="noopener"
          >
            Xenova
          </a>
          {" · "}
          <a href="https://onnxruntime.ai" target="_blank" rel="noopener">
            ONNX Runtime Web
          </a>
        </p>
      </footer>
    </div>
  );
}
|
| 337 |
+
|
| 338 |
+
/**
 * Encode mono PCM samples as a 32-bit IEEE-float WAV file.
 *
 * If the peak magnitude exceeds 1 the signal is scaled so the peak lands at
 * 0.95; otherwise samples are written unchanged. The input array is never
 * modified. Non-finite samples (NaN / ±Infinity) are ignored when measuring
 * the peak and written as silence, so one bad value cannot corrupt the rest.
 *
 * @param samples    Mono PCM samples.
 * @param sampleRate Sample rate in Hz, written to the header as-is.
 * @returns A Blob of type "audio/wav" (44-byte header + 4 bytes per sample).
 */
function float32ToWav(samples: Float32Array, sampleRate: number): Blob {
  // Peak detection over finite samples only.
  let maxAbs = 0;
  for (let i = 0; i < samples.length; i++) {
    const abs = Math.abs(samples[i]);
    if (Number.isFinite(abs) && abs > maxAbs) maxAbs = abs;
  }
  // Scale only when the signal would clip; 0.95 leaves some headroom.
  const scale = maxAbs > 1 ? 0.95 / maxAbs : 1;

  const headerSize = 44;
  const bytesPerSample = 4;
  const dataSize = samples.length * bytesPerSample;
  const buffer = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(buffer);

  const writeStr = (offset: number, str: string) => {
    for (let i = 0; i < str.length; i++) {
      view.setUint8(offset + i, str.charCodeAt(i));
    }
  };

  // RIFF container header.
  writeStr(0, "RIFF");
  view.setUint32(4, 36 + dataSize, true); // file size minus the first 8 bytes
  writeStr(8, "WAVE");

  // "fmt " chunk.
  writeStr(12, "fmt ");
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 3, true); // format tag 3 = IEEE float
  view.setUint16(22, 1, true); // mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * bytesPerSample, true); // byte rate
  view.setUint16(32, bytesPerSample, true); // block align
  view.setUint16(34, 32, true); // bits per sample

  // "data" chunk.
  writeStr(36, "data");
  view.setUint32(40, dataSize, true);

  // Scaling is applied while writing so the caller's array stays untouched.
  let offset = headerSize;
  for (let i = 0; i < samples.length; i++) {
    const sample = samples[i];
    view.setFloat32(offset, Number.isFinite(sample) ? sample * scale : 0, true);
    offset += bytesPerSample;
  }

  return new Blob([buffer], { type: "audio/wav" });
}
|
src/index.css
ADDED
|
@@ -0,0 +1,340 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*,
|
| 2 |
+
*::before,
|
| 3 |
+
*::after {
|
| 4 |
+
box-sizing: border-box;
|
| 5 |
+
margin: 0;
|
| 6 |
+
padding: 0;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
:root {
|
| 10 |
+
--bg: #111;
|
| 11 |
+
--surface: #1a1a1a;
|
| 12 |
+
--surface-2: #222;
|
| 13 |
+
--border: #333;
|
| 14 |
+
--text: #e5e5e5;
|
| 15 |
+
--text-muted: #888;
|
| 16 |
+
--accent: #c084fc;
|
| 17 |
+
--accent-dim: rgba(192, 132, 252, 0.15);
|
| 18 |
+
--radius: 8px;
|
| 19 |
+
--font: system-ui, -apple-system, "Segoe UI", sans-serif;
|
| 20 |
+
--mono: ui-monospace, "SF Mono", Consolas, monospace;
|
| 21 |
+
}
|
| 22 |
+
|
| 23 |
+
html {
|
| 24 |
+
color-scheme: dark;
|
| 25 |
+
}
|
| 26 |
+
|
| 27 |
+
body {
|
| 28 |
+
font-family: var(--font);
|
| 29 |
+
background: var(--bg);
|
| 30 |
+
color: var(--text);
|
| 31 |
+
line-height: 1.5;
|
| 32 |
+
-webkit-font-smoothing: antialiased;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
.container {
|
| 36 |
+
max-width: 720px;
|
| 37 |
+
margin: 0 auto;
|
| 38 |
+
padding: 2rem 1.5rem;
|
| 39 |
+
min-height: 100vh;
|
| 40 |
+
display: flex;
|
| 41 |
+
flex-direction: column;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
/* Header */
|
| 45 |
+
|
| 46 |
+
header {
|
| 47 |
+
text-align: center;
|
| 48 |
+
margin-bottom: 2rem;
|
| 49 |
+
position: relative;
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
header h1 {
|
| 53 |
+
font-size: 1.75rem;
|
| 54 |
+
font-weight: 600;
|
| 55 |
+
letter-spacing: -0.02em;
|
| 56 |
+
margin-bottom: 0.25rem;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.logo {
|
| 60 |
+
font-size: 1.5rem;
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
.subtitle {
|
| 64 |
+
color: var(--text-muted);
|
| 65 |
+
font-size: 0.9rem;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
.badge {
|
| 69 |
+
display: inline-block;
|
| 70 |
+
margin-top: 0.5rem;
|
| 71 |
+
padding: 0.15rem 0.6rem;
|
| 72 |
+
border-radius: 999px;
|
| 73 |
+
font-size: 0.7rem;
|
| 74 |
+
font-weight: 600;
|
| 75 |
+
font-family: var(--mono);
|
| 76 |
+
letter-spacing: 0.05em;
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
.badge-gpu {
|
| 80 |
+
background: rgba(74, 222, 128, 0.15);
|
| 81 |
+
color: #4ade80;
|
| 82 |
+
border: 1px solid rgba(74, 222, 128, 0.3);
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.badge-wasm {
|
| 86 |
+
background: rgba(251, 191, 36, 0.15);
|
| 87 |
+
color: #fbbf24;
|
| 88 |
+
border: 1px solid rgba(251, 191, 36, 0.3);
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
/* Main */
|
| 92 |
+
|
| 93 |
+
main {
|
| 94 |
+
flex: 1;
|
| 95 |
+
display: flex;
|
| 96 |
+
flex-direction: column;
|
| 97 |
+
gap: 1.5rem;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
label {
|
| 101 |
+
display: block;
|
| 102 |
+
font-size: 0.8rem;
|
| 103 |
+
font-weight: 500;
|
| 104 |
+
color: var(--text-muted);
|
| 105 |
+
margin-bottom: 0.4rem;
|
| 106 |
+
text-transform: uppercase;
|
| 107 |
+
letter-spacing: 0.05em;
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
textarea {
|
| 111 |
+
width: 100%;
|
| 112 |
+
background: var(--surface);
|
| 113 |
+
border: 1px solid var(--border);
|
| 114 |
+
border-radius: var(--radius);
|
| 115 |
+
color: var(--text);
|
| 116 |
+
font-family: var(--font);
|
| 117 |
+
font-size: 0.95rem;
|
| 118 |
+
padding: 0.75rem;
|
| 119 |
+
resize: vertical;
|
| 120 |
+
outline: none;
|
| 121 |
+
transition: border-color 0.15s;
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
textarea:focus {
|
| 125 |
+
border-color: var(--accent);
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
textarea::placeholder {
|
| 129 |
+
color: #555;
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
/* Controls */
|
| 133 |
+
|
| 134 |
+
.controls-row {
|
| 135 |
+
display: grid;
|
| 136 |
+
grid-template-columns: 1fr 1fr;
|
| 137 |
+
gap: 0.75rem;
|
| 138 |
+
margin-top: 0.75rem;
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
select {
|
| 142 |
+
width: 100%;
|
| 143 |
+
background: var(--surface);
|
| 144 |
+
border: 1px solid var(--border);
|
| 145 |
+
border-radius: var(--radius);
|
| 146 |
+
color: var(--text);
|
| 147 |
+
font-family: var(--font);
|
| 148 |
+
font-size: 0.9rem;
|
| 149 |
+
padding: 0.5rem 0.75rem;
|
| 150 |
+
outline: none;
|
| 151 |
+
cursor: pointer;
|
| 152 |
+
appearance: none;
|
| 153 |
+
background-image: url("data:image/svg+xml,%3Csvg width='10' height='6' viewBox='0 0 10 6' fill='none' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1 1l4 4 4-4' stroke='%23888' stroke-width='1.5' stroke-linecap='round' stroke-linejoin='round'/%3E%3C/svg%3E");
|
| 154 |
+
background-repeat: no-repeat;
|
| 155 |
+
background-position: right 0.75rem center;
|
| 156 |
+
padding-right: 2rem;
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
select:focus {
|
| 160 |
+
border-color: var(--accent);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
select:disabled {
|
| 164 |
+
opacity: 0.5;
|
| 165 |
+
cursor: not-allowed;
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
.speed-row {
|
| 169 |
+
margin-top: 0.75rem;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
.speed-row label {
|
| 173 |
+
font-family: var(--mono);
|
| 174 |
+
font-size: 0.75rem;
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
input[type="range"] {
|
| 178 |
+
width: 100%;
|
| 179 |
+
accent-color: var(--accent);
|
| 180 |
+
height: 4px;
|
| 181 |
+
cursor: pointer;
|
| 182 |
+
}
|
| 183 |
+
|
| 184 |
+
/* Generate button */
|
| 185 |
+
|
| 186 |
+
.generate-btn {
|
| 187 |
+
margin-top: 1rem;
|
| 188 |
+
width: 100%;
|
| 189 |
+
padding: 0.7rem 1.5rem;
|
| 190 |
+
background: var(--accent);
|
| 191 |
+
color: #111;
|
| 192 |
+
border: none;
|
| 193 |
+
border-radius: var(--radius);
|
| 194 |
+
font-family: var(--font);
|
| 195 |
+
font-size: 0.9rem;
|
| 196 |
+
font-weight: 600;
|
| 197 |
+
cursor: pointer;
|
| 198 |
+
transition: opacity 0.15s;
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
.generate-btn:hover:not(:disabled) {
|
| 202 |
+
opacity: 0.9;
|
| 203 |
+
}
|
| 204 |
+
|
| 205 |
+
.generate-btn:disabled {
|
| 206 |
+
opacity: 0.4;
|
| 207 |
+
cursor: not-allowed;
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
/* Output */
|
| 211 |
+
|
| 212 |
+
.output-section {
|
| 213 |
+
background: var(--surface);
|
| 214 |
+
border: 1px solid var(--border);
|
| 215 |
+
border-radius: var(--radius);
|
| 216 |
+
padding: 1rem;
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
.audio-result {
|
| 220 |
+
display: flex;
|
| 221 |
+
flex-direction: column;
|
| 222 |
+
gap: 0.5rem;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
.audio-player {
|
| 226 |
+
width: 100%;
|
| 227 |
+
border-radius: var(--radius);
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
.duration {
|
| 231 |
+
font-size: 0.75rem;
|
| 232 |
+
color: var(--text-muted);
|
| 233 |
+
font-family: var(--mono);
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
.audio-placeholder {
|
| 237 |
+
padding: 2rem 1rem;
|
| 238 |
+
text-align: center;
|
| 239 |
+
color: #555;
|
| 240 |
+
font-size: 0.85rem;
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
/* Examples */
|
| 244 |
+
|
| 245 |
+
.examples-grid {
|
| 246 |
+
display: flex;
|
| 247 |
+
flex-direction: column;
|
| 248 |
+
gap: 0.5rem;
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
.example-btn {
|
| 252 |
+
display: flex;
|
| 253 |
+
align-items: baseline;
|
| 254 |
+
gap: 0.75rem;
|
| 255 |
+
width: 100%;
|
| 256 |
+
background: var(--surface);
|
| 257 |
+
border: 1px solid var(--border);
|
| 258 |
+
border-radius: var(--radius);
|
| 259 |
+
padding: 0.6rem 0.75rem;
|
| 260 |
+
color: var(--text);
|
| 261 |
+
font-family: var(--font);
|
| 262 |
+
font-size: 0.85rem;
|
| 263 |
+
cursor: pointer;
|
| 264 |
+
text-align: left;
|
| 265 |
+
transition: border-color 0.15s;
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
.example-btn:hover:not(:disabled) {
|
| 269 |
+
border-color: var(--accent);
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
.example-btn:disabled {
|
| 273 |
+
opacity: 0.4;
|
| 274 |
+
cursor: not-allowed;
|
| 275 |
+
}
|
| 276 |
+
|
| 277 |
+
.example-voice {
|
| 278 |
+
flex-shrink: 0;
|
| 279 |
+
font-family: var(--mono);
|
| 280 |
+
font-size: 0.75rem;
|
| 281 |
+
font-weight: 600;
|
| 282 |
+
color: var(--accent);
|
| 283 |
+
padding: 0.1rem 0.4rem;
|
| 284 |
+
background: var(--accent-dim);
|
| 285 |
+
border-radius: 4px;
|
| 286 |
+
}
|
| 287 |
+
|
| 288 |
+
.example-text {
|
| 289 |
+
color: var(--text-muted);
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
/* Error */
|
| 293 |
+
|
| 294 |
+
.error-msg {
|
| 295 |
+
background: rgba(239, 68, 68, 0.1);
|
| 296 |
+
border: 1px solid rgba(239, 68, 68, 0.3);
|
| 297 |
+
color: #f87171;
|
| 298 |
+
border-radius: var(--radius);
|
| 299 |
+
padding: 0.75rem 1rem;
|
| 300 |
+
font-size: 0.85rem;
|
| 301 |
+
}
|
| 302 |
+
|
| 303 |
+
/* Footer */
|
| 304 |
+
|
| 305 |
+
footer {
|
| 306 |
+
margin-top: 2.5rem;
|
| 307 |
+
padding-top: 1.5rem;
|
| 308 |
+
border-top: 1px solid var(--border);
|
| 309 |
+
text-align: center;
|
| 310 |
+
font-size: 0.75rem;
|
| 311 |
+
color: var(--text-muted);
|
| 312 |
+
display: flex;
|
| 313 |
+
flex-direction: column;
|
| 314 |
+
gap: 0.25rem;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
footer a {
|
| 318 |
+
color: var(--text-muted);
|
| 319 |
+
text-decoration: underline;
|
| 320 |
+
text-underline-offset: 2px;
|
| 321 |
+
transition: color 0.15s;
|
| 322 |
+
}
|
| 323 |
+
|
| 324 |
+
footer a:hover {
|
| 325 |
+
color: var(--text);
|
| 326 |
+
}
|
| 327 |
+
|
| 328 |
+
/* Responsive */
|
| 329 |
+
|
| 330 |
+
@media (max-width: 480px) {
|
| 331 |
+
.container {
|
| 332 |
+
padding: 1.25rem 1rem;
|
| 333 |
+
}
|
| 334 |
+
.controls-row {
|
| 335 |
+
grid-template-columns: 1fr;
|
| 336 |
+
}
|
| 337 |
+
header h1 {
|
| 338 |
+
font-size: 1.5rem;
|
| 339 |
+
}
|
| 340 |
+
}
|
src/lib/npz-reader.ts
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Voice data loader — loads KittenTTS voice embeddings.
 *
 * The embeddings are distributed as a NumPy .npz archive (a zip of .npy
 * arrays). We download the archive and decode it here with a small zip + npy
 * parser that takes care of byte alignment when building the typed arrays.
 */
|
| 7 |
+
|
| 8 |
+
/** Embedding data decoded for a single voice. */
export interface VoiceInfo {
  /** Flat array holding every embedding value. */
  data: Float32Array;
  /** Dimensions of `data`, as `[numStyles, styleDim]`. */
  shape: [number, number];
}
|
| 12 |
+
|
| 13 |
+
/**
 * Read the header of a NumPy `.npy` file.
 *
 * @param bytes Complete contents of the `.npy` file (may be a view into a
 *              larger buffer; `byteOffset` is honoured).
 * @returns `descr` (the dtype string from the header dictionary), `shape`
 *          (the integers found in the `shape` tuple, empty if none) and
 *          `dataOffset` (index in `bytes` where the array data begins).
 * @throws Error if the magic string is missing, the buffer ends before the
 *         header does, or the header has no `descr` entry.
 */
function parseNpyHeader(bytes: Uint8Array) {
  // Magic: \x93NUMPY
  const MAGIC = [0x93, 0x4e, 0x55, 0x4d, 0x50, 0x59];
  // 10 bytes = magic (6) + version (2) + the shortest header-length field (2).
  if (bytes.length < 10 || MAGIC.some((byte, i) => bytes[i] !== byte)) {
    throw new Error("Not a valid .npy file");
  }

  const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
  const majorVersion = bytes[6];

  // Version 1 stores the header length in 2 bytes; later versions use 4.
  let headerLen: number;
  let headerOffset: number;
  if (majorVersion === 1) {
    headerLen = view.getUint16(8, true);
    headerOffset = 10;
  } else {
    if (bytes.length < 12) throw new Error("Not a valid .npy file");
    headerLen = view.getUint32(8, true);
    headerOffset = 12;
  }

  const dataOffset = headerOffset + headerLen;
  if (dataOffset > bytes.length) {
    // Fail loudly instead of regex-matching a clipped header.
    throw new Error("Truncated .npy header");
  }

  // subarray() is a view: the header text is decoded without copying it first.
  const headerStr = new TextDecoder().decode(bytes.subarray(headerOffset, dataOffset));

  const descrMatch = headerStr.match(/'descr'\s*:\s*'([^']+)'/);
  const shapeMatch = headerStr.match(/'shape'\s*:\s*\(([^)]*)\)/);

  if (!descrMatch) throw new Error("Could not parse dtype from .npy header: " + headerStr);

  const descr = descrMatch[1];
  // "(256,)" splits into ["256", ""]; the NaN filter drops the empty tail.
  const shape = shapeMatch
    ? shapeMatch[1]
        .split(",")
        .map((part) => parseInt(part.trim(), 10))
        .filter((dim) => !isNaN(dim))
    : [];

  return { descr, shape, dataOffset };
}
|
| 50 |
+
|
| 51 |
+
/**
 * Decode the contents of a `.npy` file into a Float32Array.
 *
 * Little-endian float32 (`<f4` / `float32`) data is returned as-is; float64
 * (`<f8` / `float64`) data is narrowed to float32 element by element.
 *
 * @param bytes Complete contents of the `.npy` file.
 * @returns The decoded values together with the shape read from the header.
 * @throws Error for any other dtype, or when the payload is not a whole
 *         number of elements.
 */
function npyToFloat32(bytes: Uint8Array): { data: Float32Array; shape: number[] } {
  const { descr, shape, dataOffset } = parseNpyHeader(bytes);

  const isFloat32 = descr === "<f4" || descr === "float32";
  const isFloat64 = descr === "<f8" || descr === "float64";
  if (!isFloat32 && !isFloat64) {
    throw new Error("Unsupported npy dtype: " + descr);
  }

  // slice() copies into a fresh ArrayBuffer that starts at offset 0, so the
  // typed-array views below are aligned without a second copy.
  const payload = bytes.slice(dataOffset);
  const itemSize = isFloat32 ? 4 : 8;
  if (payload.byteLength % itemSize !== 0) {
    throw new Error(
      `Corrupt .npy data: ${payload.byteLength} bytes is not a multiple of ${itemSize} (${descr})`
    );
  }

  const data = isFloat32
    ? new Float32Array(payload.buffer)
    : new Float32Array(new Float64Array(payload.buffer)); // narrows each value

  return { data, shape };
}
|
| 72 |
+
|
| 73 |
+
/**
 * Extract the files contained in a zip archive.
 *
 * Sizes, offsets and compression methods are read from the central directory
 * rather than from the local file headers, so entries written with a data
 * descriptor (general-purpose flag bit 3) are still sized correctly.
 * Stored (method 0) and deflated (method 8) entries are returned; entries
 * using any other method are skipped with a console warning.
 *
 * @param buffer Raw bytes of the archive.
 * @returns Map from entry file name to its uncompressed bytes.
 * @throws Error if the End of Central Directory record cannot be found or an
 *         entry does not point at a local file header. A corrupt deflate
 *         stream rejects the returned promise.
 */
async function extractZipEntries(
  buffer: ArrayBuffer
): Promise<Map<string, Uint8Array>> {
  const EOCD_SIGNATURE = 0x06054b50;
  const CENTRAL_SIGNATURE = 0x02014b50;
  const LOCAL_SIGNATURE = 0x04034b50;
  const EOCD_MIN_SIZE = 22;

  const bytes = new Uint8Array(buffer);
  const view = new DataView(buffer);
  const decoder = new TextDecoder();
  const entries = new Map<string, Uint8Array>();

  // The EOCD record sits at the very end, followed only by an optional
  // comment, so scan backwards for its signature.
  let eocdOffset = -1;
  for (let i = bytes.length - EOCD_MIN_SIZE; i >= 0; i--) {
    if (view.getUint32(i, true) === EOCD_SIGNATURE) {
      eocdOffset = i;
      break;
    }
  }
  if (eocdOffset === -1) {
    throw new Error("Could not find End of Central Directory");
  }

  const entryCount = view.getUint16(eocdOffset + 10, true);
  let cdPos = view.getUint32(eocdOffset + 16, true);

  for (let i = 0; i < entryCount; i++) {
    if (view.getUint32(cdPos, true) !== CENTRAL_SIGNATURE) break;

    // Central directory record: the authoritative description of the entry.
    const compressionMethod = view.getUint16(cdPos + 10, true);
    const compressedSize = view.getUint32(cdPos + 20, true);
    const uncompressedSize = view.getUint32(cdPos + 24, true);
    const fileNameLen = view.getUint16(cdPos + 28, true);
    const extraLen = view.getUint16(cdPos + 30, true);
    const commentLen = view.getUint16(cdPos + 32, true);
    const localHeaderOffset = view.getUint32(cdPos + 42, true);
    const fileName = decoder.decode(bytes.subarray(cdPos + 46, cdPos + 46 + fileNameLen));
    cdPos += 46 + fileNameLen + extraLen + commentLen;

    // The local header is consulted only to find where the data starts: its
    // name/extra lengths can differ from the central directory's.
    if (view.getUint32(localHeaderOffset, true) !== LOCAL_SIGNATURE) {
      throw new Error(`Invalid local file header for ${fileName}`);
    }
    const localNameLen = view.getUint16(localHeaderOffset + 26, true);
    const localExtraLen = view.getUint16(localHeaderOffset + 28, true);
    const dataStart = localHeaderOffset + 30 + localNameLen + localExtraLen;

    if (compressionMethod === 0) {
      // Stored: copy so the result does not alias the archive buffer.
      entries.set(fileName, bytes.slice(dataStart, dataStart + uncompressedSize));
    } else if (compressionMethod === 8) {
      // Deflate: feed the stream and drain it concurrently. Both promises are
      // awaited together so a decode failure rejects this call instead of
      // leaving write()/close() rejections unhandled.
      const compressed = bytes.subarray(dataStart, dataStart + compressedSize);
      const inflater = new DecompressionStream("deflate-raw");
      const writer = inflater.writable.getWriter();
      const feed = writer.write(compressed).then(() => writer.close());
      const [inflated] = await Promise.all([
        new Response(inflater.readable).arrayBuffer(),
        feed,
      ]);
      entries.set(fileName, new Uint8Array(inflated));
    } else {
      console.warn(`Skipping ${fileName}: unsupported compression ${compressionMethod}`);
    }
  }

  return entries;
}
|
| 187 |
+
|
| 188 |
+
/**
 * Download a `.npz` archive and decode every `.npy` entry in it as a voice.
 *
 * Each voice is keyed by its entry name without the `.npy` extension. The
 * reported shape is always two-dimensional: the array's last axis becomes
 * `styleDim` and all leading axes are folded into `numStyles`, so a 1-D array
 * of length D is reported as `[1, D]` and an `(N, 1, D)` array as `[N, D]`.
 *
 * @param url Location of the `.npz` file.
 * @returns Voice name → embedding data and its `[numStyles, styleDim]` shape.
 * @throws Error if the HTTP response is not OK, or if the archive or one of
 *         its arrays cannot be decoded.
 */
export async function loadVoices(
  url: string
): Promise<Record<string, VoiceInfo>> {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`Failed to fetch voices: ${response.status}`);

  const entries = await extractZipEntries(await response.arrayBuffer());
  const voices: Record<string, VoiceInfo> = {};

  for (const [fileName, fileData] of entries) {
    if (!fileName.endsWith(".npy")) continue;

    const { data, shape } = npyToFloat32(fileData);

    // Derive the row count from the data length so that shape always
    // describes `data`, whatever the rank of the stored array.
    const styleDim = shape.length > 0 ? shape[shape.length - 1] : data.length;
    const numStyles = styleDim > 0 ? Math.max(1, Math.floor(data.length / styleDim)) : 1;

    voices[fileName.replace(/\.npy$/, "")] = {
      data,
      shape: [numStyles, styleDim],
    };
  }

  return voices;
}
|
src/lib/preprocess.ts
ADDED
|
@@ -0,0 +1,310 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Text preprocessor — converts numbers, currencies, ordinals, etc. to words.
|
| 3 |
+
* Port of KittenTTS preprocess.py.
|
| 4 |
+
* https://github.com/KittenML/KittenTTS
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
// ── Number → Words ──
|
| 8 |
+
|
| 9 |
+
/** Words for 0–19, indexed by value; index 0 is empty so zero contributes nothing. */
const ONES = [
  "",
  "one",
  "two",
  "three",
  "four",
  "five",
  "six",
  "seven",
  "eight",
  "nine",
  "ten",
  "eleven",
  "twelve",
  "thirteen",
  "fourteen",
  "fifteen",
  "sixteen",
  "seventeen",
  "eighteen",
  "nineteen",
];

/** Words for multiples of ten, indexed by the tens digit (0 and 1 are unused). */
const TENS = [
  "",
  "",
  "twenty",
  "thirty",
  "forty",
  "fifty",
  "sixty",
  "seventy",
  "eighty",
  "ninety",
];

/** Names of successive three-digit groups, starting with the unnamed units group. */
const SCALE = ["", "thousand", "million", "billion", "trillion"];

/** Cardinal words whose ordinal form is irregular. */
const ORDINAL_EXCEPTIONS: Record<string, string> = {
  one: "first",
  two: "second",
  three: "third",
  four: "fourth",
  five: "fifth",
  six: "sixth",
  seven: "seventh",
  eight: "eighth",
  nine: "ninth",
  twelve: "twelfth",
};

/** Currency symbol → singular spoken unit. */
const CURRENCY_SYMBOLS: Record<string, string> = {
  "$": "dollar",
  "€": "euro",
  "£": "pound",
  "¥": "yen",
  "₹": "rupee",
  "₩": "won",
  "₿": "bitcoin",
};
|
| 27 |
+
|
| 28 |
+
/**
 * Spell out one three-digit group, e.g. 342 → "three hundred forty-two".
 * Returns an empty string for 0.
 */
function threeDigitsToWords(n: number): string {
  if (n === 0) return "";

  const hundredsDigit = Math.floor(n / 100);
  const belowHundred = n % 100;
  const words: string[] = [];

  if (hundredsDigit) {
    words.push(`${ONES[hundredsDigit]} hundred`);
  }

  if (belowHundred >= 20) {
    // Compound form: tens word, plus a hyphenated units word when present.
    const tens = TENS[Math.floor(belowHundred / 10)];
    const unit = ONES[belowHundred % 10];
    words.push(unit ? `${tens}-${unit}` : tens);
  } else if (belowHundred) {
    // 1–19 have dedicated words.
    words.push(ONES[belowHundred]);
  }

  return words.join(" ");
}
|
| 43 |
+
|
| 44 |
+
/**
 * Spell out an integer in English words.
 *
 * Non-integers are truncated toward zero first, so -1.5 reads as
 * "negative one" and -0.5 as "zero". Round hundreds between 100 and 1900 that
 * are not round thousands use the "N hundred" reading (1500 → "fifteen
 * hundred"). Only as many three-digit groups as `SCALE` names are spelled out;
 * digits above the last named group are not represented.
 *
 * @param n Number to spell out.
 * @returns The English words, "zero" for 0, prefixed with "negative" below zero.
 */
export function numberToWords(n: number): string {
  // trunc, not floor: floor would push negative fractions away from zero.
  const value = Number.isInteger(n) ? n : Math.trunc(n);

  if (value === 0) return "zero";
  if (value < 0) return `negative ${numberToWords(-value)}`;

  if (value >= 100 && value <= 9999 && value % 100 === 0 && value % 1000 !== 0) {
    const hundreds = Math.floor(value / 100);
    if (hundreds < 20) return `${ONES[hundreds]} hundred`;
  }

  // Peel off three digits at a time, least significant group first.
  const parts: string[] = [];
  let remaining = value;
  for (let i = 0; i < SCALE.length; i++) {
    const chunk = remaining % 1000;
    if (chunk) {
      const w = threeDigitsToWords(chunk);
      parts.push(SCALE[i] ? `${w} ${SCALE[i]}` : w);
    }
    remaining = Math.floor(remaining / 1000);
    if (remaining === 0) break;
  }
  return parts.reverse().join(" ");
}
|
| 65 |
+
|
| 66 |
+
/**
 * Spell out a decimal number, reading the fractional digits one at a time:
 * "3.14" → "three point one four".
 *
 * Passing a string keeps the digits exactly as written (including trailing
 * zeros); a number is first converted with its default string form.
 * When nothing readable follows the decimal point ("5."), only the integer
 * part is returned instead of leaving a dangling separator. Characters in the
 * fraction that are not decimal digits are ignored.
 *
 * @param value Number, or its textual form, optionally starting with "-".
 * @param sep   Word spoken for the decimal point.
 * @returns The English words, prefixed with "negative" when `value` starts with "-".
 */
export function floatToWords(value: string | number, sep = "point"): string {
  const text = typeof value === "string" ? value : `${value}`;
  const negative = text.startsWith("-");
  const unsigned = negative ? text.slice(1) : text;

  let result: string;
  if (unsigned.includes(".")) {
    const [intPart, decPart] = unsigned.split(".");
    const intWords = intPart ? numberToWords(parseInt(intPart, 10)) : "zero";
    const digitWords = ["zero", ...ONES.slice(1, 10)];
    const decWords = [...decPart]
      .filter((ch) => ch >= "0" && ch <= "9")
      .map((ch) => digitWords[Number(ch)])
      .join(" ");
    result = decWords ? `${intWords} ${sep} ${decWords}` : intWords;
  } else {
    result = numberToWords(parseInt(unsigned, 10));
  }

  return negative ? `negative ${result}` : result;
}
|
| 82 |
+
|
| 83 |
+
/**
 * Spell out an integer as an ordinal: 3 → "third", 21 → "twenty-first",
 * 20 → "twentieth".
 *
 * Only the final word of the cardinal form is converted; everything before the
 * last hyphen (or, failing that, the last space) is kept unchanged.
 *
 * @param n Integer to convert.
 * @returns The ordinal in words.
 */
function ordinalSuffix(n: number): string {
  const word = numberToWords(n);

  // Split off the last word, preferring a hyphen ("twenty-one") over a space.
  const hyphenAt = word.lastIndexOf("-");
  const splitAt = hyphenAt >= 0 ? hyphenAt : word.lastIndexOf(" ");
  const prefix = splitAt >= 0 ? word.slice(0, splitAt) : "";
  const joiner = splitAt >= 0 ? word[splitAt] : "";
  const last = word.slice(splitAt + 1);

  let lastOrd: string;
  if (ORDINAL_EXCEPTIONS[last]) {
    lastOrd = ORDINAL_EXCEPTIONS[last];
  } else if (last.endsWith("y")) {
    // twenty → twentieth, thirty → thirtieth, …
    lastOrd = `${last.slice(0, -1)}ieth`;
  } else if (last.endsWith("t")) {
    lastOrd = `${last}h`;
  } else if (last.endsWith("e")) {
    lastOrd = `${last.slice(0, -1)}th`;
  } else {
    lastOrd = `${last}th`;
  }

  return prefix ? `${prefix}${joiner}${lastOrd}` : lastOrd;
}
|
| 115 |
+
|
| 116 |
+
// ── Regex patterns ──
|
| 117 |
+
|
| 118 |
+
// Every pattern is global so String.replace rewrites all occurrences.

/** Optionally negative number with comma grouping and an optional fraction, not preceded by a letter. */
const RE_NUMBER = /(?<![a-zA-Z])-?[\d,]+(?:\.\d+)?/g;

/** Digits followed by st/nd/rd/th: "21st". */
const RE_ORDINAL = /\b(\d+)(st|nd|rd|th)\b/gi;

/** Number followed by a percent sign: "12.5%". */
const RE_PERCENT = /(-?[\d,]+(?:\.\d+)?)\s*%/g;

/** Currency symbol, amount and optional K/M/B/T suffix: "$1.5M". */
const RE_CURRENCY = /([$€£¥₹₩₿])\s*([\d,]+(?:\.\d+)?)\s*([KMBT])?(?![a-zA-Z\d])/g;

/** Clock time with optional seconds and am/pm: "3:05 pm". */
const RE_TIME = /\b(\d{1,2}):(\d{2})(?::(\d{2}))?\s*(am|pm)?\b/gi;

/** Two integers joined by a hyphen: "10-20". */
const RE_RANGE = /(?<!\w)(\d+)-(\d+)(?!\w)/g;

/** Alphanumeric name, hyphen, dotted version: "GPT-4". */
const RE_MODEL_VER = /\b([a-zA-Z][a-zA-Z0-9]*)-(\d[\d.]*)(?=[^\d.]|$)/g;

/** Number followed by one of the unit abbreviations keyed in UNIT_MAP. */
const RE_UNIT = /(\d+(?:\.\d+)?)\s*(km|kg|mg|ml|gb|mb|kb|tb|hz|khz|mhz|ghz|mph|kph|°[cCfF]|[cCfF]°|ms|ns|µs)\b/gi;

/** Number followed by a K/M/B/T magnitude suffix: "7B". */
const RE_SCALE = /(?<![a-zA-Z])(\d+(?:\.\d+)?)\s*([KMBT])(?![a-zA-Z\d])/g;

/** Scientific notation: "1.5e-3". */
const RE_SCI = /(?<![a-zA-Z\d])(-?\d+(?:\.\d+)?)[eE]([+-]?\d+)(?![a-zA-Z\d])/g;

/** Two integers separated by a slash: "3/4". */
const RE_FRACTION = /\b(\d+)\s*\/\s*(\d+)\b/g;

/** Digits ending in "0s": "1980s", "90s". */
const RE_DECADE = /\b(\d{1,3})0s\b/g;

/** Lower-cased unit abbreviation → spoken form. */
const UNIT_MAP: Record<string, string> = {
  km: "kilometers",
  kg: "kilograms",
  mg: "milligrams",
  ml: "milliliters",
  gb: "gigabytes",
  mb: "megabytes",
  kb: "kilobytes",
  tb: "terabytes",
  hz: "hertz",
  khz: "kilohertz",
  mhz: "megahertz",
  ghz: "gigahertz",
  mph: "miles per hour",
  kph: "kilometers per hour",
  ms: "milliseconds",
  ns: "nanoseconds",
  "µs": "microseconds",
  "°c": "degrees Celsius",
  "c°": "degrees Celsius",
  "°f": "degrees Fahrenheit",
  "f°": "degrees Fahrenheit",
};

/** Magnitude suffix letter → spoken form. */
const SCALE_MAP: Record<string, string> = {
  K: "thousand",
  M: "million",
  B: "billion",
  T: "trillion",
};

/** Digit → spoken plural for a decade. */
const DECADE_MAP: Record<number, string> = {
  0: "hundreds",
  1: "tens",
  2: "twenties",
  3: "thirties",
  4: "forties",
  5: "fifties",
  6: "sixties",
  7: "seventies",
  8: "eighties",
  9: "nineties",
};
|
| 149 |
+
|
| 150 |
+
// ── Expansion functions ──
|
| 151 |
+
|
| 152 |
+
/** Replace numeric ordinals ("21st") with their spoken form. */
function expandOrdinals(text: string): string {
  const toWords = (_match: string, digits: string): string =>
    ordinalSuffix(parseInt(digits, 10));
  return text.replace(RE_ORDINAL, toWords);
}
|
| 155 |
+
|
| 156 |
+
/**
 * Replace percentages ("12.5%") with words ("twelve point five percent").
 *
 * Decimal values are passed on as text, not through parseFloat, so the digits
 * are read exactly as written: "1.0%" → "one point zero percent".
 */
function expandPercentages(text: string): string {
  return text.replace(RE_PERCENT, (_match: string, raw: string) => {
    const clean = raw.replace(/,/g, ""); // drop thousands separators
    const words = clean.includes(".")
      ? floatToWords(clean)
      : numberToWords(parseInt(clean, 10));
    return `${words} percent`;
  });
}
|
| 163 |
+
|
| 164 |
+
/**
 * Replace currency amounts with words.
 *
 * - With a K/M/B/T suffix: "$1.5M" → "one point five million dollars".
 * - With a fraction: "$2.50" → "two dollars and fifty cents". The first two
 *   fractional digits are read as cents and omitted when they are zero.
 * - Otherwise: "$5" → "five dollars".
 *
 * The unit is singular when the whole amount is exactly one, in both the
 * fractional and the plain case ("$1.50" → "one dollar and fifty cents").
 */
function expandCurrency(text: string): string {
  return text.replace(RE_CURRENCY, (_, symbol, raw, scaleSuffix) => {
    const clean = raw.replace(/,/g, ""); // drop thousands separators
    const unit = CURRENCY_SYMBOLS[symbol] || "";

    if (scaleSuffix) {
      const scaleWord = SCALE_MAP[scaleSuffix];
      const num = clean.includes(".") ? floatToWords(clean) : numberToWords(parseInt(clean, 10));
      return `${num} ${scaleWord} ${unit}s`.trim();
    }

    if (clean.includes(".")) {
      const [intPart, decPart] = clean.split(".");
      const whole = parseInt(intPart, 10);
      // "5" means fifty cents, so pad to two digits; anything past two is dropped.
      const cents = parseInt(decPart.slice(0, 2).padEnd(2, "0"), 10);
      let result = `${numberToWords(whole)} ${unit}${whole !== 1 && unit ? "s" : ""}`;
      if (cents) result += ` and ${numberToWords(cents)} cent${cents !== 1 ? "s" : ""}`;
      return result;
    }

    const val = parseInt(clean, 10);
    return `${numberToWords(val)} ${unit}${val !== 1 && unit ? "s" : ""}`;
  });
}
|
| 184 |
+
|
| 185 |
+
/**
 * Rewrites every RE_TIME match as spoken time.
 *
 * - ":00" with an am/pm suffix -> "<hour> <suffix>"; without one -> "<hour> hundred".
 * - Minutes 1-9 -> "<hour> oh <minute>".
 * - Otherwise   -> "<hour> <minute>".
 * The third capture group is not used; the suffix is lower-cased.
 */
function expandTime(text: string): string {
  return text.replace(RE_TIME, (_match, hourStr, minuteStr, _unused, meridiem) => {
    const hourWords = numberToWords(parseInt(hourStr, 10));
    const minutes = parseInt(minuteStr, 10);
    const tail = meridiem ? ` ${meridiem.toLowerCase()}` : "";

    if (minutes === 0) {
      if (meridiem) return `${hourWords}${tail}`;
      return `${hourWords} hundred${tail}`;
    }

    const minuteWords = numberToWords(minutes);
    const connector = minutes < 10 ? " oh " : " ";
    return `${hourWords}${connector}${minuteWords}${tail}`;
  });
}
|
| 196 |
+
|
| 197 |
+
/** Rewrites every RE_RANGE match as "<low in words> to <high in words>". */
function expandRanges(text: string): string {
  return text.replace(RE_RANGE, (_match, lowStr, highStr) => {
    const low = numberToWords(parseInt(lowStr, 10));
    const high = numberToWords(parseInt(highStr, 10));
    return `${low} to ${high}`;
  });
}
|
| 202 |
+
|
| 203 |
+
/** Rewrites every RE_MODEL_VER match as its two captured groups separated by a single space. */
function expandModelNames(text: string): string {
  return text.replace(RE_MODEL_VER, (_match, modelName, version) => [modelName, version].join(" "));
}
|
| 206 |
+
|
| 207 |
+
/**
 * Rewrites every RE_UNIT match as "<number in words> <unit name>".
 * The unit is looked up case-insensitively in UNIT_MAP; unknown units pass through unchanged.
 */
function expandUnits(text: string): string {
  return text.replace(RE_UNIT, (_match, rawNumber, unitToken) => {
    const unitName = UNIT_MAP[unitToken.toLowerCase()] || unitToken;
    const isDecimal = rawNumber.includes(".");
    const spoken = isDecimal
      ? floatToWords(parseFloat(rawNumber))
      : numberToWords(parseInt(rawNumber, 10));
    return `${spoken} ${unitName}`;
  });
}
|
| 214 |
+
|
| 215 |
+
/**
 * Rewrites every RE_SCALE match as "<number in words> <scale word>".
 * A suffix missing from SCALE_MAP is kept verbatim.
 */
function expandScaleSuffixes(text: string): string {
  return text.replace(RE_SCALE, (_match, rawNumber, scaleLetter) => {
    const magnitude = SCALE_MAP[scaleLetter] || scaleLetter;
    const spoken = rawNumber.includes(".")
      ? floatToWords(rawNumber)
      : numberToWords(parseInt(rawNumber, 10));
    return `${spoken} ${magnitude}`;
  });
}
|
| 222 |
+
|
| 223 |
+
/**
 * Rewrites every RE_SCI match as
 * "<coefficient> times ten to the [negative ]<exponent>".
 */
function expandScientific(text: string): string {
  return text.replace(RE_SCI, (_match, coefficient, exponentStr) => {
    const coefficientWords = coefficient.includes(".")
      ? floatToWords(coefficient)
      : numberToWords(parseInt(coefficient, 10));
    const exponent = parseInt(exponentStr, 10);
    const exponentWords = numberToWords(Math.abs(exponent));
    const polarity = exponent < 0 ? "negative " : "";
    return `${coefficientWords} times ten to the ${polarity}${exponentWords}`;
  });
}
|
| 231 |
+
|
| 232 |
+
/**
 * Rewrites every RE_FRACTION match as "<numerator> <denominator word>".
 *
 * Denominators 2 and 4 use "half/halves" and "quarter/quarters"; every other
 * denominator uses ordinalSuffix(), with a trailing "s" when the numerator is not one.
 * A zero denominator leaves the match untouched.
 */
function expandFractions(text: string): string {
  return text.replace(RE_FRACTION, (whole, numeratorStr, denominatorStr) => {
    const numerator = parseInt(numeratorStr, 10);
    const denominator = parseInt(denominatorStr, 10);
    if (denominator === 0) return whole;

    const numeratorWords = numberToWords(numerator);
    const singular = numerator === 1;

    let denominatorWord: string;
    switch (denominator) {
      case 2:
        denominatorWord = singular ? "half" : "halves";
        break;
      case 4:
        denominatorWord = singular ? "quarter" : "quarters";
        break;
      default:
        denominatorWord = ordinalSuffix(denominator);
        if (!singular) denominatorWord += "s";
        break;
    }
    return `${numeratorWords} ${denominatorWord}`;
  });
}
|
| 248 |
+
|
| 249 |
+
/**
 * Rewrites every RE_DECADE match as a spoken decade.
 *
 * The last digit of the captured base selects the decade word from DECADE_MAP.
 * A single-digit base yields just that word; otherwise the remaining leading
 * digits are spoken first (base 198 -> numberToWords(19) + " eighties").
 */
function expandDecades(text: string): string {
  return text.replace(RE_DECADE, (_match, baseDigits) => {
    const base = parseInt(baseDigits, 10);
    const decadeName = DECADE_MAP[base % 10] || "";
    if (base >= 10) {
      const leading = numberToWords(Math.floor(base / 10));
      return `${leading} ${decadeName}`;
    }
    return decadeName;
  });
}
|
| 258 |
+
|
| 259 |
+
/**
 * Rewrites every remaining RE_NUMBER match as words.
 * Commas are stripped first; values containing "." go through floatToWords(),
 * all others through numberToWords().
 */
function replaceNumbers(text: string): string {
  return text.replace(RE_NUMBER, (matched) => {
    const digits = matched.replace(/,/g, "");
    return digits.includes(".")
      ? floatToWords(digits)
      : numberToWords(parseInt(digits, 10));
  });
}
|
| 266 |
+
|
| 267 |
+
/**
 * Inserts the implicit leading zero into bare decimals: ".5" -> "0.5", "-.5" -> "-0.5".
 * A "." (or "-.") directly preceded by a digit is left alone, so "1.5" is unchanged.
 */
function normalizeLeadingDecimals(text: string): string {
  const signedBareDecimal = /(?<!\d)(-)\.([\d])/g;
  const bareDecimal = /(?<!\d)\.([\d])/g;
  // Replacer callbacks are used instead of "$1"/"$2" templates so the inserted "0"
  // can never be read as part of a two-digit group reference.
  return text
    .replace(signedBareDecimal, (_match, sign, digit) => `${sign}0.${digit}`)
    .replace(bareDecimal, (_match, digit) => `0.${digit}`);
}
|
| 272 |
+
|
| 273 |
+
// ── Cleanup patterns used by preprocessText ──

/** http(s):// links and bare "www." links, up to the next whitespace. */
const RE_URL = /https?:\/\/\S+|www\.\S+/g;

/** E-mail addresses (case-insensitive). */
const RE_EMAIL = /\b[\w.+-]+@[\w-]+\.[a-z]{2,}\b/gi;

/** Anything shaped like an HTML/XML tag. */
const RE_HTML = /<[^>]+>/g;

/** Every character that is not a word char, whitespace, or one of . , ? ! ; : - — – … */
const RE_PUNCT = /[^\w\s.,?!;:\-\u2014\u2013\u2026]/g;

/** Runs of whitespace. */
const RE_SPACES = /\s+/g;
|
| 278 |
+
|
| 279 |
+
/**
 * Normalises free-form text into lower-case, speakable words.
 *
 * Steps, in order:
 *   1. drop URLs and e-mail addresses, replace HTML tags with a space;
 *   2. run the expansion passes (the specific forms come before the generic
 *      number replacement so they see the raw digits first);
 *   3. replace punctuation matched by RE_PUNCT with a space;
 *   4. lower-case, collapse whitespace and trim.
 */
export function preprocessText(text: string): string {
  const stripped = text
    .replace(RE_URL, "")
    .replace(RE_EMAIL, "")
    .replace(RE_HTML, " ");

  // Order matters: each pass consumes patterns a later, more generic pass would also match.
  const passes: Array<(input: string) => string> = [
    normalizeLeadingDecimals,
    expandCurrency,
    expandPercentages,
    expandScientific,
    expandTime,
    expandOrdinals,
    expandUnits,
    expandScaleSuffixes,
    expandFractions,
    expandDecades,
    expandRanges,
    expandModelNames,
    replaceNumbers,
  ];
  const expanded = passes.reduce((current, pass) => pass(current), stripped);

  return expanded
    .replace(RE_PUNCT, " ")
    .toLowerCase()
    .replace(RE_SPACES, " ")
    .trim();
}
|
src/lib/text-cleaner.ts
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* TextCleaner — maps IPA phoneme characters to integer token IDs.
|
| 3 |
+
* Direct port of KittenTTS Python TextCleaner class.
|
| 4 |
+
* https://github.com/KittenML/KittenTTS
|
| 5 |
+
*/
|
| 6 |
+
|
| 7 |
+
const _pad = "$";
|
| 8 |
+
const _punctuation = ';:,.!?¡¿—…"«»"" ';
|
| 9 |
+
const _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
|
| 10 |
+
const _letters_ipa =
|
| 11 |
+
"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ";
|
| 12 |
+
|
| 13 |
+
const symbols = [_pad, ..._punctuation, ..._letters, ..._letters_ipa];
|
| 14 |
+
|
| 15 |
+
// Symbol -> token id lookup: a symbol's id is its position in `symbols`.
// If a symbol appears more than once, the last position wins.
const charToIndex: Record<string, number> = {};
symbols.forEach((symbol, position) => {
  charToIndex[symbol] = position;
});
|
| 19 |
+
|
| 20 |
+
/**
 * Maps each character (code point) of `text` to its token id.
 * Characters with no entry in the symbol table are silently dropped.
 */
export function cleanText(text: string): number[] {
  return Array.from(text, (symbol) => charToIndex[symbol]).filter(
    (tokenId) => tokenId !== undefined
  );
}
|
| 30 |
+
|
| 31 |
+
/**
 * Converts a phoneme string into model input ids.
 * The ids from cleanText() are framed as [0, ...ids, 10, 0], matching the Python
 * reference (insert 0 at the start, append 10, append 0).
 */
export function tokenize(phonemes: string): number[] {
  return [0, ...cleanText(phonemes), 10, 0];
}
|
src/main.tsx
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { StrictMode } from "react";
|
| 2 |
+
import { createRoot } from "react-dom/client";
|
| 3 |
+
import "./index.css";
|
| 4 |
+
import App from "./App";
|
| 5 |
+
|
| 6 |
+
// Mount the app on the #root element (asserted non-null).
const rootElement = document.getElementById("root")!;

createRoot(rootElement).render(
  <StrictMode>
    <App />
  </StrictMode>
);
|
src/vite-env.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/// <reference types="vite/client" />
|
| 2 |
+
|
| 3 |
+
// Typing for `import X from "...?worker"`: the default export is a
// zero-argument constructor that produces a Worker.
declare module "*?worker" {
  const workerConstructor: new () => Worker;
  export default workerConstructor;
}
|
src/worker.ts
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Web Worker — KittenTTS inference via ONNX Runtime Web (WebGPU/WASM).
|
| 3 |
+
*
|
| 4 |
+
* Models: https://huggingface.co/KittenML
|
| 5 |
+
* Phonemizer: https://github.com/xenova/phonemizer.js (Xenova)
|
| 6 |
+
* ONNX Runtime Web: https://onnxruntime.ai
|
| 7 |
+
*/
|
| 8 |
+
|
| 9 |
+
import { tokenize } from "./lib/text-cleaner";
|
| 10 |
+
import { loadVoices, type VoiceInfo } from "./lib/npz-reader";
|
| 11 |
+
|
| 12 |
+
// Dynamic imports — resolved at runtime to avoid Vite dev server transform issues
|
| 13 |
+
let phonemize: (text: string, lang: string) => Promise<string[]>;
|
| 14 |
+
let ort: any;
|
| 15 |
+
|
| 16 |
+
const HF_BASE = "https://huggingface.co";
|
| 17 |
+
const SAMPLE_RATE = 24000;
|
| 18 |
+
|
| 19 |
+
// Int8 quantized models produce NaN on WebGPU; all fp32 models should be fine
|
| 20 |
+
const WEBGPU_BLOCKED_PATTERNS = ["int8"];
|
| 21 |
+
|
| 22 |
+
/** Shape of the `config.json` fetched from the model repository. */
interface ModelConfig {
  /** Reported back to the UI as `modelName` once loading finishes. */
  name: string;
  version: string;
  type: string;
  /** Model identifier; checked against the WebGPU block list. */
  model: string;
  /** Repo-relative path of the ONNX file to download. */
  model_file: string;
  /** Repo-relative path of the voices archive passed to loadVoices(). */
  voices: string;
  /** Per-voice multiplier applied to the requested speed, keyed by voice id. */
  speed_priors: Record<string, number>;
  /** Display name -> voice id. */
  voice_aliases: Record<string, string>;
}
|
| 32 |
+
|
| 33 |
+
let session: any = null;
|
| 34 |
+
let voices: Record<string, VoiceInfo> = {};
|
| 35 |
+
let config: ModelConfig | null = null;
|
| 36 |
+
let currentDevice: "webgpu" | "wasm" = "wasm";
|
| 37 |
+
|
| 38 |
+
/** Builds the Hugging Face "resolve/main" download URL for `filename` inside `repoId`. */
function resolveUrl(repoId: string, filename: string): string {
  return [HF_BASE, repoId, "resolve", "main", filename].join("/");
}
|
| 41 |
+
|
| 42 |
+
/**
 * Reports whether a WebGPU adapter can be obtained.
 * Resolves to false when `navigator.gpu` is absent, no adapter is returned,
 * or anything in the probe throws.
 */
async function detectWebGPU(): Promise<boolean> {
  try {
    const gpuAvailable = "gpu" in navigator;
    if (!gpuAvailable) {
      return false;
    }
    const adapter = await (navigator as any).gpu.requestAdapter();
    return Boolean(adapter);
  } catch {
    return false;
  }
}
|
| 51 |
+
|
| 52 |
+
/**
 * Downloads the ONNX model at `url` into memory.
 *
 * While streaming, posts "status" messages with the percentage and megabytes
 * received (only when the server sent a content-length header).
 *
 * @throws Error when the HTTP response is not OK.
 */
async function downloadModelWithProgress(url: string) {
  const resp = await fetch(url);
  if (!resp.ok) throw new Error(`Failed to fetch model: ${resp.status}`);

  // No readable stream available: fall back to a one-shot read (no progress updates).
  if (!resp.body) return resp.arrayBuffer();

  const contentLength = parseInt(resp.headers.get("content-length") || "0", 10);
  const reader = resp.body.getReader();
  const chunks: Uint8Array[] = [];
  let loaded = 0;

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(value);
    loaded += value.length;
    if (contentLength > 0) {
      // Clamp: content-length may describe an encoded body that is smaller than the bytes read.
      const pct = Math.min(100, Math.round((loaded / contentLength) * 100));
      const mb = (loaded / 1024 / 1024).toFixed(1);
      self.postMessage({
        type: "status",
        message: `Downloading model... ${pct}% (${mb} MB)`,
      });
    }
  }

  // Stitch the received chunks into one contiguous buffer.
  const modelData = new Uint8Array(loaded);
  let offset = 0;
  for (const chunk of chunks) {
    modelData.set(chunk, offset);
    offset += chunk.length;
  }
  return modelData.buffer;
}

/**
 * Loads everything needed for synthesis from the Hugging Face repo `repoId`:
 * runtime modules, config.json, the voices archive and the ONNX model, then
 * creates the inference session.
 *
 * Posts "status" messages along the way, one "device" message once the execution
 * provider is chosen, and a final "ready" message carrying the voice names.
 * Updates the module-level `ort`, `phonemize`, `config`, `currentDevice`,
 * `voices` and `session`.
 *
 * @throws Error when the config or model request fails; other failures from the
 *         imports, loadVoices() or session creation propagate unchanged.
 */
async function loadModel(repoId: string) {
  self.postMessage({ type: "status", message: "Detecting hardware..." });

  const hasWebGPU = await detectWebGPU();

  // Load runtime dependencies
  self.postMessage({ type: "status", message: "Loading runtime..." });
  const [ortModule, phonemizerModule] = await Promise.all([
    import("onnxruntime-web"),
    import("phonemizer"),
  ]);
  ort = ortModule;
  phonemize = phonemizerModule.phonemize;

  // Load config
  self.postMessage({ type: "status", message: "Loading config..." });
  const configUrl = resolveUrl(repoId, "config.json");
  const configResp = await fetch(configUrl);
  // Fail with a clear message instead of a JSON parse error on a 404/500 page.
  if (!configResp.ok) throw new Error(`Failed to fetch config: ${configResp.status}`);
  const loadedConfig = (await configResp.json()) as ModelConfig;
  config = loadedConfig;

  // Int8 quantized models produce NaN on WebGPU — only block those
  const modelName = loadedConfig.model || repoId.split("/").pop() || "";
  const isBlocked = WEBGPU_BLOCKED_PATTERNS.some((p) => modelName.includes(p));
  currentDevice = hasWebGPU && !isBlocked ? "webgpu" : "wasm";

  if (hasWebGPU && isBlocked) {
    console.log(`[KittenTTS] Using WASM for "${modelName}" (int8 models produce NaN on WebGPU)`);
  }

  self.postMessage({ type: "device", device: currentDevice });

  // Load voices (.npz) and ONNX model in parallel
  self.postMessage({ type: "status", message: "Downloading model & voices..." });

  const modelPromise = downloadModelWithProgress(resolveUrl(repoId, loadedConfig.model_file));
  const voicesPromise = loadVoices(resolveUrl(repoId, loadedConfig.voices));

  const [modelBuffer, loadedVoices] = await Promise.all([modelPromise, voicesPromise]);
  voices = loadedVoices;

  // Create ONNX inference session
  self.postMessage({
    type: "status",
    message: `Initializing ${currentDevice.toUpperCase()} session...`,
  });

  const sessionOptions: any = {
    executionProviders: currentDevice === "webgpu" ? ["webgpu"] : ["wasm"],
  };

  if (currentDevice === "wasm") {
    ort.env.wasm.numThreads = 1;
  }

  session = await ort.InferenceSession.create(modelBuffer, sessionOptions);

  // Prefer the alias names; fall back to the raw voice ids when the config
  // has no aliases (missing or empty map) so the UI never gets an empty list.
  const aliasNames = Object.keys(loadedConfig.voice_aliases ?? {});
  const voiceNames = aliasNames.length > 0 ? aliasNames : Object.keys(voices);

  self.postMessage({
    type: "ready",
    voices: voiceNames,
    device: currentDevice,
    modelName: loadedConfig.name,
  });
}
|
| 154 |
+
|
| 155 |
+
/**
 * Trims `text` and, unless it is empty or already ends in one of . ! ? , ; :
 * appends a comma.
 */
function ensurePunctuation(text: string): string {
  const trimmed = text.trim();
  if (trimmed === "") return trimmed;

  const lastChar = trimmed[trimmed.length - 1];
  const alreadyTerminated = ".!?,;:".includes(lastChar);
  return alreadyTerminated ? trimmed : `${trimmed},`;
}
|
| 163 |
+
|
| 164 |
+
/**
 * Splits `text` into chunks of at most `maxLen` characters.
 *
 * Text is first cut at sentence terminators (. ! ?), keeping the punctuation.
 * A sentence longer than `maxLen` is re-packed word by word. Every emitted
 * chunk goes through ensurePunctuation(). A single word longer than `maxLen`
 * is emitted as its own oversized chunk.
 */
function chunkText(text: string, maxLen = 400): string[] {
  const sentences = text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [text];
  const chunks: string[] = [];
  const emit = (piece: string): void => {
    chunks.push(ensurePunctuation(piece));
  };

  for (const rawSentence of sentences) {
    const sentence = rawSentence.trim();
    if (!sentence) continue;

    if (sentence.length <= maxLen) {
      emit(sentence);
      continue;
    }

    // Sentence is too long: greedily pack whole words into chunks.
    let pending = "";
    for (const word of sentence.split(/\s+/)) {
      const fits = pending.length + word.length + 1 <= maxLen;
      if (fits) {
        pending = pending ? `${pending} ${word}` : word;
      } else {
        if (pending) emit(pending);
        pending = word;
      }
    }
    if (pending) emit(pending);
  }
  return chunks;
}
|
| 189 |
+
|
| 190 |
+
/**
 * Splits text into word tokens (runs of letters, digits and "_") and single
 * non-space symbol tokens.
 *
 * Unicode property escapes are used because JS `\w` is ASCII-only, whereas the
 * Python reference's `\w` also covers Unicode letters such as IPA symbols.
 */
function basicTokenize(text: string): string[] {
  const tokenPattern = /[\p{L}\p{N}_]+|[^\p{L}\p{N}_\s]/gu;
  const tokens = text.match(tokenPattern);
  return tokens === null ? [] : tokens;
}
|
| 195 |
+
|
| 196 |
+
/**
 * Synthesises audio for a single text chunk.
 *
 * @param text      Chunk to speak.
 * @param voiceKey  Voice alias or raw voice id; aliases are resolved through
 *                  `config.voice_aliases`.
 * @param speed     Requested speed; multiplied by the voice's entry in
 *                  `config.speed_priors` when one exists.
 * @returns The first output tensor's samples with the last 5000 samples removed.
 * @throws Error when the model is not loaded or the voice is unknown.
 */
async function generateChunk(
  text: string,
  voiceKey: string,
  speed: number
): Promise<Float32Array> {
  if (!session || !config) throw new Error("Model not loaded");

  let voiceId = voiceKey;
  if (config.voice_aliases?.[voiceKey]) {
    voiceId = config.voice_aliases[voiceKey];
  }

  const voiceData = voices[voiceId];
  if (!voiceData) throw new Error(`Voice "${voiceKey}" not found`);

  if (config.speed_priors?.[voiceId]) {
    speed = speed * config.speed_priors[voiceId];
  }

  // Phonemize text using espeak-ng WASM.
  // phonemize() resolves to an array; keep every entry rather than only the
  // first so no part of the chunk is dropped when several are returned.
  const phonemesList = await phonemize(text, "en-us");
  const phonemesRaw = phonemesList.join(" ");
  const phonemeTokens = basicTokenize(phonemesRaw);
  const phonemesJoined = phonemeTokens.join(" ");
  const inputIds = tokenize(phonemesJoined);

  // Select voice style reference based on text length (matches Python logic):
  // row index = text length, clamped to the last available row.
  const refId = Math.min(text.length, voiceData.shape[0] - 1);
  const styleDim = voiceData.shape[1];
  const refStyle = voiceData.data.slice(refId * styleDim, (refId + 1) * styleDim);

  // Create ONNX tensors
  const inputIdsTensor = new ort.Tensor(
    "int64",
    BigInt64Array.from(inputIds.map(BigInt)),
    [1, inputIds.length]
  );
  const styleTensor = new ort.Tensor("float32", refStyle, [1, styleDim]);
  const speedTensor = new ort.Tensor("float32", new Float32Array([speed]), [1]);

  // Run inference
  const results = await session.run({
    input_ids: inputIdsTensor,
    style: styleTensor,
    speed: speedTensor,
  });

  // Get output audio
  const outputKey = session.outputNames[0];
  const audioData = results[outputKey].data as Float32Array;

  // Trim trailing silence (matching Python: audio[..., :-5000])
  return audioData.slice(0, Math.max(0, audioData.length - 5000));
}
|
| 250 |
+
|
| 251 |
+
/**
 * Synthesises `text` chunk by chunk and posts the concatenated audio.
 *
 * Posts one "status" message, a "progress" message before each chunk, and
 * finally an "audio" message whose buffer is transferred (not copied).
 * Any failure is reported as an "error" message instead of being thrown.
 */
async function generate(text: string, voice: string, speed: number) {
  try {
    const chunks = chunkText(text);
    const total = chunks.length;

    self.postMessage({
      type: "status",
      message: `Generating (${chunks.length} chunk${chunks.length > 1 ? "s" : ""})...`,
    });

    const audioChunks: Float32Array[] = [];
    let totalLen = 0;
    for (const [index, chunk] of chunks.entries()) {
      self.postMessage({ type: "progress", current: index + 1, total });
      const audio = await generateChunk(chunk, voice, speed);
      audioChunks.push(audio);
      totalLen += audio.length;
    }

    // Concatenate the per-chunk audio back to back.
    const fullAudio = new Float32Array(totalLen);
    audioChunks.reduce((writeAt, part) => {
      fullAudio.set(part, writeAt);
      return writeAt + part.length;
    }, 0);

    const { buffer } = fullAudio;
    self.postMessage(
      { type: "audio", audio: buffer, sampleRate: SAMPLE_RATE },
      { transfer: [buffer] }
    );
  } catch (err: any) {
    self.postMessage({ type: "error", error: err.message || String(err) });
  }
}
|
| 291 |
+
|
| 292 |
+
// Message handler
|
| 293 |
+
// Dispatches incoming requests by `action`: "load" loads a model repo and
// "generate" synthesises speech. Any other action is ignored.
self.addEventListener("message", async (e) => {
  const { action, ...data } = e.data;

  if (action === "load") {
    try {
      await loadModel(data.repoId);
    } catch (err: any) {
      console.error("[KittenTTS Worker] Load error:", err);
      self.postMessage({ type: "error", error: err.message || String(err) });
    }
    return;
  }

  if (action === "generate") {
    // generate() reports its own failures as "error" messages.
    await generate(data.text, data.voice, data.speed);
  }
});
|
| 309 |
+
|
| 310 |
+
// Forward uncaught worker errors to the main thread as "error" messages.
self.addEventListener("error", (e) => {
  const error = e.message || "Unknown worker error";
  self.postMessage({ type: "error", error });
});

// Same for promise rejections nobody handled.
self.addEventListener("unhandledrejection", (e: any) => {
  const { reason } = e;
  self.postMessage({ type: "error", error: reason?.message || String(reason) });
});
|
style.css
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
body {
|
| 2 |
-
padding: 2rem;
|
| 3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
h1 {
|
| 7 |
-
font-size: 16px;
|
| 8 |
-
margin-top: 0;
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
-
p {
|
| 12 |
-
color: rgb(107, 114, 128);
|
| 13 |
-
font-size: 15px;
|
| 14 |
-
margin-bottom: 10px;
|
| 15 |
-
margin-top: 5px;
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
.card {
|
| 19 |
-
max-width: 620px;
|
| 20 |
-
margin: 0 auto;
|
| 21 |
-
padding: 16px;
|
| 22 |
-
border: 1px solid lightgray;
|
| 23 |
-
border-radius: 16px;
|
| 24 |
-
}
|
| 25 |
-
|
| 26 |
-
.card p:last-child {
|
| 27 |
-
margin-bottom: 0;
|
| 28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tsconfig.app.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo",
|
| 4 |
+
"target": "ES2023",
|
| 5 |
+
"useDefineForClassFields": true,
|
| 6 |
+
"lib": ["ES2023", "DOM", "DOM.Iterable"],
|
| 7 |
+
"module": "ESNext",
|
| 8 |
+
"types": ["vite/client"],
|
| 9 |
+
"skipLibCheck": true,
|
| 10 |
+
|
| 11 |
+
/* Bundler mode */
|
| 12 |
+
"moduleResolution": "bundler",
|
| 13 |
+
"allowImportingTsExtensions": true,
|
| 14 |
+
"verbatimModuleSyntax": true,
|
| 15 |
+
"moduleDetection": "force",
|
| 16 |
+
"noEmit": true,
|
| 17 |
+
"jsx": "react-jsx",
|
| 18 |
+
|
| 19 |
+
/* Linting */
|
| 20 |
+
"strict": true,
|
| 21 |
+
"noUnusedLocals": true,
|
| 22 |
+
"noUnusedParameters": true,
|
| 23 |
+
"erasableSyntaxOnly": true,
|
| 24 |
+
"noFallthroughCasesInSwitch": true,
|
| 25 |
+
"noUncheckedSideEffectImports": true
|
| 26 |
+
},
|
| 27 |
+
"include": ["src"]
|
| 28 |
+
}
|
tsconfig.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"files": [],
|
| 3 |
+
"references": [
|
| 4 |
+
{ "path": "./tsconfig.app.json" },
|
| 5 |
+
{ "path": "./tsconfig.node.json" }
|
| 6 |
+
]
|
| 7 |
+
}
|
tsconfig.node.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"compilerOptions": {
|
| 3 |
+
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo",
|
| 4 |
+
"target": "ES2023",
|
| 5 |
+
"lib": ["ES2023"],
|
| 6 |
+
"module": "ESNext",
|
| 7 |
+
"types": ["node"],
|
| 8 |
+
"skipLibCheck": true,
|
| 9 |
+
|
| 10 |
+
/* Bundler mode */
|
| 11 |
+
"moduleResolution": "bundler",
|
| 12 |
+
"allowImportingTsExtensions": true,
|
| 13 |
+
"verbatimModuleSyntax": true,
|
| 14 |
+
"moduleDetection": "force",
|
| 15 |
+
"noEmit": true,
|
| 16 |
+
|
| 17 |
+
/* Linting */
|
| 18 |
+
"strict": true,
|
| 19 |
+
"noUnusedLocals": true,
|
| 20 |
+
"noUnusedParameters": true,
|
| 21 |
+
"erasableSyntaxOnly": true,
|
| 22 |
+
"noFallthroughCasesInSwitch": true,
|
| 23 |
+
"noUncheckedSideEffectImports": true
|
| 24 |
+
},
|
| 25 |
+
"include": ["vite.config.ts"]
|
| 26 |
+
}
|
vite.config.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { defineConfig } from "vite";
|
| 2 |
+
import react from "@vitejs/plugin-react";
|
| 3 |
+
|
| 4 |
+
// Vite build configuration: React plugin, with web workers emitted as ES modules.
// NOTE(review): the "es" worker format is presumably required because the worker
// uses dynamic import() — confirm against the Vite worker options.
export default defineConfig({
  plugins: [react()],
  worker: { format: "es" },
});
|