<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WASM Streaming Speech Recognition</title>
<style>
@import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap");
html, body { font-family: "Source Sans 3", system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
</style>
<script src="css/tailwind-3.4.17.js"></script>
<script type="module">
const MODEL_ID = "moshi_1b_en_fr_q4k";
// Quantized Kyutai STT model assets hosted on Hugging Face.
const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf";
const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors";
const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json";
const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json";
// Module worker that owns the WASM model (loading + streaming inference).
const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });
// --- Recording state ---
// (Removed unused `mediaRecorder` / `audioChunks`: they were never read or
// written anywhere in this script.)
let isRecording = false;
let audioStream = null;       // MediaStream from getUserMedia
let audioContext = null;      // AudioContext requested at 24 kHz
let processor = null;         // ScriptProcessorNode feeding the worker
let source = null;            // MediaStreamAudioSourceNode
let modelInitialized = false; // worker reported "model_ready"
let pendingStart = false;     // user clicked Start before the model was ready
// --- Performance tracking ---
let audioChunksProcessed = 0; // 1024-sample chunks acknowledged by the worker
let sessionStartTime = 0;     // Date.now() when the current session began
// Display `message` in the status line under the Start/Stop button.
function updateStatusDiv(message) {
  const statusEl = document.querySelector("#status-div");
  statusEl.textContent = message;
}
// Refresh the CPU / real-time-factor diagnostics line.
// Called on page load and polled every 200 ms.
function updateDiagnostics() {
  const panel = document.querySelector("#diagnostics");
  if (!panel) return;
  const cpuCount = navigator.hardwareConcurrency || 'unknown';
  if (isRecording && sessionStartTime) {
    // Real-time factor = seconds of audio processed / wall-clock seconds.
    // >1 means faster than real time, <1 means transcription is lagging.
    const secondsOfAudio = audioChunksProcessed * (1024 / 24000);
    const elapsed = (Date.now() - sessionStartTime) / 1000;
    const rtf = elapsed > 0 ? secondsOfAudio / elapsed : 0;
    // Traffic-light coloring for the factor.
    const factorColor =
      rtf >= 0.95 ? 'text-green-600' :
      rtf >= 0.8  ? 'text-yellow-600' :
                    'text-red-600';
    panel.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${rtf.toFixed(2)}x</span>, Duration: ${elapsed.toFixed(1)}s`;
  } else if (!sessionStartTime) {
    // No session has started yet: show zeroed metrics. After a session
    // ends (sessionStartTime still set) neither branch runs, so the last
    // recorded values stay on screen.
    panel.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
  }
}
// Paint the baseline diagnostics once the page loads, then poll 5x/second
// so the real-time factor updates while recording.
window.addEventListener('load', updateDiagnostics);
setInterval(updateDiagnostics, 200);
// Ask the worker to download and initialize the model. Safe to call more
// than once; a no-op after the model has loaded.
function initializeModel() {
  if (modelInitialized) return;
  // Disable the Start button while the (large) weights download.
  const button = document.querySelector("#speech-button");
  button.disabled = true;
  button.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";
  const request = {
    command: "initialize",
    weightsURL: WEIGHTS_URL,
    modelID: MODEL_ID,
    mimiURL: MIMI_URL,
    tokenizerURL: TOKENIZER_URL,
    configURL: CONFIG_URL,
  };
  moshiWorker.postMessage(request);
}
// Handle messages from worker
// Route messages coming back from the inference worker.
moshiWorker.addEventListener("message", async (event) => {
  const data = event.data;
  switch (data.status) {
    case "model_ready": {
      modelInitialized = true;
      updateStatusDiv("Model loaded - Ready to start");
      const button = document.querySelector("#speech-button");
      button.disabled = false;
      button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
      // If the user clicked Start while the model was still loading,
      // begin recording now.
      if (pendingStart) {
        pendingStart = false;
        await startRecording();
      }
      break;
    }
    case "streaming": {
      // Append the newly decoded word to the live transcript.
      const outputDiv = document.querySelector("#output-generation");
      const placeholder = document.querySelector("#output-placeholder");
      if (placeholder) placeholder.hidden = true;
      outputDiv.textContent = outputDiv.textContent
        ? outputDiv.textContent + " " + data.word
        : data.word;
      outputDiv.hidden = false;
      break;
    }
    case "chunk_processed":
      audioChunksProcessed++;
      break;
    case "loading":
      updateStatusDiv(data.message);
      break;
    default:
      if (data.error) {
        updateStatusDiv("Error: " + data.error);
        pendingStart = false;
      }
  }
});
// NOTE(review): this helper appears to be dead code — nothing in this file
// calls updateStatus(); the worker "message" listener above handles the
// same statuses directly. Kept as-is; confirm before removing.
// Renders worker status payloads: loading/decoding -> status line,
// streaming -> appends `word` to the transcript, complete -> "Ready".
function updateStatus(data) {
const { status, message, word } = data;
const outputDiv = document.querySelector("#output-generation");
if (status === "loading" || status === "decoding") {
updateStatusDiv(message || (status === "loading" ? "Loading..." : "Decoding..."));
} else if (status === "streaming") {
// Add new word to the transcription in real-time
if (outputDiv.textContent) {
outputDiv.textContent += " " + word;
} else {
outputDiv.textContent = word;
}
outputDiv.hidden = false;
} else if (status === "complete") {
updateStatusDiv("Ready");
}
}
// Request microphone access and build the capture graph:
//   mic stream -> MediaStreamSource -> ScriptProcessor(1024 samples, mono)
// Each processed buffer is copied and transferred to the worker.
// NOTE(review): createScriptProcessor is deprecated in favor of
// AudioWorkletNode; migrating would require a separate worklet file.
// NOTE(review): the 24 kHz sampleRate is a request — some browsers may not
// honor it, which would skew the real-time-factor math; confirm.
// Throws (after updating the status line) if getUserMedia is denied.
async function startMicrophone() {
try {
audioStream = await navigator.mediaDevices.getUserMedia({ audio: true });
updateStatusDiv("Microphone access granted");
audioContext = new AudioContext({ sampleRate: 24000 });
source = audioContext.createMediaStreamSource(audioStream);
processor = audioContext.createScriptProcessor(1024, 1, 1);
processor.onaudioprocess = function(event) {
// Drop audio until both the model is ready and a session is active.
if (!isRecording || !modelInitialized) return;
const inputBuffer = event.inputBuffer;
const inputData = inputBuffer.getChannelData(0);
// Send audio chunk to worker. Copy first: the engine reuses the input
// buffer, and the copy's ArrayBuffer is transferred (zero-copy handoff).
const audioChunk = new Float32Array(inputData);
moshiWorker.postMessage({
command: "process_audio",
audioData: audioChunk
}, [audioChunk.buffer]);
};
source.connect(processor);
// ScriptProcessor must be connected to a destination to fire events.
processor.connect(audioContext.destination);
} catch (error) {
updateStatusDiv("Microphone access denied: " + error.message);
throw error;
}
}
// Tear down the audio graph and release the microphone.
// Safe to call when nothing is active (every step is null-guarded).
function stopMicrophone() {
  // Disconnect the processor first so no further chunks reach the worker.
  if (processor) {
    processor.disconnect();
    processor = null;
  }
  if (source) {
    source.disconnect();
    source = null;
  }
  if (audioContext) {
    // Fix: close() returns a Promise; handle rejection (e.g. context
    // already closed) so it doesn't surface as an unhandled rejection.
    audioContext.close().catch(() => {});
    audioContext = null;
  }
  // Stop the media stream tracks so the browser's mic indicator turns off.
  if (audioStream) {
    audioStream.getTracks().forEach(track => track.stop());
    audioStream = null;
  }
  updateStatusDiv("Microphone stopped");
}
// Begin a transcription session: open the mic, reset counters, tell the
// worker to start streaming, and switch the UI into "recording" mode.
async function startRecording() {
  const button = document.querySelector("#speech-button");
  try {
    updateStatusDiv("Requesting microphone access...");
    await startMicrophone();
    // Reset performance counters for the new session.
    audioChunksProcessed = 0;
    sessionStartTime = Date.now();
    // Open a fresh streaming session in the worker.
    moshiWorker.postMessage({ command: "start_stream" });
    isRecording = true;
    button.textContent = "Stop Speech";
    button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded";
    updateStatusDiv("Listening...");
    // Clear any transcription left over from the previous session.
    const outputDiv = document.querySelector("#output-generation");
    outputDiv.textContent = "";
    outputDiv.hidden = true;
    document.querySelector("#output-placeholder").hidden = true;
  } catch (error) {
    console.error('Error starting microphone:', error);
    updateStatusDiv("Error: " + error.message);
    pendingStart = false;
  }
}
// Toggle recording from the main button. First click may need to load the
// model; in that case we remember the intent and start once it's ready.
document.querySelector("#speech-button").addEventListener("click", async () => {
  const button = document.querySelector("#speech-button");
  if (isRecording) {
    // Stop: tear down the mic, end the worker session, restore the UI.
    stopMicrophone();
    moshiWorker.postMessage({ command: "stop_stream" });
    isRecording = false;
    button.textContent = "Start Speech";
    button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
    updateStatusDiv("Ready to start");
    return;
  }
  // Start: defer until the model has loaded if necessary.
  if (!modelInitialized) {
    pendingStart = true;
    initializeModel();
    return;
  }
  await startRecording();
});
</script>
</head>
<body class="container max-w-4xl mx-auto p-4">
<main class="grid grid-cols-1 gap-8 relative">
<div>
<h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1>
<p class="text-gray-700">
Transcribe audio from your microphone in real time in the browser using Rust/WASM.
This demo runs entirely offline on your CPU after downloading a ~950 MB model.
It understands English and French, and uses the
<a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" class="underline hover:text-blue-600">Kyutai STT model</a>
together with a WASM runtime built in
<a href="https://github.com/huggingface/candle/" target="_blank" class="underline hover:text-blue-600">Candle</a>.
</p>
</div>
<div>
<button id="speech-button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded">
Start Speech
</button>
<div class="mt-2 text-gray-600 text-sm space-y-1">
<div>Status: <span id="status-div">Click "Start Speech" to begin</span></div>
<div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div>
</div>
</div>
<div>
<h3 class="font-medium">Transcription:</h3>
<div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md">
<p id="output-generation" hidden></p>
<span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span>
</div>
</div>
<div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md">
💡 This demo shows offline transcription in your browser.
For more accurate cloud transcription and real-time LLM grammar correction, check out
<a href="https://voicewriter.io" target="_blank" class="underline hover:text-blue-600">Voice Writer</a>.
</div>
</main>
</body>
</html>