wasm-streaming-speech / index.html
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>WASM Streaming Speech Recognition</title>
<style>
@import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap");
html, body { font-family: "Source Sans 3", system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
</style>
<script src="css/tailwind-3.4.17.js"></script>
<script type="module">
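// Assets for the quantized Kyutai STT (Moshi) model: q4k weights in GGUF, the Mimi audio codec, the tokenizer, and the model config, all hosted on Hugging Face and handed to the worker at initialization.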
const MODEL_ID = "moshi_1b_en_fr_q4k";
const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf";
const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors";
const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json";
const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json";
const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });
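// The worker (moshiWorker.js) owns model loading and inference so the UI thread stays responsive.
// Sent to the worker:  { command: "initialize", ...asset URLs }, { command: "start_stream" },
//                      { command: "process_audio", audioData }, { command: "stop_stream" }
// Received from it:    { status: "loading", message }, { status: "model_ready" },
//                      { status: "streaming", word }, { status: "chunk_processed" }, { error }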
let isRecording = false;
let audioStream = null;
let audioContext = null;
let processor = null;
let source = null;
let modelInitialized = false;
let pendingStart = false;
// Performance tracking
let audioChunksProcessed = 0;
let sessionStartTime = 0;
function updateStatusDiv(message) {
document.querySelector("#status-div").textContent = message;
}
function updateDiagnostics() {
const diagnostics = document.querySelector("#diagnostics");
if (!diagnostics) return;
const cpuCount = navigator.hardwareConcurrency || 'unknown';
// Only update metrics when recording, otherwise show final values
if (isRecording && sessionStartTime) {
// Calculate real-time factor (audio processed / wall clock time)
// >1 = faster than real-time, <1 = slower than real-time
const audioProcessed = audioChunksProcessed * (1024 / 24000);
const audioSessionDuration = (Date.now() - sessionStartTime) / 1000;
const realTimeFactor = audioSessionDuration > 0 ? (audioProcessed / audioSessionDuration) : 0;
// Color code based on performance
let factorColor = '';
if (realTimeFactor >= 0.95) {
factorColor = 'text-green-600';
} else if (realTimeFactor >= 0.8) {
factorColor = 'text-yellow-600';
} else {
factorColor = 'text-red-600';
}
diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${realTimeFactor.toFixed(2)}x</span>, Duration: ${audioSessionDuration.toFixed(1)}s`;
} else if (!sessionStartTime) {
diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
}
}
window.addEventListener('load', updateDiagnostics);
setInterval(updateDiagnostics, 200);
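// Ask the worker to download and set up the model; the button stays disabled until the worker reports "model_ready".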
function initializeModel() {
if (modelInitialized) return;
const button = document.querySelector("#speech-button");
button.disabled = true;
button.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";
moshiWorker.postMessage({
command: "initialize",
weightsURL: WEIGHTS_URL,
modelID: MODEL_ID,
mimiURL: MIMI_URL,
tokenizerURL: TOKENIZER_URL,
configURL: CONFIG_URL,
});
}
// Handle messages from worker
moshiWorker.addEventListener("message", async (event) => {
const data = event.data;
if (data.status === "model_ready") {
modelInitialized = true;
updateStatusDiv("Model loaded - Ready to start");
const button = document.querySelector("#speech-button");
button.disabled = false;
button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
if (pendingStart) {
pendingStart = false;
await startRecording();
}
} else if (data.status === "streaming") {
// Add new word to transcription in real-time
const outputDiv = document.querySelector("#output-generation");
const placeholder = document.querySelector("#output-placeholder");
if (placeholder) placeholder.hidden = true;
if (outputDiv.textContent) {
outputDiv.textContent += " " + data.word;
} else {
outputDiv.textContent = data.word;
}
outputDiv.hidden = false;
} else if (data.status === "chunk_processed") {
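// Counted so updateDiagnostics() can compute the real-time factor.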
audioChunksProcessed++;
} else if (data.status === "loading") {
updateStatusDiv(data.message);
} else if (data.error) {
updateStatusDiv("Error: " + data.error);
pendingStart = false;
}
});
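// Alternative status handler, currently unused: the worker "message" listener above handles these statuses directly.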
function updateStatus(data) {
const { status, message, word } = data;
const outputDiv = document.querySelector("#output-generation");
if (status === "loading" || status === "decoding") {
updateStatusDiv(message || (status === "loading" ? "Loading..." : "Decoding..."));
} else if (status === "streaming") {
// Add new word to the transcription in real-time
if (outputDiv.textContent) {
outputDiv.textContent += " " + word;
} else {
outputDiv.textContent = word;
}
outputDiv.hidden = false;
} else if (status === "complete") {
updateStatusDiv("Ready");
}
}
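// Open the microphone and stream raw 24 kHz mono Float32 PCM to the worker in 1024-sample chunks.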
async function startMicrophone() {
try {
audioStream = await navigator.mediaDevices.getUserMedia({ audio: true });
updateStatusDiv("Microphone access granted");
audioContext = new AudioContext({ sampleRate: 24000 });
source = audioContext.createMediaStreamSource(audioStream);
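// ScriptProcessorNode is deprecated in favor of AudioWorklet, but it is still widely supported and keeps this capture path simple.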
processor = audioContext.createScriptProcessor(1024, 1, 1);
processor.onaudioprocess = function(event) {
if (!isRecording || !modelInitialized) return;
const inputBuffer = event.inputBuffer;
const inputData = inputBuffer.getChannelData(0);
// Send audio chunk to worker
const audioChunk = new Float32Array(inputData);
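// Copy the samples out of the reusable input buffer; the copy's ArrayBuffer is transferred (not cloned) to the worker.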
moshiWorker.postMessage({
command: "process_audio",
audioData: audioChunk
}, [audioChunk.buffer]);
};
source.connect(processor);
processor.connect(audioContext.destination);
} catch (error) {
updateStatusDiv("Microphone access denied: " + error.message);
throw error;
}
}
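// Tear down the audio graph and release the microphone.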
function stopMicrophone() {
// Disconnect audio nodes
if (processor) {
processor.disconnect();
processor = null;
}
if (source) {
source.disconnect();
source = null;
}
if (audioContext) {
audioContext.close();
audioContext = null;
}
// Stop media stream
if (audioStream) {
audioStream.getTracks().forEach(track => track.stop());
audioStream = null;
}
updateStatusDiv("Microphone stopped");
}
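// Start a streaming session: open the microphone, reset the performance counters, tell the worker to begin a stream, and clear any previous transcript.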
async function startRecording() {
const button = document.querySelector("#speech-button");
try {
updateStatusDiv("Requesting microphone access...");
await startMicrophone();
// Reset performance counters
audioChunksProcessed = 0;
sessionStartTime = Date.now();
// Start streaming session
moshiWorker.postMessage({ command: "start_stream" });
isRecording = true;
button.textContent = "Stop Speech";
button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded";
updateStatusDiv("Listening...");
// Clear previous transcription
document.querySelector("#output-generation").textContent = "";
document.querySelector("#output-generation").hidden = true;
document.querySelector("#output-placeholder").hidden = true;
} catch (error) {
console.error('Error starting microphone:', error);
updateStatusDiv("Error: " + error.message);
pendingStart = false;
}
}
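// Toggle handler: the first click initializes the model (recording auto-starts via pendingStart once it is ready); later clicks start or stop streaming.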
document.querySelector("#speech-button").addEventListener("click", async () => {
const button = document.querySelector("#speech-button");
if (!isRecording) {
// Check if model is ready
if (!modelInitialized) {
pendingStart = true;
initializeModel();
return;
}
await startRecording();
} else {
stopMicrophone();
// End streaming session
moshiWorker.postMessage({ command: "stop_stream" });
isRecording = false;
button.textContent = "Start Speech";
button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
updateStatusDiv("Ready to start");
}
});
</script>
</head>
<body class="container max-w-4xl mx-auto p-4">
<main class="grid grid-cols-1 gap-8 relative">
<div>
<h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1>
<p class="text-gray-700">
Transcribe audio from your microphone in real time, directly in the browser, using Rust compiled to WebAssembly.
This demo runs entirely on your CPU and, once the ~950 MB model has been downloaded, works fully offline.
It understands English and French, and uses the
<a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" class="underline hover:text-blue-600">Kyutai STT model</a>
together with a WASM runtime built with
<a href="https://github.com/huggingface/candle/" target="_blank" class="underline hover:text-blue-600">Candle</a>.
</p>
</div>
<div>
<button id="speech-button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded">
Start Speech
</button>
<div class="mt-2 text-gray-600 text-sm space-y-1">
<div>Status: <span id="status-div">Click "Start Speech" to begin</span></div>
<div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div>
</div>
</div>
<div>
<h3 class="font-medium">Transcription:</h3>
<div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md">
<p id="output-generation" hidden></p>
<span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span>
</div>
</div>
<div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md">
💡 This demo shows offline transcription in your browser.
For more accurate cloud transcription and real-time LLM grammar correction, check out
<a href="https://voicewriter.io" target="_blank" class="underline hover:text-blue-600">Voice Writer</a>.
</div>
</main>
</body>
</html>