<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>WASM Streaming Speech Recognition</title>
  <style>
    @import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap");
    html, body { font-family: "Source Sans 3", system-ui, -apple-system, "Segoe UI", Roboto, sans-serif; }
  </style>
  <script src="css/tailwind-3.4.17.js"></script>
  <script type="module">
    const MODEL_ID = "moshi_1b_en_fr_q4k";
    const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf";
    const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors";
    const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json";
    const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json";

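    // moshiWorker.js (not shown here) wraps the Candle WASM inference in a Web
    // Worker so decoding never blocks the UI thread. The page sends it
    // { command: "initialize" | "start_stream" | "process_audio" | "stop_stream" }
    // messages; it replies with { status: "loading" | "model_ready" |
    // "streaming" | "chunk_processed" } or { error }.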
    const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });

    let isRecording = false;
    let audioStream = null;
    let audioContext = null;
    let processor = null;
    let source = null;
    let modelInitialized = false;
    let pendingStart = false;

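    // Diagnostics: each processed chunk is 1024 samples at 24 kHz (~42.7 ms of
    // audio), so chunk count * (1024 / 24000) gives seconds of audio handled.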
    let audioChunksProcessed = 0;
    let sessionStartTime = 0;

    function updateStatusDiv(message) {
      document.querySelector("#status-div").textContent = message;
    }

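    // Report CPU count and the real-time factor: seconds of audio processed per
    // second of wall-clock time. A factor near 1.0 means transcription is
    // keeping up with the microphone.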
    function updateDiagnostics() {
      const diagnostics = document.querySelector("#diagnostics");
      if (!diagnostics) return;

      const cpuCount = navigator.hardwareConcurrency || 'unknown';

      if (isRecording && sessionStartTime) {
        const audioProcessed = audioChunksProcessed * (1024 / 24000);
        const audioSessionDuration = (Date.now() - sessionStartTime) / 1000;
        const realTimeFactor = audioSessionDuration > 0 ? (audioProcessed / audioSessionDuration) : 0;

        let factorColor = '';
        if (realTimeFactor >= 0.95) {
          factorColor = 'text-green-600';
        } else if (realTimeFactor >= 0.8) {
          factorColor = 'text-yellow-600';
        } else {
          factorColor = 'text-red-600';
        }

        diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${realTimeFactor.toFixed(2)}x</span>, Duration: ${audioSessionDuration.toFixed(1)}s`;
      } else if (!sessionStartTime) {
        diagnostics.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
      }
    }

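    // Refresh the diagnostics readout on load and every 200 ms thereafter.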
    window.addEventListener('load', updateDiagnostics);
    setInterval(updateDiagnostics, 200);

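    // One-time model download and initialization in the worker; the button
    // stays disabled (grayed out) until the worker reports model_ready.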
    function initializeModel() {
      if (modelInitialized) return;

      const button = document.querySelector("#speech-button");
      button.disabled = true;
      button.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";

      moshiWorker.postMessage({
        command: "initialize",
        weightsURL: WEIGHTS_URL,
        modelID: MODEL_ID,
        mimiURL: MIMI_URL,
        tokenizerURL: TOKENIZER_URL,
        configURL: CONFIG_URL,
      });
    }

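    // Route worker events back into the UI: loading progress, the ready signal,
    // streamed words, per-chunk acknowledgements, and errors.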
    moshiWorker.addEventListener("message", async (event) => {
      const data = event.data;
      if (data.status === "model_ready") {
        modelInitialized = true;
        updateStatusDiv("Model loaded - Ready to start");

        const button = document.querySelector("#speech-button");
        button.disabled = false;
        button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";

        // If the user clicked "Start Speech" before the model finished loading,
        // begin recording now.
        if (pendingStart) {
          pendingStart = false;
          await startRecording();
        }
      } else if (data.status === "streaming") {
        const outputDiv = document.querySelector("#output-generation");
        const placeholder = document.querySelector("#output-placeholder");

        if (placeholder) placeholder.hidden = true;

        // Append each decoded word to the running transcript.
        if (outputDiv.textContent) {
          outputDiv.textContent += " " + data.word;
        } else {
          outputDiv.textContent = data.word;
        }
        outputDiv.hidden = false;
      } else if (data.status === "chunk_processed") {
        audioChunksProcessed++;
      } else if (data.status === "loading") {
        updateStatusDiv(data.message);
      } else if (data.error) {
        updateStatusDiv("Error: " + data.error);
        pendingStart = false;
        // Re-enable the button so the user can retry after a failed load.
        const button = document.querySelector("#speech-button");
        button.disabled = false;
        button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
      }
    });

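    // Capture microphone audio and forward it to the worker in small chunks.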
    async function startMicrophone() {
      try {
        audioStream = await navigator.mediaDevices.getUserMedia({ audio: true });
        updateStatusDiv("Microphone access granted");

        // Request a 24 kHz context to match the model's expected sample rate.
        audioContext = new AudioContext({ sampleRate: 24000 });
        source = audioContext.createMediaStreamSource(audioStream);

        // ScriptProcessorNode is deprecated in favor of AudioWorklet, but it is
        // simple and still widely supported; each callback delivers 1024 samples.
        processor = audioContext.createScriptProcessor(1024, 1, 1);

        processor.onaudioprocess = function(event) {
          if (!isRecording || !modelInitialized) return;

          const inputData = event.inputBuffer.getChannelData(0);

          // Copy the samples out of the engine-owned buffer, then hand the copy
          // to the worker as a transferable (zero-copy) object.
          const audioChunk = new Float32Array(inputData);
          moshiWorker.postMessage({
            command: "process_audio",
            audioData: audioChunk
          }, [audioChunk.buffer]);
        };

        source.connect(processor);
        processor.connect(audioContext.destination);
      } catch (error) {
        updateStatusDiv("Microphone access denied: " + error.message);
        throw error;
      }
    }

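    // Tear down the audio graph and release the microphone tracks so the
    // browser's recording indicator turns off.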
    function stopMicrophone() {
      if (processor) {
        processor.disconnect();
        processor = null;
      }
      if (source) {
        source.disconnect();
        source = null;
      }
      if (audioContext) {
        audioContext.close();
        audioContext = null;
      }

      if (audioStream) {
        audioStream.getTracks().forEach(track => track.stop());
        audioStream = null;
      }

      updateStatusDiv("Microphone stopped");
    }

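    // Begin a capture session: open the microphone, reset the diagnostics,
    // tell the worker to open a stream, and switch the UI into recording mode.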
    async function startRecording() {
      const button = document.querySelector("#speech-button");

      try {
        updateStatusDiv("Requesting microphone access...");
        await startMicrophone();

        // Reset the diagnostics counters for this session.
        audioChunksProcessed = 0;
        sessionStartTime = Date.now();

        moshiWorker.postMessage({ command: "start_stream" });

        isRecording = true;
        button.textContent = "Stop Speech";
        button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded";
        updateStatusDiv("Listening...");

        // Clear any transcript from a previous session.
        document.querySelector("#output-generation").textContent = "";
        document.querySelector("#output-generation").hidden = true;
        document.querySelector("#output-placeholder").hidden = true;
      } catch (error) {
        console.error('Error starting microphone:', error);
        updateStatusDiv("Error: " + error.message);
        pendingStart = false;
      }
    }

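    // The button toggles between starting and stopping a session.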
    document.querySelector("#speech-button").addEventListener("click", async () => {
      const button = document.querySelector("#speech-button");

      if (!isRecording) {
        // First click: initialize the model, then start automatically once the
        // worker reports model_ready (see pendingStart above).
        if (!modelInitialized) {
          pendingStart = true;
          initializeModel();
          return;
        }

        await startRecording();
      } else {
        stopMicrophone();

        moshiWorker.postMessage({ command: "stop_stream" });

        isRecording = false;
        button.textContent = "Start Speech";
        button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
        updateStatusDiv("Ready to start");
      }
    });
  </script>
</head>
<body class="container max-w-4xl mx-auto p-4">
  <main class="grid grid-cols-1 gap-8 relative">
    <div>
      <h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1>
      <p class="text-gray-700">
        Transcribe audio from your microphone in real time in the browser using Rust/WASM.
        Everything runs on your CPU, and after a one-time ~950 MB model download the demo works fully offline.
        It understands English and French, and uses the
        <a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" class="underline hover:text-blue-600">Kyutai STT model</a>
        together with a WASM runtime built with
        <a href="https://github.com/huggingface/candle/" target="_blank" class="underline hover:text-blue-600">Candle</a>.
      </p>
    </div>

    <div>
      <button id="speech-button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded">
        Start Speech
      </button>
      <div class="mt-2 text-gray-600 text-sm space-y-1">
        <div>Status: <span id="status-div">Click "Start Speech" to begin</span></div>
        <div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div>
      </div>
    </div>

    <div>
      <h3 class="font-medium">Transcription:</h3>
      <div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md">
        <p id="output-generation" hidden></p>
        <span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span>
      </div>
    </div>

    <div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md">
      💡 This demo shows offline transcription in your browser.
      For more accurate cloud transcription and real-time LLM grammar correction, check out
      <a href="https://voicewriter.io" target="_blank" class="underline hover:text-blue-600">Voice Writer</a>.
    </div>
  </main>
</body>
</html>