Spaces:

efficient-nlp
/

wasm-streaming-speech

Running

File size: 11,731 Bytes

0a01d76
7a253f4
0a01d76

<!DOCTYPE html>
<!-- lang declares the language of the page text so screen readers and translation tools handle it correctly. -->
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>WASM Streaming Speech Recognition</title>
    <style>
      /* Web font for the whole page, with system fonts as fallback. */
      @import url("https://fonts.googleapis.com/css2?family=Source+Sans+3:wght@300;400;600;700&display=swap");
      html, body { font-family: "Source Sans 3", system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
    </style>
    <!-- Loaded from a page-relative path; judging by its name this is the Tailwind 3.4.17 script that provides the utility classes used in the markup (confirm). -->
    <script src="css/tailwind-3.4.17.js"></script>
    <script type="module">
      // Model identifier and artifact URLs; initializeModel() passes them to the
      // worker in its "initialize" message.
      const MODEL_ID = "moshi_1b_en_fr_q4k";
      const CONFIG_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/config.json";
      const TOKENIZER_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/tokenizer_en_fr_audio_8000.json";
      const MIMI_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/mimi-pytorch-e351c8d8@125.safetensors";
      const WEIGHTS_URL = "https://huggingface.co/efficient-nlp/stt-1b-en_fr-quantized/resolve/main/model-q4k.gguf";

      // Module worker that receives the commands posted from this page.
      const moshiWorker = new Worker("./moshiWorker.js", { type: "module" });

      // Microphone capture objects; each is null while the microphone is off.
      let audioStream = null,
          audioContext = null,
          source = null,
          processor = null;

      // Session flags.
      //   isRecording      - a streaming session is active
      //   modelInitialized - the worker has reported "model_ready"
      //   pendingStart     - the user clicked before the model was ready
      let isRecording = false,
          modelInitialized = false,
          pendingStart = false;

      // Declared but not referenced anywhere else in this script.
      let mediaRecorder = null,
          audioChunks = [];

      // Performance tracking: chunks acknowledged by the worker and the
      // Date.now() timestamp at which the current session started (0 = none yet).
      let audioChunksProcessed = 0,
          sessionStartTime = 0;

      // Show `message` as the text content of the #status-div element.
      function updateStatusDiv(message) {
        const statusElement = document.querySelector("#status-div");
        statusElement.textContent = message;
      }

      // Refresh the #diagnostics line (CPU count, real-time factor, duration).
      //
      // - No session started yet: show zeroed placeholders.
      // - Recording: recompute the metrics from the live counters.
      // - Stopped after a session: leave the last rendered values in place.
      function updateDiagnostics() {
        const panel = document.querySelector("#diagnostics");
        if (!panel) return;

        const cpuCount = navigator.hardwareConcurrency || 'unknown';

        if (!sessionStartTime) {
          panel.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="text-gray-600">0.00x</span>, Duration: 0.0s`;
          return;
        }

        // A session has run but is no longer active: keep the final numbers.
        if (!isRecording) return;

        // Real-time factor = seconds of audio processed / wall-clock seconds.
        // >1 = faster than real-time, <1 = slower than real-time.
        // Each acknowledged chunk is counted as 1024 samples at 24000 Hz, the
        // buffer size and sample rate configured in startMicrophone().
        const elapsedSeconds = (Date.now() - sessionStartTime) / 1000;
        const audioSeconds = audioChunksProcessed * (1024 / 24000);
        const realTimeFactor = elapsedSeconds > 0 ? (audioSeconds / elapsedSeconds) : 0;

        // Colour code: green when keeping up, yellow when slightly behind, red otherwise.
        const factorColor =
          realTimeFactor >= 0.95 ? 'text-green-600'
          : realTimeFactor >= 0.8 ? 'text-yellow-600'
          : 'text-red-600';

        panel.innerHTML = `CPUs: ${cpuCount}, Real-time factor: <span class="${factorColor}">${realTimeFactor.toFixed(2)}x</span>, Duration: ${elapsedSeconds.toFixed(1)}s`;
      }

      // Render once when the page has loaded, then refresh every 200 ms.
      window.addEventListener('load', updateDiagnostics);
      setInterval(updateDiagnostics, 200);

      // Ask the worker to load the model. Does nothing once the model is ready.
      // The speech button is disabled and greyed out here; the worker message
      // handler re-enables it when "model_ready" arrives.
      function initializeModel() {
        if (modelInitialized) return;

        const speechButton = document.querySelector("#speech-button");
        speechButton.disabled = true;
        speechButton.className = "bg-gray-400 text-gray-700 font-normal py-2 px-4 rounded cursor-not-allowed";

        const initRequest = {
          command: "initialize",
          weightsURL: WEIGHTS_URL,
          modelID: MODEL_ID,
          mimiURL: MIMI_URL,
          tokenizerURL: TOKENIZER_URL,
          configURL: CONFIG_URL,
        };
        moshiWorker.postMessage(initRequest);
      }

      // Handle messages from worker.
      //
      // Messages are dispatched on `data.status`, falling back to `data.error`:
      //   "model_ready"     - mark the model loaded, re-enable the button and, if the
      //                       user already clicked, start recording straight away
      //   "streaming"       - append `data.word` to the transcription
      //   "chunk_processed" - count one processed audio chunk (used by diagnostics)
      //   "loading"         - show the worker's progress message
      //   data.error        - show the error and drop any queued start request
      moshiWorker.addEventListener("message", async (event) => {
        const data = event.data;
        if (data.status === "model_ready") {
          modelInitialized = true;
          updateStatusDiv("Model loaded - Ready to start");
          
          const button = document.querySelector("#speech-button");
          button.disabled = false;
          button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          
          if (pendingStart) {
            pendingStart = false;
            await startRecording();
          }
        } else if (data.status === "streaming") {
          // Add new word to transcription in real-time
          const outputDiv = document.querySelector("#output-generation");
          const placeholder = document.querySelector("#output-placeholder");
          
          if (placeholder) placeholder.hidden = true;
          
          if (outputDiv.textContent) {
            outputDiv.textContent += " " + data.word;
          } else {
            outputDiv.textContent = data.word;
          }
          outputDiv.hidden = false;
        } else if (data.status === "chunk_processed") {
          audioChunksProcessed++;
        } else if (data.status === "loading") {
          updateStatusDiv(data.message);
        } else if (data.error) {
          updateStatusDiv("Error: " + data.error);
          pendingStart = false;

          // initializeModel() disabled the button while the model loads. If the
          // error arrives before "model_ready", nothing else would re-enable it
          // and the page would be stuck; restore it so the user can retry.
          // NOTE(review): assumes the worker accepts a second "initialize"
          // command after a failed one - confirm against moshiWorker.js.
          if (!modelInitialized) {
            const button = document.querySelector("#speech-button");
            button.disabled = false;
            button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          }
        }
      });

      // Apply a worker-style status object ({ status, message, word }) to the UI.
      //   "loading" / "decoding" - show `message`, or a default label if it is empty
      //   "streaming"            - append `word` to the transcription and reveal it
      //   "complete"             - show "Ready"
      // Any other status is ignored.
      // Note: nothing in this script calls this function.
      function updateStatus(data) {
        const { status, message, word } = data;

        switch (status) {
          case "loading":
          case "decoding":
            updateStatusDiv(message || (status === "loading" ? "Loading..." : "Decoding..."));
            break;

          case "streaming": {
            // Add new word to the transcription in real-time
            const transcript = document.querySelector("#output-generation");
            transcript.textContent = transcript.textContent
              ? transcript.textContent + " " + word
              : word;
            transcript.hidden = false;
            break;
          }

          case "complete":
            updateStatusDiv("Ready");
            break;
        }
      }

      // Open the microphone and start forwarding audio to the worker.
      //
      // Requests an audio stream, builds a 24000 Hz AudioContext and routes the
      // stream through a mono ScriptProcessor with 1024-frame buffers. While a
      // session is active and the model is ready, each buffer is copied and posted
      // to the worker as a "process_audio" command.
      //
      // On any failure the partially built capture state is released, the status
      // line is updated, and the error is rethrown for the caller to handle.
      async function startMicrophone() {
        try {
          audioStream = await navigator.mediaDevices.getUserMedia({ audio: true });
          updateStatusDiv("Microphone access granted");
          
          audioContext = new AudioContext({ sampleRate: 24000 });
          source = audioContext.createMediaStreamSource(audioStream);
          
          processor = audioContext.createScriptProcessor(1024, 1, 1);
          
          processor.onaudioprocess = function(event) {
            if (!isRecording || !modelInitialized) return;
            
            const inputBuffer = event.inputBuffer;
            const inputData = inputBuffer.getChannelData(0);
            
            // Send audio chunk to worker. The samples are copied into a fresh
            // array so its buffer can be transferred rather than cloned.
            const audioChunk = new Float32Array(inputData);
            moshiWorker.postMessage({
              command: "process_audio",
              audioData: audioChunk
            }, [audioChunk.buffer]);
          };
          
          source.connect(processor);
          // Presumably connected to the destination so the graph pulls the node
          // and onaudioprocess fires - confirm.
          processor.connect(audioContext.destination);
          
        } catch (error) {
          // The failure may come after getUserMedia succeeded (for example the
          // AudioContext constructor throwing). Release whatever was acquired so
          // the microphone is not left open with nothing able to stop it.
          stopMicrophone();
          updateStatusDiv("Microphone access denied: " + error.message);
          throw error;
        }
      }
      
      // Release the microphone: disconnect the audio nodes, close the audio
      // context, stop every track of the media stream, and reset the
      // corresponding module variables to null. Safe to call when some or all
      // of them are already null.
      function stopMicrophone() {
        // Tear down the audio graph, processor first.
        if (processor) {
          processor.disconnect();
          processor = null;
        }

        if (source) {
          source.disconnect();
          source = null;
        }

        if (audioContext) {
          audioContext.close();
          audioContext = null;
        }

        // Stop the underlying media tracks.
        if (audioStream) {
          for (const track of audioStream.getTracks()) {
            track.stop();
          }
          audioStream = null;
        }

        updateStatusDiv("Microphone stopped");
      }

      // True while a startRecording() call is still awaiting microphone setup.
      let startInProgress = false;

      // Begin a streaming session: open the microphone, reset the performance
      // counters, tell the worker to start a stream, switch the button to its
      // "Stop Speech" state and clear the previous transcription.
      //
      // Errors from microphone setup are logged and shown in the status line;
      // they are not rethrown.
      //
      // A call made while an earlier call is still waiting (for example while the
      // browser's permission prompt is open) returns immediately. Without this, a
      // second click would open a second stream and processor; the first set
      // would be overwritten, never stopped, and keep posting audio to the worker.
      async function startRecording() {
        if (startInProgress) return;
        startInProgress = true;

        const button = document.querySelector("#speech-button");
        
        try {
          updateStatusDiv("Requesting microphone access...");
          await startMicrophone();
          
          // Reset performance counters
          audioChunksProcessed = 0;
          sessionStartTime = Date.now();
          
          // Start streaming session
          moshiWorker.postMessage({ command: "start_stream" });
          
          isRecording = true;
          button.textContent = "Stop Speech";
          button.className = "bg-red-600 hover:bg-red-700 text-white font-normal py-2 px-4 rounded";
          updateStatusDiv("Listening...");
          
          // Clear previous transcription
          document.querySelector("#output-generation").textContent = "";
          document.querySelector("#output-generation").hidden = true;
          document.querySelector("#output-placeholder").hidden = true;
          
        } catch (error) {
          console.error('Error starting microphone:', error);
          updateStatusDiv("Error: " + error.message);
          pendingStart = false;
        } finally {
          startInProgress = false;
        }
      }

      // Toggle handler for the speech button.
      //   Recording            -> stop the microphone and end the worker stream.
      //   Idle, model ready    -> start recording.
      //   Idle, model not ready -> queue the start and kick off model loading;
      //                            recording begins when "model_ready" arrives.
      document.querySelector("#speech-button").addEventListener("click", async () => {
        const button = document.querySelector("#speech-button");

        if (isRecording) {
          stopMicrophone();

          // End streaming session
          moshiWorker.postMessage({ command: "stop_stream" });

          isRecording = false;
          button.textContent = "Start Speech";
          button.className = "bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded";
          updateStatusDiv("Ready to start");
          return;
        }

        if (modelInitialized) {
          await startRecording();
          return;
        }

        // Model is not ready yet: remember the request and load the model.
        pendingStart = true;
        initializeModel();
      });
    </script>
  </head>
  <body class="container max-w-4xl mx-auto p-4">
    <main class="grid grid-cols-1 gap-8 relative">
      <!-- Page title and short description of the demo. -->
      <div>
        <h1 class="text-4xl font-bold">WASM Streaming Speech Recognition</h1>
        <p class="text-gray-700">
          Transcribe audio from your microphone in real time in the browser using Rust/WASM.
          This demo runs entirely offline on your CPU after downloading a ~950 MB model.
          It understands English and French, and uses the
          <a href="https://huggingface.co/kyutai/stt-1b-en_fr" target="_blank" rel="noopener" class="underline hover:text-blue-600">Kyutai STT model</a>
          together with a WASM runtime built in
          <a href="https://github.com/huggingface/candle/" target="_blank" rel="noopener" class="underline hover:text-blue-600">Candle</a>.
        </p>
      </div>

      <!-- Start/stop control plus the status and diagnostics lines updated by the script. -->
      <div>
        <button id="speech-button" type="button" class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 px-4 rounded">
          Start Speech
        </button>
        <div class="mt-2 text-gray-600 text-sm space-y-1">
          <!-- role="status" makes asynchronous status changes audible to screen readers.
               It sits on the span only, so the diagnostics line (refreshed every 200 ms)
               is not announced. -->
          <div>Status: <span id="status-div" role="status">Click "Start Speech" to begin</span></div>
          <div id="diagnostics">CPUs: -, Real-time factor: 0.00x, Duration: 0.0s</div>
        </div>
      </div>

      <!-- Transcription output. The script fills #output-generation and hides the placeholder. -->
      <div>
        <!-- h2 rather than h3 so the heading outline does not skip a level after the h1.
             Assumes the Tailwind base reset gives every heading the same size, so the
             rendering is unchanged - confirm. -->
        <h2 class="font-medium">Transcription:</h2>
        <div class="min-h-[200px] bg-slate-100 text-gray-700 p-4 rounded-md">
          <p id="output-generation" hidden></p>
          <span id="output-placeholder" class="font-light text-gray-500">Click "Start Speech" to begin transcription</span>
        </div>
      </div>

      <!-- Pointer to the related cloud product. -->
      <div class="mt-4 p-3 bg-gray-50 text-gray-700 rounded-md">
        💡 This demo shows offline transcription in your browser.
        For more accurate cloud transcription and real-time LLM grammar correction, check out
        <a href="https://voicewriter.io" target="_blank" rel="noopener" class="underline hover:text-blue-600">Voice Writer</a>.
      </div>
    </main>
  </body>
</html>