// This file copies and modifies code
// from https://mdn.github.io/web-dictaphone/scripts/app.js
// and https://gist.github.com/meziantou/edb7217fddfbb70e899e
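//
// Streaming speech recognition demo built on the sherpa-onnx WASM runtime:
// it transcribes microphone input, uploaded audio files, and five parallel
// demo streams, selected via the `lang` and `tab` URL parameters.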
const urlParams = new URLSearchParams(window.location.search);
const lang = urlParams.get('lang') ?? 'en';
const activeTab = urlParams.get('tab') ?? 'single';
const startBtn = document.getElementById('recordBtn');
const hint = document.getElementById('hint');
const soundClips = document.getElementById('fileInput'); // the file <input>; recorded clips are also appended to it below
const playAllBtn = document.getElementById('playAllBtn');
let started = false;
let multistreamStarted = false;
let textArea = document.getElementById('results');
let lastResult = '';
let resultList = [];
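// Reset the transcript list and refresh the results textarea.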
function clear() {
  resultList = [];
  textArea.value = getDisplayResult();
  textArea.scrollTop = textArea.scrollHeight; // auto scroll
}
function getDisplayResult() {
  let i = 0;
  let ans = '';
  for (const s of resultList) {
    if (s == '') {
      continue;
    }
    ans += i + ': ' + s + '\n';
    i += 1;
  }
  if (lastResult.length > 0) {
    ans += i + ': ' + lastResult + '\n';
  }
  return ans;
}
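// Emscripten hooks: `Module` must be defined before the sherpa-onnx script
// below loads, so that its callbacks are picked up at startup.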
var Module = {};
Module.locateFile = function(path, scriptDirectory = '') {
if (path.endsWith('.js.metadata')) {
return scriptDirectory + path.replace('.js.metadata', '.json');
}
return scriptDirectory + path;
};
Module.setStatus = function(status) {
const statusElement = document.getElementById('status');
statusElement.textContent = status;
if (status === '') {
statusElement.style.display = 'none';
document.querySelectorAll('.tab-content').forEach((tabContentElement) => {
tabContentElement.classList.remove('loading');
});
} else {
statusElement.style.display = 'block';
document.querySelectorAll('.tab-content').forEach((tabContentElement) => {
tabContentElement.classList.add('loading');
});
}
};
Module.onRuntimeInitialized = function() {
  console.log('Runtime initialized!');
  // hint.innerText = 'Model loaded! Please click start';
  started = false;
  recognizer = createOnlineRecognizer(Module);
  console.log('recognizer is created!', recognizer);
};
function loadScript(src) {
const scriptElement = document.createElement('script');
scriptElement.src = src;
document.body.append(scriptElement);
}
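// Load the language-specific model setup (e.g. ./en.js) first, then the
// sherpa-onnx WASM glue code.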
loadScript('./' + lang + '.js');
loadScript('./sherpa-onnx-wasm-main-asr.js');
let audioCtx;
let mediaStream;
let expectedSampleRate = 16000;
let recordSampleRate; // the sampleRate of the microphone
let recorder = null; // the microphone
let leftchannel = []; // TODO: Use a single channel
let recordingLength = 0; // number of samples so far
let recognizer = null;
let recognizer_stream = null;
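// Microphone capture: audio is pulled through a ScriptProcessorNode and fed
// to the online recognizer in 4096-sample buffers.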
if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
console.log('getUserMedia supported.');
// see https://w3c.github.io/mediacapture-main/#dom-mediadevices-getusermedia
const constraints = {audio: true};
let onSuccess = function(stream) {
if (!audioCtx) {
audioCtx = new AudioContext({sampleRate: expectedSampleRate});
}
console.log(audioCtx);
recordSampleRate = audioCtx.sampleRate;
console.log('sample rate ' + recordSampleRate);
// creates an audio node from the microphone incoming stream
mediaStream = audioCtx.createMediaStreamSource(stream);
console.log('media stream', mediaStream);
// https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
// bufferSize: the onaudioprocess event is called when the buffer is full
const bufferSize = 4096;
const numberOfInputChannels = 1;
const numberOfOutputChannels = 2;
if (audioCtx.createScriptProcessor) {
recorder = audioCtx.createScriptProcessor(
bufferSize, numberOfInputChannels, numberOfOutputChannels);
} else {
recorder = audioCtx.createJavaScriptNode(
bufferSize, numberOfInputChannels, numberOfOutputChannels);
}
console.log('recorder', recorder);
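// For each audio buffer: downsample to 16 kHz, feed the recognizer, decode,
// and display partial results; an endpoint finalizes the current utterance.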
recorder.onaudioprocess = function(e) {
let samples = new Float32Array(e.inputBuffer.getChannelData(0));
samples = downsampleBuffer(samples, expectedSampleRate);
if (recognizer_stream == null) {
recognizer_stream = recognizer.createStream();
}
recognizer_stream.acceptWaveform(expectedSampleRate, samples);
while (recognizer.isReady(recognizer_stream)) {
recognizer.decode(recognizer_stream);
}
let isEndpoint = recognizer.isEndpoint(recognizer_stream);
let result = recognizer.getResult(recognizer_stream).text;
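// Paraformer models need trailing silence before they emit final tokens, so
// pad the stream with one second of zeros before reading the result.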
if (recognizer.config.modelConfig.paraformer.encoder != '') {
let tailPaddings = new Float32Array(expectedSampleRate);
recognizer_stream.acceptWaveform(expectedSampleRate, tailPaddings);
while (recognizer.isReady(recognizer_stream)) {
recognizer.decode(recognizer_stream);
}
result = recognizer.getResult(recognizer_stream).text;
}
if (result.length > 0 && lastResult != result) {
lastResult = result;
}
if (isEndpoint) {
if (lastResult.length > 0) {
resultList.push(lastResult);
lastResult = '';
}
recognizer.reset(recognizer_stream);
}
textArea.value = getDisplayResult();
textArea.scrollTop = textArea.scrollHeight; // auto scroll
let buf = new Int16Array(samples.length);
for (let i = 0; i < samples.length; ++i) {
// clamp to [-1, 1], then scale to 16-bit PCM
let s = samples[i];
if (s >= 1)
s = 1;
else if (s <= -1)
s = -1;
samples[i] = s;
buf[i] = s * 32767;
}
leftchannel.push(buf);
recordingLength += bufferSize;
};
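// Toggle recording: the first click connects the audio graph; the second
// disconnects it and renders the captured audio as a playable WAV clip.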
startBtn.onclick = function() {
if (started) {
console.log('recorder stopped');
recorder.disconnect(audioCtx.destination);
mediaStream.disconnect(recorder);
started = false;
const clipName = new Date().toISOString();
const clipContainer = document.createElement('article');
const clipLabel = document.createElement('p');
const audio = document.createElement('audio');
const deleteButton = document.createElement('button');
clipContainer.classList.add('clip');
audio.setAttribute('controls', '');
deleteButton.textContent = 'Delete';
deleteButton.className = 'delete';
clipLabel.textContent = clipName;
clipContainer.appendChild(audio);
clipContainer.appendChild(clipLabel);
clipContainer.appendChild(deleteButton);
soundClips.appendChild(clipContainer);
audio.controls = true;
let samples = flatten(leftchannel);
const blob = toWav(samples);
leftchannel = [];
const audioURL = window.URL.createObjectURL(blob);
audio.src = audioURL;
console.log('recorder stopped');
deleteButton.onclick = function(e) {
let evtTgt = e.target;
evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
};
clipLabel.onclick = function() {
const existingName = clipLabel.textContent;
const newClipName = prompt('Enter a new name for your sound clip');
if (newClipName === null) {
clipLabel.textContent = existingName;
} else {
clipLabel.textContent = newClipName;
}
};
} else {
mediaStream.connect(recorder);
recorder.connect(audioCtx.destination);
console.log('recorder started');
started = true;
}
};
};
let onError = function(err) {
console.log('The following error occurred: ' + err);
};
navigator.mediaDevices.getUserMedia(constraints).then(onSuccess, onError);
} else {
console.log('getUserMedia not supported on your browser!');
alert('getUserMedia not supported on your browser!');
}
playAllBtn.onclick = function() {
if (!multistreamStarted) {
multistreamStarted = true;
playAllBtn.textContent = "Stop All Streams";
playAllBtn.style.backgroundColor = "#d9534f";
for (let i = 0; i < audioSources[lang].length; ++i) {
transcribe(audioSources[lang][i], 'transcript' + (i + 1), i);
}
} else {
for (const audio of audios) {
audio.pause();
audio.currentTime = 0;
}
playAllBtn.textContent = "Play All Streams";
playAllBtn.style.backgroundColor = "#007bff";
multistreamStarted = false;
}
};
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
function flatten(listOfSamples) {
let n = 0;
for (let i = 0; i < listOfSamples.length; ++i) {
n += listOfSamples[i].length;
}
let ans = new Int16Array(n);
let offset = 0;
for (let i = 0; i < listOfSamples.length; ++i) {
ans.set(listOfSamples[i], offset);
offset += listOfSamples[i].length;
}
return ans;
}
// this function is copied/modified from
// https://gist.github.com/meziantou/edb7217fddfbb70e899e
function toWav(samples) {
let buf = new ArrayBuffer(44 + samples.length * 2);
const view = new DataView(buf);
// http://soundfile.sapp.org/doc/WaveFormat/
view.setUint32(0, 0x46464952, true); // chunkID: 'RIFF' (bytes are little-endian)
view.setUint32(4, 36 + samples.length * 2, true); // chunkSize
view.setUint32(8, 0x45564157, true); // format: 'WAVE'
view.setUint32(12, 0x20746d66, true); // subchunk1ID: 'fmt '
view.setUint32(16, 16, true); // subchunk1Size, 16 for PCM
view.setUint16(20, 1, true); // audioFormat, 1 for PCM
view.setUint16(22, 1, true); // numChannels: 1 channel
view.setUint32(24, expectedSampleRate, true); // sampleRate
view.setUint32(28, expectedSampleRate * 2, true); // byteRate = sampleRate * blockAlign
view.setUint16(32, 2, true); // blockAlign = numChannels * bytesPerSample
view.setUint16(34, 16, true); // bitsPerSample
view.setUint32(36, 0x61746164, true); // subchunk2ID: 'data'
view.setUint32(40, samples.length * 2, true); // subchunk2Size
let offset = 44;
for (let i = 0; i < samples.length; ++i) {
view.setInt16(offset, samples[i], true);
offset += 2;
}
return new Blob([view], {type: 'audio/wav'});
}
// this function is copied from
// https://github.com/awslabs/aws-lex-browser-audio-capture/blob/master/lib/worker.js#L46
// inputSampleRate defaults to the microphone rate; callers decoding files
// pass the buffer's own rate as a third argument.
function downsampleBuffer(buffer, exportSampleRate, inputSampleRate = recordSampleRate) {
if (exportSampleRate === inputSampleRate) {
return buffer;
}
const sampleRateRatio = inputSampleRate / exportSampleRate;
const newLength = Math.round(buffer.length / sampleRateRatio);
const result = new Float32Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
const nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
let accum = 0, count = 0;
// average all source samples that map onto this output sample
for (let i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = count > 0 ? accum / count : 0;
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
async function processArrayBufferWithASR(arrayBuffer, file) {
// Check if recognizer is ready.
if (recognizer === null) {
console.error("Recognizer not yet initialized! Please wait for WASM to load.");
//resultsTextarea.value = "Error: Recognizer not ready.";
return;
}
// Create an AudioContext. (On some platforms, creating multiple AudioContexts can be problematic.
// If needed, consider reusing a global AudioContext.)
const audioCtx = new (window.AudioContext || window.webkitAudioContext)({
sampleRate: expectedSampleRate
});
try {
const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
console.log("AudioBuffer decoded. Duration (s):", audioBuffer.duration);
let channelData = audioBuffer.getChannelData(0);
console.log("Channel data length:", channelData.length);
// Downsample if necessary.
if (audioBuffer.sampleRate !== expectedSampleRate) {
console.log("Downsampling from", audioBuffer.sampleRate, "to", expectedSampleRate);
channelData = downsampleBuffer(channelData, expectedSampleRate, audioBuffer.sampleRate);
console.log("Downsampled channel data length:", channelData.length);
}
// Create a new recognizer stream.
const stream = recognizer.createStream();
const chunkSize = expectedSampleRate; // assume 1 second worth of samples per chunk.
for (let i = 0; i < channelData.length; i += chunkSize) {
const chunk = channelData.subarray(i, i + chunkSize);
stream.acceptWaveform(expectedSampleRate, chunk);
while (recognizer.isReady(stream)) {
recognizer.decode(stream);
}
}
// Flush any tail data if necessary.
const tail = new Float32Array(expectedSampleRate);
stream.acceptWaveform(expectedSampleRate, tail);
while (recognizer.isReady(stream)) {
recognizer.decode(stream);
}
const fileResult = recognizer.getResult(stream).text || "";
console.log("ASR result for file:", fileResult);
textArea.value = fileResult;
} catch (err) {
console.error("Error decoding audio data:", err);
//resultsTextarea.value = "Error processing audio: " + err.message;
}
}
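// Note: processArrayBufferWithASR is currently unused; the file-input
// handler below streams uploads through transcribe() instead.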
const audios = [];
const recorders = [];
async function loadAudio(url) {
try {
const response = await fetch(url, { mode: "cors" });
if (!response.ok) throw new Error("Network response was not ok");
const blob = await response.blob();
const objectUrl = URL.createObjectURL(blob);
return new Audio(objectUrl);
} catch (error) {
console.error("Error loading audio:", error);
}
}
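// Stream one audio URL through its own ScriptProcessor and recognizer
// stream, writing rolling results into the textarea with the given id.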
async function transcribe(url, output, index) {
const textarea = document.getElementById(output);
textarea.value = '';
let lastResult = '';
let resultList = [];
// Per-stream copy of getDisplayResult, closing over this stream's results.
function getDisplayResult() {
let i = 0;
let ans = '';
for (const s of resultList) {
if (s == '') {
continue;
}
ans += i + ': ' + s + '\n';
i += 1;
}
if (lastResult.length > 0) {
ans += i + ': ' + lastResult + '\n';
}
return ans;
}
if (!audios[index]) {
audios[index] = await loadAudio(url);
}
audios[index].play();
// audioCtx is normally created in the getUserMedia success handler; create
// it here as a fallback (e.g. when microphone permission was denied).
if (!audioCtx) {
audioCtx = new AudioContext({sampleRate: expectedSampleRate});
}
console.log(audioCtx);
let recordSampleRate = audioCtx.sampleRate;
console.log('sample rate ' + recordSampleRate);
// https://developer.mozilla.org/en-US/docs/Web/API/AudioContext/createScriptProcessor
// bufferSize: the onaudioprocess event is called when the buffer is full
const bufferSize = 4096;
const numberOfInputChannels = 1;
const numberOfOutputChannels = 2;
if (!recorders[index]) {
recorders[index] = audioCtx.createScriptProcessor(
bufferSize, numberOfInputChannels, numberOfOutputChannels);
const source = audioCtx.createMediaElementSource(audios[index]);
source.connect(recorders[index]);
recorders[index].connect(audioCtx.destination);
// keep a reference so the "recordingStopped" handler below can
// disconnect the correct source node
recorders[index].sourceNode = source;
}
let recognizer_stream = null;
recorders[index].onaudioprocess = function(e) {
let samples = new Float32Array(e.inputBuffer.getChannelData(0));
e.outputBuffer.copyToChannel(samples, 0);
if (recognizer_stream == null) {
recognizer_stream = recognizer.createStream();
}
recognizer_stream.acceptWaveform(expectedSampleRate, samples);
while (recognizer.isReady(recognizer_stream)) {
recognizer.decode(recognizer_stream);
}
let isEndpoint = recognizer.isEndpoint(recognizer_stream);
let result = recognizer.getResult(recognizer_stream).text;
if (result.length > 0 && lastResult != result) {
lastResult = result;
}
if (isEndpoint) {
if (lastResult.length > 0) {
resultList.push(lastResult);
lastResult = '';
}
recognizer.reset(recognizer_stream);
}
textarea.value = getDisplayResult();
textarea.scrollTop = textarea.scrollHeight; // auto scroll
// (Unlike the microphone path, there is no need to convert the samples to
// 16-bit PCM here; nothing downstream consumes them.)
};
// "recordingStopped" is a custom event; it fires only if other code
// dispatches it on this node.
recorders[index]?.addEventListener("recordingStopped", () => {
console.log("Decoding has stopped.");
recorders[index].sourceNode.disconnect(recorders[index]);
});
}
soundClips.addEventListener("change", function (event) {
if (!event.target.files || !event.target.files[0]) {
console.log("No file selected.");
return;
}
const file = event.target.files[0];
console.log("Selected file:", file.name, file.type, file.size, "bytes");
const reader = new FileReader();
reader.onload = function (ev) {
console.log("FileReader onload called.");
const arrayBuffer = ev.target.result;
console.log("ArrayBuffer length:", arrayBuffer.byteLength);
const url = URL.createObjectURL(file);
// Use slot 5 so file playback does not collide with the five multistream
// slots (indices 0-4).
transcribe(url, 'results', 5);
//processArrayBufferWithASR(arrayBuffer, file);
};
reader.onerror = function (err) {
console.error("FileReader error:", err);
};
console.log("Starting FileReader.readAsArrayBuffer...");
reader.readAsArrayBuffer(file);
});
const singleAudioTab = document.getElementById("singleAudioTab");
const multistreamTab = document.getElementById("multistreamTab");
const singleAudioContent = document.getElementById("singleAudioContent");
const multistreamContent = document.getElementById("multistreamContent");
const audioElements = [
document.getElementById("audio1"),
document.getElementById("audio2"),
document.getElementById("audio3"),
document.getElementById("audio4"),
document.getElementById("audio5"),
];
const audioSources = {
"de": [
"./de1.mp3",
"./de2.mp3",
"./de3.mp3",
"./de4.mp3",
"./de5.mp3",
],
"en": [
"./en1.mp3",
"./en2.mp3",
"./en3.mp3",
"./en4.mp3",
"./en5.mp3",
],
"fr": [
"./fr1.mp3",
"./fr2.mp3",
"./fr3.mp3",
"./fr4.mp3",
"./fr5.mp3",
],
};
// Tab switching logic
singleAudioTab.addEventListener("click", () => {
singleAudioTab.classList.add("active");
multistreamTab.classList.remove("active");
singleAudioContent.style.display = "block";
multistreamContent.style.display = "none";
singleAudioTab.style.borderBottomColor = "#007bff";
multistreamTab.style.borderBottomColor = "transparent";
singleAudioTab.style.color = "#007bff";
multistreamTab.style.color = "#6c757d";
const url = new URL(window.location.href);
url.searchParams.set("tab", "single");
window.history.pushState({}, "", url.toString());
});
multistreamTab.addEventListener("click", () => {
multistreamTab.classList.add("active");
singleAudioTab.classList.remove("active");
multistreamContent.style.display = "block";
singleAudioContent.style.display = "none";
multistreamTab.style.borderBottomColor = "#007bff";
singleAudioTab.style.borderBottomColor = "transparent";
multistreamTab.style.color = "#007bff";
singleAudioTab.style.color = "#6c757d";
const url = new URL(window.location.href);
url.searchParams.set("tab", "multi");
window.history.pushState({}, "", url.toString());
});
// Load audio sources
audioElements.forEach((audio, index) => {
audio.src = audioSources[lang][index];
});
// Microphone recording logic
const recordBtn = document.getElementById("recordBtn");
const outputText = document.getElementById("outputText");
const audioPlayback = document.getElementById("audioPlayback");
let mediaRecorder;
let audioChunks = [];
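// Note: recordBtn is the same element as startBtn above, so this
// MediaRecorder handler runs in addition to the onclick handler installed
// in the getUserMedia success callback.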
recordBtn.addEventListener("click", async () => {
if (!mediaRecorder || mediaRecorder.state === "inactive") {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = (event) => {
audioChunks.push(event.data);
};
mediaRecorder.onstop = () => {
// MediaRecorder emits compressed audio (typically webm/ogg), not WAV
const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || "audio/webm" });
audioChunks = [];
const audioURL = URL.createObjectURL(audioBlob);
audioPlayback.src = audioURL;
audioPlayback.style.display = "block";
//outputText.value = "Recording completed. Playback is ready.";
};
mediaRecorder.start();
recordBtn.textContent = "Stop Recording";
recordBtn.style.color = "#5cb85c";
//outputText.value = "Recording...";
} catch (err) {
console.error("Error accessing microphone: " + err.message);
//outputText.value = "Error accessing microphone: " + err.message;
}
} else if (mediaRecorder.state === "recording") {
mediaRecorder.stop();
recordBtn.textContent = "Use Microphone";
recordBtn.style.color = "#d9534f";
}
});
// Function to handle language change and update URL
document.querySelectorAll('input[name="language"]').forEach((input) => {
input.addEventListener('change', function() {
const selectedLang = this.value;
// Update URL with new language parameter
const url = new URL(window.location.href);
url.searchParams.set('lang', selectedLang);
// Reload the page with new language setting
window.location.href = url.toString();
});
});
document.querySelectorAll('input[name="language"]').forEach((input) => {
input.checked = input.value === lang;
});
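// Restore the tab selected via the ?tab= URL parameter on page load.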
if (activeTab === "multi") {
multistreamTab.classList.add("active");
singleAudioTab.classList.remove("active");
multistreamContent.style.display = "block";
singleAudioContent.style.display = "none";
multistreamTab.style.borderBottomColor = "#007bff";
singleAudioTab.style.borderBottomColor = "transparent";
multistreamTab.style.color = "#007bff";
singleAudioTab.style.color = "#6c757d";
}