<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Voice Chat Bot with Advanced Echo Cancellation</title>
  <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.18/dist/bundle.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2"></script>
  <style>
    body {
      font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
      margin: 0;
      padding: 20px;
      background-color: #1a1a1a;
      color: #f0f0f0;
    }
    .container {
      max-width: 800px;
      margin: 0 auto;
    }
    h1 {
      color: #ffd700;
      text-align: center;
      margin-bottom: 10px;
    }
    .subtitle {
      text-align: center;
      color: #ffd700;
      margin-bottom: 20px;
    }
    #chat-container {
      display: flex;
      flex-direction: column;
      height: 70vh;
    }
    #conversation {
      flex-grow: 1;
      border: 1px solid #444;
      padding: 10px;
      overflow-y: scroll;
      background-color: #2a2a2a;
      border-radius: 5px;
      margin-bottom: 20px;
    }
    #controls {
      display: flex;
      justify-content: center;
      margin-bottom: 20px;
    }
    button {
      font-size: 18px;
      padding: 10px 20px;
      background-color: #ffd700;
      color: #1a1a1a;
      border: none;
      border-radius: 5px;
      cursor: pointer;
      transition: background-color 0.3s;
    }
    button:hover {
      background-color: #ffec8b;
    }
    button:disabled {
      background-color: #666;
      cursor: not-allowed;
    }
    #visualizer {
      width: 100%;
      height: 100px;
      background-color: #2a2a2a;
      border-radius: 5px;
      overflow: hidden;
      margin-bottom: 20px;
    }
    .bar {
      width: 5px;
      height: 100%;
      background-color: #ffd700;
      display: inline-block;
      margin-right: 1px;
    }
    #loading {
      position: fixed;
      top: 0;
      left: 0;
      width: 100%;
      height: 100%;
      background-color: rgba(0, 0, 0, 0.8);
      display: flex;
      justify-content: center;
      align-items: center;
      z-index: 1000;
    }
    .spinner {
      width: 50px;
      height: 50px;
      border: 5px solid #f3f3f3;
      border-top: 5px solid #ffd700;
      border-radius: 50%;
      animation: spin 1s linear infinite;
    }
    @keyframes spin {
      0% { transform: rotate(0deg); }
      100% { transform: rotate(360deg); }
    }
    #configuration {
      margin-bottom: 20px;
    }
    select {
      width: 100%;
      padding: 10px;
      font-size: 16px;
      background-color: #2a2a2a;
      color: #f0f0f0;
      border: 1px solid #444;
      border-radius: 5px;
    }
    #model-info {
      margin-top: 10px;
      font-size: 14px;
      color: #aaa;
    }
    #logs {
      background-color: #2a2a2a;
      border: 1px solid #444;
      border-radius: 5px;
      padding: 10px;
      height: 200px;
      overflow-y: scroll;
      font-family: monospace;
      font-size: 14px;
    }
    #clear-logs {
      margin-top: 10px;
      font-size: 14px;
      padding: 5px 10px;
    }
    #localVideo, #remoteVideo {
      display: none;
    }
  </style>
</head>
<body>
  <div id="loading">
    <div class="spinner"></div>
  </div>
  <div class="container">
    <h1>Digital Human Voice Chat</h1>
    <p class="subtitle">For best results, use headphones.</p>
    <div id="chat-container">
      <div id="controls">
        <button id="startButton" disabled>Begin Call</button>
      </div>
      <div id="configuration">
        <select id="configSelect">
          <option value="fastest">Fastest</option>
          <option value="balanced">Balanced</option>
          <option value="quality">Highest Quality</option>
        </select>
        <div id="model-info">
          TTS: Xenova/mms-tts-eng / STT: Xenova/whisper-tiny.en / LLM: Placeholder
        </div>
      </div>
      <div id="visualizer"></div>
      <div id="conversation"></div>
    </div>
    <h2>Logs</h2>
    <div id="logs"></div>
    <button id="clear-logs">Clear</button>
  </div>
  <!-- Hidden video elements that anchor the WebRTC streams; muted/playsinline avoid autoplay restrictions -->
  <video id="localVideo" autoplay muted playsinline></video>
  <video id="remoteVideo" autoplay muted playsinline></video>
  <script type="module">
    import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2';

    env.localModelPath = './models';
    // Configure the WASM backend before initializing any pipelines.
    // In transformers.js v2 these settings live under env.backends.onnx.wasm,
    // not directly on env (assigning env.backends = ['wasm'] would clobber the
    // backend configuration object).
    env.backends.onnx.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2/dist/'; // ensure correct WASM paths
    env.backends.onnx.wasm.simd = true; // enable SIMD where the browser supports it
    env.backends.onnx.wasm.numThreads = navigator.hardwareConcurrency || 4; // use available CPU cores
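    // Caveat: numThreads > 1 only takes effect when the page is cross-origin
    // isolated (COOP/COEP headers set), because multi-threaded WASM requires
    // SharedArrayBuffer; otherwise the runtime falls back to a single thread.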
    const conversationDiv = document.getElementById('conversation');
    const startButton = document.getElementById('startButton');
    const visualizer = document.getElementById('visualizer');
    const loadingDiv = document.getElementById('loading');
    const logsDiv = document.getElementById('logs');
    const clearLogsButton = document.getElementById('clear-logs');
    const localVideo = document.getElementById('localVideo');
    const remoteVideo = document.getElementById('remoteVideo');

    let myvad;
    let sttPipeline;
    let ttsPipeline;
    let audioContext;
    let analyser;
    let dataArray;
    let bars;
    let animationId;
    let isListening = false;
    let microphoneStream;
    let isSpeaking = false;
    let currentAudioSource = null;
    let rtcConnection = null;
    let rtcLoopbackConnection = null;
    let loopbackStream = new MediaStream();
    function createVisualizer() {
      // 64 bars matches the analyser's frequencyBinCount (fftSize 128 / 2),
      // so each bar maps to one frequency bin.
      const barCount = 64;
      for (let i = 0; i < barCount; i++) {
        const bar = document.createElement('div');
        bar.className = 'bar';
        visualizer.appendChild(bar);
      }
      bars = visualizer.getElementsByClassName('bar');
    }
    function updateVisualizer() {
      analyser.getByteFrequencyData(dataArray);
      for (let i = 0; i < bars.length; i++) {
        const barHeight = dataArray[i] / 2;
        bars[i].style.height = barHeight + 'px';
      }
      // Throttle redraws to every 50 ms with setTimeout rather than
      // requestAnimationFrame, trading smoothness for lower CPU use.
      animationId = setTimeout(updateVisualizer, 50);
    }
    async function initializePipelines() {
      try {
        // Load the quantized STT and TTS models in parallel.
        [sttPipeline, ttsPipeline] = await Promise.all([
          pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', { quantized: true }),
          pipeline('text-to-speech', 'Xenova/mms-tts-eng', { quantized: true })
        ]);
        addLog('System: Digital Human Voice Chat initialized. Click "Begin Call" to start.');
        startButton.disabled = false;
        loadingDiv.style.display = 'none';
      } catch (error) {
        console.error('Error initializing pipelines:', error);
        addLog('System: Error initializing Digital Human Voice Chat. Please check the console for details.');
        loadingDiv.style.display = 'none';
      }
    }
    async function processSpeech(audio) {
      try {
        if (!sttPipeline || !ttsPipeline) {
          throw new Error('Pipelines not initialized');
        }
        const transcription = await sttPipeline(audio);
        addLog(`User: ${transcription.text}`);
        // Placeholder "LLM": simply echo the transcription back.
        const botResponse = `I heard you say: "${transcription.text}".`;
        addLog(`Bot: ${botResponse}`);
        isSpeaking = true;
        const speechOutput = await ttsPipeline(botResponse);
        await playAudio(speechOutput.audio, speechOutput.sampling_rate);
        isSpeaking = false;
      } catch (error) {
        console.error('Error processing speech:', error);
        addLog('System: Error processing speech. Please try again.');
      }
    }
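    // Sketch of how the placeholder response above might be swapped for a real
    // local LLM via a transformers.js text2text-generation pipeline. The model
    // name below is illustrative, not part of this app:
    //
    //   const llmPipeline = await pipeline('text2text-generation',
    //     'Xenova/LaMini-Flan-T5-77M', { quantized: true });
    //   const [{ generated_text }] = await llmPipeline(transcription.text,
    //     { max_new_tokens: 64 });
    //   const botResponse = generated_text;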
    function addLog(message) {
      const now = new Date();
      const timestamp = now.toLocaleTimeString();
      const logMessage = `[${timestamp}] ${message}`;
      const messageElement = document.createElement('div');
      messageElement.textContent = logMessage;
      logsDiv.appendChild(messageElement);
      logsDiv.scrollTop = logsDiv.scrollHeight;
    }
    function playAudio(audioArray, samplingRate = 16000) {
      return new Promise((resolve) => {
        // MMS TTS emits 16 kHz audio; use the rate reported by the pipeline
        // when available instead of hard-coding it.
        const audioBuffer = audioContext.createBuffer(1, audioArray.length, samplingRate);
        const channelData = audioBuffer.getChannelData(0);
        channelData.set(audioArray);
        const source = audioContext.createBufferSource();
        currentAudioSource = source;
        source.buffer = audioBuffer;
        source.connect(analyser);
        analyser.connect(audioContext.destination);
        source.start();
        source.onended = () => {
          currentAudioSource = null;
          resolve();
        };
      });
    }
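    // Note: AudioBufferSourceNode fires onended both on natural completion and
    // when stop() is called, so interrupting playback via stopCurrentAudio()
    // still resolves the promise above and lets processSpeech() continue.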
    function stopCurrentAudio() {
      if (currentAudioSource) {
        currentAudioSource.stop();
        currentAudioSource = null;
      }
    }
    async function toggleListening() {
      if (isListening) {
        await stopListening();
      } else {
        await startListening();
      }
    }
    async function startListening() {
      try {
        audioContext = new (window.AudioContext || window.webkitAudioContext)();
        analyser = audioContext.createAnalyser();
        analyser.fftSize = 128;
        dataArray = new Uint8Array(analyser.frequencyBinCount);
        // Keep both hidden video elements silent so only Web Audio playback is audible.
        localVideo.volume = 0;
        localVideo.muted = true;
        remoteVideo.volume = 0;
        remoteVideo.muted = true;
        // Request both audio and video streams
        microphoneStream = await navigator.mediaDevices.getUserMedia({
          audio: true,
          video: { width: 1, height: 1 } // Minimal video for echo cancellation
        });
        localVideo.srcObject = microphoneStream;
        await localVideo.play();
        console.log('Active constraints:', microphoneStream.getAudioTracks()[0].getConstraints());
        console.log('Microphone stream settings:', microphoneStream.getAudioTracks()[0].getSettings());
        // Implement loopback hack for improved echo cancellation
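        // The microphone stream is sent through a pair of local
        // RTCPeerConnections and the looped-back copy is used for analysis
        // and VAD below. The intent of this well-known workaround is to keep
        // the capture path inside WebRTC's audio processing (where the
        // browser's echo canceller operates), which some browsers do not
        // apply reliably to plain getUserMedia + Web Audio playback.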
        const offerOptions = {
          offerToReceiveAudio: true,
          offerToReceiveVideo: false,
        };
        rtcConnection = new RTCPeerConnection();
        rtcLoopbackConnection = new RTCPeerConnection();
        rtcConnection.onicecandidate = e => e.candidate && rtcLoopbackConnection.addIceCandidate(new RTCIceCandidate(e.candidate));
        rtcLoopbackConnection.onicecandidate = e => e.candidate && rtcConnection.addIceCandidate(new RTCIceCandidate(e.candidate));
        rtcLoopbackConnection.ontrack = e => e.streams[0].getTracks().forEach(track => loopbackStream.addTrack(track));
        microphoneStream.getTracks().forEach(track => rtcConnection.addTrack(track, microphoneStream));
        const offer = await rtcConnection.createOffer(offerOptions);
        await rtcConnection.setLocalDescription(offer);
        await rtcLoopbackConnection.setRemoteDescription(offer);
        const answer = await rtcLoopbackConnection.createAnswer();
        await rtcLoopbackConnection.setLocalDescription(answer);
        await rtcConnection.setRemoteDescription(answer);
        // Use the loopback stream for audio processing
        const source = audioContext.createMediaStreamSource(loopbackStream);
        source.connect(analyser);
        myvad = await vad.MicVAD.new({
          // Note: noiseSuppression and aggressiveness are not part of the
          // documented MicVAD options and may be silently ignored.
          noiseSuppression: true,
          aggressiveness: 3,
          onSpeechStart: () => {
            addLog('--- Voice activity: speech start');
            updateVisualizer();
            if (isSpeaking) {
              addLog('User interrupted. Stopping bot speech.');
              stopCurrentAudio();
              isSpeaking = false;
            }
          },
          onSpeechEnd: (audio) => {
            addLog('--- Voice activity: speech end');
            // updateVisualizer() schedules itself with setTimeout, so it must
            // be cancelled with clearTimeout (cancelAnimationFrame would not
            // stop it).
            clearTimeout(animationId);
            processSpeech(audio);
          }
        });
        await myvad.start();
        startButton.textContent = 'End Call';
        isListening = true;
        addLog('System: Listening...');
      } catch (error) {
        console.error('Error starting voice activity:', error);
        addLog('System: Error starting voice detection. Please check your microphone and try again.');
      }
    }
    async function stopListening() {
      if (myvad) {
        try {
          await myvad.destroy();
        } catch (error) {
          console.error('Error stopping voice activity:', error);
        }
        myvad = null;
      }
      if (microphoneStream) {
        microphoneStream.getTracks().forEach(track => track.stop());
        microphoneStream = null;
      }
      if (audioContext) {
        await audioContext.close();
        audioContext = null;
      }
      if (localVideo) {
        localVideo.srcObject = null;
      }
      if (remoteVideo) {
        remoteVideo.srcObject = null;
      }
      if (rtcConnection) {
        rtcConnection.close();
        rtcConnection = null;
      }
      if (rtcLoopbackConnection) {
        rtcLoopbackConnection.close();
        rtcLoopbackConnection = null;
      }
      loopbackStream = new MediaStream();
      stopCurrentAudio();
      startButton.textContent = 'Begin Call';
      isListening = false;
      clearTimeout(animationId); // matches the setTimeout-based visualizer loop
      addLog('System: Stopped listening.');
      addLog('System: Microphone closed');
    }
    startButton.addEventListener('click', toggleListening);
    clearLogsButton.addEventListener('click', () => {
      logsDiv.innerHTML = '';
    });

    createVisualizer();
    initializePipelines();
  </script>
</body>
</html>