Previous one is working but slow
script1.js CHANGED (+361 -316)
@@ -1,57 +1,6 @@
-// script1.js
-import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
-
-const video = document.getElementById('webcam');
-let gradioApp; // Declare gradioApp outside the function
-
-const GRADIO_CLIENTS = [
-  "multimodalart/Florence-2-l4",
-  "gokaygokay/Florence-2",
-  "multimodalart/Florence-2-l4-2",
-  "gokaygokay/Florence-2",
-];
-
-async function startWebcam() {
-  try {
-    const stream = await navigator.mediaDevices.getUserMedia({ video: true });
-    video.srcObject = stream;
-  } catch (error) {
-    console.error("Error accessing webcam:", error);
-  }
-}
-
-async function getCaption() {
-  if (!gradioApp) {
-    try {
-      const randomClient = GRADIO_CLIENTS[Math.floor(Math.random() * GRADIO_CLIENTS.length)];
-      gradioApp = await client(randomClient);
-    } catch (error) {
-      console.error("Error loading Gradio client:", error);
-      return "Error getting caption"; // Or some other default
-    }
-  }
-
-  try {
-    const canvas = document.createElement('canvas');
-    canvas.width = video.videoWidth;
-    canvas.height = video.videoHeight;
-    const context = canvas.getContext('2d');
-    context.drawImage(video, 0, 0, canvas.width, canvas.height);
-    const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
-    const handledFile = await handle_file(blob);
-
-    const result = await gradioApp.predict("/process_image", [handledFile, "More Detailed Caption"]);
-    return result.data[0];
-  } catch (error) {
-    console.error("Error getting caption:", error);
-    return "Error getting caption"; // Or handle the error differently
-  }
-}
-
-
 // Constants and Configuration
 const USER_SPEECH_INTERRUPT_DELAY = 500;
-const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech";
+const TEXT_TO_SPEECH_API_ENDPOINT = "https://api.streamelements.com/kappa/v2/speech"; // Replace with your TTS endpoint
 const CHUNK_SIZE = 300;
 const MAX_PREFETCH_REQUESTS = 10;
 const PREFETCH_CACHE_EXPIRATION = 60000; // 1 minute
@@ -61,7 +10,7 @@ const AUDIO_CACHE_EXPIRATION = 3600000; // 1 hour
 const startStopButton = document.getElementById('startStopButton');
 const voiceSelectionDropdown = document.getElementById('voiceSelect');
 const modelSelectionDropdown = document.getElementById('modelSelect');
-const noiseSuppressionCheckbox = document.getElementById('noiseSuppression');
+const noiseSuppressionCheckbox = document.getElementById('noiseSuppression'); // Assuming you have this in your HTML
 const responseTimeDisplay = document.getElementById('responseTime');
 const userActivityIndicator = document.getElementById('userIndicator');
 const aiActivityIndicator = document.getElementById('aiIndicator');
@@ -95,6 +44,19 @@ let conversationHistory = [];
 // Audio Caching
 const audioCache = new Map();

+// Webcam and Gradio Integration
+import { client, handle_file } from 'https://cdn.jsdelivr.net/npm/@gradio/client/+esm';
+const video = document.getElementById('webcam');
+const clients = [
+  "multimodalart/Florence-2-l4",
+  "gokaygokay/Florence-2",
+  "multimodalart/Florence-2-l4-2",
+  "gokaygokay/Florence-2",
+]; // Or your preferred Gradio models
+let app;
+let lastCaption = "";
+
+
 // Utility Functions

 // Normalize query text
@@ -246,54 +208,51 @@ const cancelPrefetchRequests = (query) => {

 // AI Interaction Functions

+// Send a query to the AI
+const sendQueryToAI = async (query) => {
+  console.log("Sending query to AI:", query);
+  isRequestInProgress = true;
+  updateActivityIndicators();
+  firstResponseTextTimestamp = null;

+  const normalizedQuery = normalizeQueryText(query);
+  const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+
+  queryStartTime = Date.now();
+
+  // Check prefetch cache
+  if (prefetchCache.has(cacheKey)) {
+    const cachedData = prefetchCache.get(cacheKey);
+    if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
+      audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
+      playNextAudio();
+    } else {
+      prefetchCache.delete(cacheKey);
+    }
   }

+  requestAbortController = new AbortController();

+  try {
+    await streamAndHandleAudioResponse(query, voiceSelectionDropdown.value, requestAbortController.signal);
+  } catch (error) {
+    if (error.name !== 'AbortError') {
+      console.error("Error sending query to AI:", error);
+    }
+  } finally {
+    isRequestInProgress = false;
+    updateActivityIndicators();
   }
 };

 // Process the final speech transcript
 const processSpeechTranscript = (transcript) => {
+  const trimmedTranscript = transcript.trimStart();
+  if (trimmedTranscript !== '' && !isRequestInProgress) {
+    activeQuery = trimmedTranscript;
+    sendQueryToAI(activeQuery);
+    addToConversationHistory('user', activeQuery);
+  }
 };


@@ -301,288 +260,374 @@ const processSpeechTranscript = (transcript) => {

 // Stream AI response and handle audio
 const streamAndHandleAudioResponse = async (query, voice, abortSignal) => {
+  const response = await fetchAIResponse(query, abortSignal);
+
+  if (!response.ok) {
+    if (response.status === 429) {
+      console.log("Rate limit hit, retrying in 1 second...");
+      await new Promise(resolve => setTimeout(resolve, 1000));
+      await sendQueryToAI(query);
+      return;
+    }
+    throw new Error(`Network response was not ok: ${response.status}`);
   }

+  console.log("Streaming audio response received");
+  await handleStreamingResponse(response.body, voice, abortSignal);
 };

 // Stream AI response for prefetching
 const streamAndPrefetchAudio = async (query, voice, abortSignal) => {
+  const response = await fetchAIResponse(query, abortSignal);

+  if (!response.ok) throw new Error('Network response was not ok');

+  return handleStreamingResponseForPrefetch(response.body, voice, abortSignal);
 };

 // Fetch AI response
 const fetchAIResponse = async (query, abortSignal) => {
+  const userSambanovaKey = document.getElementById('apiKey').value.trim() !== '' ? document.getElementById('apiKey').value.trim() : 'none';
+
+  const url = '/stream_text';
+  const requestBody = {
+    query: query,
+    history: JSON.stringify(conversationHistory),
+    model: modelSelectionDropdown.value,
+    api_key: userSambanovaKey
+  };
+
+  return fetch(url, {
+    method: 'POST',
+    headers: {
+      'Accept': 'text/event-stream',
+      'Content-Type': 'application/json'
+    },
+    body: JSON.stringify(requestBody),
+    signal: abortSignal
+  });
 };

 // Handle the streaming response for prefetching
 const handleStreamingResponseForPrefetch = async (responseStream, voice, abortSignal) => {
+  const reader = responseStream.getReader();
+  const decoder = new TextDecoder("utf-8");
+  let buffer = "";
+
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
+
+      const chunk = decoder.decode(value, { stream: true });
+      buffer += chunk;
+      const lines = buffer.split('\n');
+
+      for (let i = 0; i < lines.length - 1; i++) {
+        const line = lines[i];
+        if (line.startsWith('data: ')) {
+          const textContent = line.substring(6).trim();
+          if (textContent) {
+            return await generateTextToSpeechAudio(textContent, voice);
+          }
+        }
+      }

+      buffer = lines[lines.length - 1];
+    }
+  } catch (error) {
+    console.error("Error in handleStreamingResponseForPrefetch:", error);
+  } finally {
+    reader.releaseLock();
   }

+  return null;
 };

 // Handle the streaming audio response
 const handleStreamingResponse = async (responseStream, voice, abortSignal) => {
+  const reader = responseStream.getReader();
+  const decoder = new TextDecoder("utf-8");
+  let buffer = "";
+  let initialChunksSent = 0;
+  let fullResponseText = "";
+  let fullResponseText2 = "";
+  let textChunk = "";
+  let sentText = "";
+
+  try {
+    while (true) {
+      const { done, value } = await reader.read();
+      if (done) break;
+      if (abortSignal.aborted) throw new DOMException('Request aborted', 'AbortError');
+
+      if (isUserSpeaking) {
+        interruptAudioPlayback('user is speaking');
+        break;
+      }

+      const chunk = decoder.decode(value, { stream: true });
+      buffer += chunk;
+      const lines = buffer.split('\n');
+
+      for (let i = 0; i < lines.length - 1; i++) {
+        const line = lines[i];
+        if (line.startsWith('data: ')) {
+          const textContent = line.substring(6).trim();
+          if (textContent) {
+            if (!firstResponseTextTimestamp) firstResponseTextTimestamp = Date.now();
+
+            fullResponseText += textContent + " ";
+            fullResponseText2 += textContent + " ";
+            textChunk += textContent + " ";
+            transcriptDiv.textContent = fullResponseText2;
+
+            if (initialChunksSent < 2) {
+              const audioUrl = await generateTextToSpeechAudio(textContent, voice);
+              if (audioUrl) {
+                audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+                if (!currentAudio) playNextAudio();
+              }
+              sentText += textContent + " ";
+              initialChunksSent++;
+            } else {
+              let unsentTextChunk = textChunk.replace(sentText, '').trim();
+
+              if (unsentTextChunk.length >= CHUNK_SIZE) {
+                const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
+                if (audioUrl) {
+                  audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+                  if (!currentAudio) playNextAudio();
+                }
+                textChunk = "";
+              }
+            }
+
+            if (fullResponseText !== '') {
+              fullResponseText = '';
+            }
+          }
         }
       }

+      buffer = lines[lines.length - 1];
+    }
+  } catch (error) {
+    console.error("Error in handleStreamingResponse:", error);
+  } finally {
+    reader.releaseLock();
+
+    let unsentTextChunk = textChunk.replace(sentText, '').trim();
+    if (unsentTextChunk !== "") {
+      const audioUrl = await generateTextToSpeechAudio(unsentTextChunk, voice);
+      if (audioUrl) {
+        audioPlaybackQueue.push({ url: audioUrl, isPrefetched: false });
+        if (!currentAudio) playNextAudio();
       }
     }

+    if (fullResponseText !== '') {
+      fullResponseText = '';
+    }
+    if (fullResponseText2 !== '') {
+      addToConversationHistory('assistant', fullResponseText2);
+      fullResponseText2 = '';
+    }
   }
 };

 // Generate Text-to-Speech audio with caching
 const generateTextToSpeechAudio = async (text, voice) => {
+  const normalizedText = normalizeQueryText(text);
+  const cacheKey = `${normalizedText}-${voice}`;
+
+  if (audioCache.has(cacheKey)) {
+    const cachedData = audioCache.get(cacheKey);
+    if (Date.now() - cachedData.timestamp < AUDIO_CACHE_EXPIRATION) {
+      return cachedData.url;
+    } else {
+      audioCache.delete(cacheKey);
+    }
   }

+  try {
+    const response = await fetch(`${TEXT_TO_SPEECH_API_ENDPOINT}?voice=${voice}&text=${encodeURIComponent(text)}`, { method: 'GET' });
+    if (!response.ok) throw new Error('Network response was not ok');
+    const audioBlob = await response.blob();
+    const audioUrl = URL.createObjectURL(audioBlob);

+    audioCache.set(cacheKey, { url: audioUrl, timestamp: Date.now() });
+    return audioUrl;
+  } catch (error) {
+    console.error("Error generating TTS audio:", error);
+    return null;
+  }
 };


 // Speech Recognition Initialization

 if ('webkitSpeechRecognition' in window) {
+  speechRecognizer = new webkitSpeechRecognition();
+  Object.assign(speechRecognizer, {
+    continuous: true,
+    interimResults: true,
+    language: 'en-US',
+    maxAlternatives: 3
+  });
+
+  speechRecognizer.onstart = () => {
+    console.log("Speech recognition started");
     isUserSpeaking = true;
     lastUserSpeechTimestamp = Date.now();
     updateActivityIndicators();
+    startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
+  };
+
+  speechRecognizer.onresult = (event) => {
+    let interimTranscript = '';
+    for (let i = event.resultIndex; i < event.results.length; i++) {
+      const transcript = event.results[i][0].transcript;
+      if (event.results[i].isFinal) {
+        interruptAudioPlayback('final');
+        processSpeechTranscript(transcript);
+        isUserSpeaking = false;
+        updateActivityIndicators();
+        queryStartTime = Date.now();
+      } else {
+        interimTranscript += transcript;
+        isUserSpeaking = true;
+        lastUserSpeechTimestamp = Date.now();
+        updateActivityIndicators();
+
+        if (interimTranscript.length > prefetchTextQuery.length + 5) {
+          cancelPrefetchRequests(prefetchTextQuery);
+        }
+        prefetchTextQuery = interimTranscript;
+        prefetchFirstAudioChunk(interimTranscript, voiceSelectionDropdown.value);

+        if (isRequestInProgress && shouldInterruptAudioPlayback(interimTranscript)) {
+          interruptAudioPlayback('interim');
+        }
+      }
     }
+  };
+
+  speechRecognizer.onerror = (event) => {
+    console.error('Speech recognition error:', event.error);
+    if (isSpeechRecognitionActive) speechRecognizer.start();
+  };
+
+  speechRecognizer.onend = () => {
+    isUserSpeaking = false;
+    updateActivityIndicators();

+    if (isSpeechRecognitionActive) speechRecognizer.start();
+  };
+
+  startStopButton.addEventListener('click', () => {
+    if (isSpeechRecognitionActive) {
+      speechRecognizer.stop();
+      isSpeechRecognitionActive = false;
+      startStopButton.innerHTML = '<svg id="microphoneIcon" xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Start Listening';
+    } else {
+      speechRecognizer.start();
+      isSpeechRecognitionActive = true;
+      startStopButton.innerHTML = '<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M9 9h6v6h-6z"></path><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path><path d="M19 10v2a7 7 0 0 1-14 0v-2"></path><line x1="12" y1="19" x2="12" y2="23"></line><line x1="8" y1="23" x2="16" y2="23"></line></svg> Stop Listening';
     }
+  });
+} else {
+  alert('Your browser does not support the Web Speech API.');
+}
+
+setInterval(updateLatency, 100);
+
+// Webcam Functions
+
+async function startWebcam() {
+  try {
+    const stream = await navigator.mediaDevices.getUserMedia({ video: true });
+    video.srcObject = stream;
+    setInterval(captureAndProcessImage, 5000); // Adjust interval as needed
+  } catch (error) {
+    console.error("Error accessing webcam: ", error);
+    // Consider adding user feedback here, e.g., alert or display a message.
   }
+}


+async function captureAndProcessImage() {
+  const canvas = document.createElement('canvas');
+  canvas.width = video.videoWidth;
+  canvas.height = video.videoHeight;
+  const context = canvas.getContext('2d');
+  context.drawImage(video, 0, 0, canvas.width, canvas.height);

+  const blob = await new Promise(resolve => canvas.toBlob(resolve, 'image/png'));
+  await processWithGradio(blob);
+}

+
+async function processWithGradio(imageBlob) {
+  try {
+    const randomClient = clients[Math.floor(Math.random() * clients.length)];
+    app = await client(randomClient);
+    const handledFile = await handle_file(imageBlob);
+
+    const result = await app.predict("/process_image", [handledFile, "Detailed Caption"]);
+
+    const dataString = result.data[0]; // Assuming the caption is the first element in the response
+    lastCaption = dataString || ""; // Handle potential errors
+  } catch (error) {
+    console.error("Error processing with Gradio:", error);
+    // Add error handling here (e.g., display a message to the user).
+    lastCaption = ""; // Reset caption if there's an error.
   }
 }



+// Modify sendQueryToAI to include the caption
+async function sendQueryToAI(query) {
+  console.log("Sending query to AI:", query);
+  isRequestInProgress = true;
+  updateActivityIndicators();
+  firstResponseTextTimestamp = null;
+
+  const normalizedQuery = normalizeQueryText(query);
+  const cacheKey = generateCacheKey(normalizedQuery, modelSelectionDropdown.value, conversationHistory, modelSelectionDropdown.value);
+
+  queryStartTime = Date.now();
+
+  // Check prefetch cache
+  if (prefetchCache.has(cacheKey)) {
+    const cachedData = prefetchCache.get(cacheKey);
+    if (Date.now() - cachedData.timestamp < PREFETCH_CACHE_EXPIRATION) {
+      audioPlaybackQueue.push({ url: cachedData.url, isPrefetched: true });
+      playNextAudio();
+    } else {
+      prefetchCache.delete(cacheKey);
+    }
+  }
+
+  requestAbortController = new AbortController();
+
+  try {
+    const combinedQuery = `{USER: "${query}"}, ${lastCaption}, {USER: "${query}"}`;
+    await streamAndHandleAudioResponse(combinedQuery, voiceSelectionDropdown.value, requestAbortController.signal);
+  } catch (error) {
+    if (error.name !== 'AbortError') {
+      console.error("Error sending query to AI:", error);
+    }
+  } finally {
+    isRequestInProgress = false;
+    updateActivityIndicators();
+  }
+};
+
+
+// Initialize Webcam and Speech Recognition on Load
+window.onload = () => {
+  startWebcam();
+};