barreloflube committed on
Commit
70eeaf7
1 Parent(s): 099c588

Refactor code to update UI buttons in audio_tab()

playground/refs/audio.m4a ADDED
Binary file (268 kB). View file
 
playground/refs/audio.npy ADDED
Binary file (102 kB). View file
 
playground/refs/test.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
playground/refs/test.py ADDED
@@ -0,0 +1,330 @@
1
+ import fastapi
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
+ from silero_vad import get_speech_timestamps, load_silero_vad
6
+ import whisperx
7
+ import edge_tts
8
+ import gc
9
+ import logging
10
+ import time
11
+ import os
+ from openai import OpenAI
12
+ import threading
13
+
14
+ # Configure logging
15
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
+
17
+ # Configure FastAPI
18
+ app = fastapi.FastAPI()
19
+
20
+ # Load Silero VAD model
21
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
22
+ logging.info(f'Using device: {device}')
23
+ vad_model = load_silero_vad().to(device) # Ensure the model is on the correct device
24
+ logging.info('Loaded Silero VAD model')
25
+
26
+ # Load WhisperX model
27
+ whisper_model = whisperx.load_model("tiny", device, compute_type="float16")
28
+ logging.info('Loaded WhisperX model')
29
+
30
+ OPENAI_API_KEY = "sk-proj-gcrtuxd5qzaRYT82Ii3eT3BlbkFJpVQHBc9ZJrmSksLbQc3C" # os.getenv("OPENAI_API_KEY")
31
+ if not OPENAI_API_KEY:
32
+ logging.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
33
+ raise ValueError("OpenAI API key not found.")
34
+
35
+ # Initialize OpenAI client
36
+ openai_client = OpenAI(api_key=OPENAI_API_KEY)
37
+ logging.info('Initialized OpenAI client')
38
+
39
+ # TTS Voice
40
+ TTS_VOICE = "en-GB-SoniaNeural"
41
+
42
+ # Function to check voice activity using Silero VAD
43
+ def check_vad(audio_data, sample_rate):
44
+ logging.info('Checking voice activity')
45
+ # Resample to 16000 Hz if necessary
46
+ target_sample_rate = 16000
47
+ if sample_rate != target_sample_rate:
48
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
49
+ audio_tensor = resampler(torch.from_numpy(audio_data))
50
+ else:
51
+ audio_tensor = torch.from_numpy(audio_data)
52
+ audio_tensor = audio_tensor.to(device)
53
+
54
+ # Log audio data details
55
+ logging.info(f'Audio tensor shape: {audio_tensor.shape}, dtype: {audio_tensor.dtype}, device: {audio_tensor.device}')
56
+
57
+ # Get speech timestamps
58
+ speech_timestamps = get_speech_timestamps(audio_tensor, vad_model, sampling_rate=target_sample_rate)
59
+ logging.info(f'Found {len(speech_timestamps)} speech timestamps')
60
+ return len(speech_timestamps) > 0
61
+
62
+ # Function to transcribe audio using WhisperX
63
+ def transcript(audio_data, sample_rate):
64
+ logging.info('Transcribing audio')
65
+ # Resample to 16000 Hz if necessary
66
+ target_sample_rate = 16000
67
+ if sample_rate != target_sample_rate:
68
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
69
+ audio_data = resampler(torch.from_numpy(audio_data)).numpy()
70
+ else:
71
+ audio_data = audio_data
72
+
73
+ # Transcribe
74
+ batch_size = 16 # Adjust as needed
75
+ result = whisper_model.transcribe(audio_data, batch_size=batch_size)
76
+ text = result["segments"][0]["text"] if len(result["segments"]) > 0 else ""
77
+ logging.info(f'Transcription result: {text}')
78
+ # Clear GPU memory
79
+ del result
80
+ gc.collect()
81
+ if device == 'cuda':
82
+ torch.cuda.empty_cache()
83
+ return text
84
+
85
+ # Function to get streaming response from OpenAI API
86
+ def llm(text):
87
+ logging.info('Getting response from OpenAI API')
88
+ response = openai_client.chat.completions.create(
89
+ model="gpt-4o", # Updated to a more recent model
90
+ messages=[
91
+ {"role": "system", "content": "You respond to the following transcript from the conversation that you are having with the user."},
92
+ {"role": "user", "content": text}
93
+ ],
94
+ stream=True,
95
+ temperature=0.7, # Optional: Adjust as needed
96
+ top_p=0.9, # Optional: Adjust as needed
97
+ )
98
+ for chunk in response:
99
+ yield chunk.choices[0].delta.content
100
+
101
+ # Function to perform TTS per sentence using Edge-TTS
102
+ def tts_streaming(text_stream):
103
+ logging.info('Performing TTS')
104
+ buffer = ""
105
+ punctuation = {'.', '!', '?'}
106
+ for text_chunk in text_stream:
107
+ if text_chunk is not None:
108
+ buffer += text_chunk
109
+ # Check for sentence completion
110
+ sentences = []
111
+ start = 0
112
+ for i, char in enumerate(buffer):
113
+ if (char in punctuation):
114
+ sentences.append(buffer[start:i+1].strip())
115
+ start = i+1
116
+ buffer = buffer[start:]
117
+
118
+ for sentence in sentences:
119
+ if sentence:
120
+ communicate = edge_tts.Communicate(sentence, TTS_VOICE)
121
+ for chunk in communicate.stream_sync():
122
+ if chunk["type"] == "audio":
123
+ yield chunk["data"]
124
+ # Process any remaining text
125
+ if buffer.strip():
126
+ communicate = edge_tts.Communicate(buffer.strip(), TTS_VOICE)
127
+ for chunk in communicate.stream_sync():
128
+ if chunk["type"] == "audio":
129
+ yield chunk["data"]
130
+
131
+ # Function to handle LLM and TTS
132
+ def llm_and_tts(transcribed_text, state):
133
+ logging.info('Handling LLM and TTS')
134
+ # Get streaming response from LLM
135
+ for text_chunk in llm(transcribed_text):
136
+ if state.get('stop_signal'):
137
+ logging.info('LLM and TTS task stopped')
138
+ break
139
+ # Get audio data from TTS
140
+ for audio_chunk in tts_streaming([text_chunk]):
141
+ if state.get('stop_signal'):
142
+ logging.info('LLM and TTS task stopped during TTS')
143
+ break
144
+ yield np.frombuffer(audio_chunk, dtype=np.int16)
145
+
146
+ state = {
147
+ 'mode': 'idle',
148
+ 'chunk_queue': [],
149
+ 'transcription': '',
150
+ 'in_transcription': False,
151
+ 'previous_no_vad_audio': [],
152
+ 'llm_task': None,
153
+ 'instream': None,
154
+ 'stop_signal': False,
155
+ 'args': {
156
+ 'sample_rate': 16000,
157
+ 'chunk_size': 0.5, # seconds
158
+ 'transcript_chunk_size': 2, # seconds
159
+ }
160
+ }
161
+
162
+ def transcript_loop():
163
+ while True:
164
+ if len(state['chunk_queue']) > 0:
165
+ accumulated_audio = np.concatenate(state['chunk_queue'])
166
+ total_samples = sum(len(chunk) for chunk in state['chunk_queue'])
167
+ total_duration = total_samples / state['args']['sample_rate']
168
+
169
+ # Run transcription on the first 2 seconds if len > 3 seconds
170
+ if total_duration > 3.0 and state['in_transcription'] == True:
171
+ first_two_seconds_samples = int(2.0 * state['args']['sample_rate'])
172
+ first_two_seconds_audio = accumulated_audio[:first_two_seconds_samples]
173
+ transcribed_text = transcript(first_two_seconds_audio, state['args']['sample_rate'])
174
+ state['transcription'] += transcribed_text
175
+ remaining_audio = accumulated_audio[first_two_seconds_samples:]
176
+ state['chunk_queue'] = [remaining_audio]
177
+ else: # Run transcription on the accumulated audio
178
+ transcribed_text = transcript(accumulated_audio, state['args']['sample_rate'])
179
+ state['transcription'] += transcribed_text
180
+ state['chunk_queue'] = []
181
+ state['in_transcription'] = False
182
+ else:
183
+ time.sleep(0.1)
184
+
185
+ if len(state['chunk_queue']) == 0 and state['mode'] in ('idle', 'processing'):
186
+ state['in_transcription'] = False
187
+ break
188
+
189
+ def process_audio(audio_chunk):
190
+ # returns output audio
191
+
192
+ sample_rate, audio_data = audio_chunk
193
+ audio_data = np.array(audio_data, dtype=np.float32)
194
+
195
+ # convert to mono if necessary
196
+ if audio_data.ndim > 1:
197
+ audio_data = np.mean(audio_data, axis=1)
198
+
199
+ mode = state['mode']
200
+ chunk_queue = state['chunk_queue']
201
+ transcription = state['transcription']
202
+ in_transcription = state['in_transcription']
203
+ previous_no_vad_audio = state['previous_no_vad_audio']
204
+ llm_task = state['llm_task']
205
+ instream = state['instream']
206
+ stop_signal = state['stop_signal']
207
+ args = state['args']
208
+
209
+ args['sample_rate'] = sample_rate
210
+
211
+ # check for voice activity
212
+ vad = check_vad(audio_data, sample_rate)
213
+
214
+ if vad:
215
+ logging.info(f'Voice activity detected in mode: {mode}')
216
+ if mode == 'idle':
217
+ mode = 'listening'
218
+ elif mode == 'speaking':
219
+ # Stop llm and tts tasks
220
+ if llm_task and llm_task.is_alive():
221
+ # Implement task cancellation logic if possible
222
+ logging.info('Stopping LLM and TTS tasks')
223
+ # Since we cannot kill threads directly, we need to handle this in the tasks
224
+ stop_signal = True
225
+ llm_task.join()
226
+ mode = 'listening'
227
+
228
+ if mode == 'listening':
229
+ if previous_no_vad_audio is not None:
230
+ chunk_queue.append(previous_no_vad_audio)
231
+ previous_no_vad_audio = None
232
+ # Accumulate audio chunks
233
+ chunk_queue.append(audio_data)
234
+
235
+ # Start transcription thread if not already running
236
+ if not in_transcription:
237
+ in_transcription = True
238
+ transcription_task = threading.Thread(target=transcript_loop)
239
+ transcription_task.start()
240
+
241
+ elif mode == 'speaking':
242
+ # Continue accumulating audio chunks
243
+ chunk_queue.append(audio_data)
244
+ else:
245
+ logging.info(f'No voice activity detected in mode: {mode}')
246
+ if mode == 'listening':
247
+ # Add the last chunk to queue
248
+ chunk_queue.append(audio_data)
249
+
250
+ # Change mode to processing
251
+ mode = 'processing'
252
+
253
+ # Wait for transcription to complete
254
+ while in_transcription:
255
+ time.sleep(0.1)
256
+
257
+ # Check if transcription is complete
258
+ if len(chunk_queue) == 0:
259
+ # Start the LLM and TTS pipeline (llm_and_tts is a generator,
260
+ # so it is consumed below rather than wrapped in a Thread)
261
+ if llm_task is None:
262
+ stop_signal = False
263
+ llm_task = llm_and_tts(transcription, state)
264
+
265
+ if mode == 'processing':
266
+ # Once the LLM/TTS generator exists, start responding
267
+ if llm_task is not None:
268
+ mode = 'responding'
269
+
270
+ if mode == 'responding':
271
+ for audio_chunk in llm_task:
272
+ if instream is None:
273
+ instream = audio_chunk
274
+ else:
275
+ instream = np.concatenate((instream, audio_chunk))
276
+
277
+ # Send audio to output stream
278
+ yield instream
279
+
280
+ # Cleanup
281
+ llm_task = None
282
+ transcription = ''
283
+ mode = 'idle'
284
+
285
+ # Update state
286
+ state['mode'] = mode
287
+ state['chunk_queue'] = chunk_queue
288
+ state['transcription'] = transcription
289
+ state['in_transcription'] = in_transcription
290
+ state['previous_no_vad_audio'] = previous_no_vad_audio
291
+ state['llm_task'] = llm_task
292
+ state['instream'] = instream
293
+ state['stop_signal'] = stop_signal
294
+ state['args'] = args
295
+
296
+ # Store previous audio chunk with no voice activity
297
+ previous_no_vad_audio = audio_data
298
+
299
+ # Update state
300
+ state['mode'] = mode
301
+ state['chunk_queue'] = chunk_queue
302
+ state['transcription'] = transcription
303
+ state['in_transcription'] = in_transcription
304
+ state['previous_no_vad_audio'] = previous_no_vad_audio
305
+ state['llm_task'] = llm_task
306
+ state['instream'] = instream
307
+ state['stop_signal'] = stop_signal
308
+ state['args'] = args
309
+
310
+
311
+ @app.websocket('/ws')
312
+ async def websocket_endpoint(websocket: fastapi.WebSocket):
313
+ await websocket.accept()
314
+ logging.info('WebSocket connection established')
315
+ try:
316
+ while True:
317
+ audio_chunk = await websocket.receive_bytes()
318
+ if audio_chunk is None:
319
+ break
320
+ for audio_data in process_audio(audio_chunk):
321
+ await websocket.send_bytes(audio_data.tobytes())
322
+ except Exception as e:
323
+ logging.error(f'WebSocket error: {e}')
324
+ finally:
325
+ logging.info('WebSocket connection closed')
326
+ await websocket.close()
327
+
328
+ @app.get('/')
329
+ def index():
330
+ return fastapi.responses.FileResponse('index.html')
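Unlike playground/testapp/main.py, this prototype defines the FastAPI app but has no entry point. A minimal way to serve it locally could look like the sketch below (an assumption, not part of the commit; the module name "test" and the port are illustrative):

    # run_refs_server.py -- hypothetical launcher for playground/refs/test.py
    import uvicorn

    if __name__ == "__main__":
        # "test:app" assumes the prototype file is importable as test.py
        uvicorn.run("test:app", host="0.0.0.0", port=8000)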
playground/testapp/audio.mp3 ADDED
Binary file (386 kB). View file
 
playground/testapp/index.html ADDED
@@ -0,0 +1,79 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Voice Assistant</title>
7
+ <style>
8
+ body {
9
+ font-family: Arial, sans-serif;
10
+ margin: 20px;
11
+ }
12
+ #transcription {
13
+ margin-top: 20px;
14
+ padding: 10px;
15
+ border: 1px solid #ccc;
16
+ height: 150px;
17
+ overflow-y: auto;
18
+ }
19
+ #audio-player {
20
+ margin-top: 20px;
21
+ }
22
+ </style>
23
+ </head>
24
+ <body>
25
+ <h1>Voice Assistant</h1>
26
+ <button id="start-btn">Start Recording</button>
27
+ <button id="stop-btn" disabled>Stop Recording</button>
28
+ <div id="transcription"></div>
29
+ <audio id="audio-player" controls></audio>
30
+
31
+ <script>
32
+ const startBtn = document.getElementById('start-btn');
33
+ const stopBtn = document.getElementById('stop-btn');
34
+ const transcriptionDiv = document.getElementById('transcription');
35
+ const audioPlayer = document.getElementById('audio-player');
36
+ let websocket;
37
+ let mediaRecorder;
38
+ let audioChunks = [];
39
+
40
+ startBtn.addEventListener('click', async () => {
41
+ startBtn.disabled = true;
42
+ stopBtn.disabled = false;
43
+
44
+ websocket = new WebSocket('ws://localhost:8000/ws');
45
+ websocket.binaryType = 'arraybuffer';
46
+
47
+ websocket.onmessage = (event) => {
48
+ if (event.data instanceof ArrayBuffer) {
49
+ const audioBlob = new Blob([event.data], { type: 'audio/wav' });
50
+ audioPlayer.src = URL.createObjectURL(audioBlob);
51
+ audioPlayer.play();
52
+ } else {
53
+ transcriptionDiv.innerText += event.data + '\n';
54
+ }
55
+ };
56
+
57
+ const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
58
+ mediaRecorder = new MediaRecorder(stream);
59
+
60
+ mediaRecorder.ondataavailable = (event) => {
61
+ if (event.data.size > 0) {
62
+ audioChunks.push(event.data);
63
+ websocket.send(event.data);
64
+ }
65
+ };
66
+
67
+ mediaRecorder.start(1000); // Send audio data every second
68
+ });
69
+
70
+ stopBtn.addEventListener('click', () => {
71
+ startBtn.disabled = false;
72
+ stopBtn.disabled = true;
73
+
74
+ mediaRecorder.stop();
75
+ websocket.close();
76
+ });
77
+ </script>
78
+ </body>
79
+ </html>
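Note that MediaRecorder typically emits compressed webm/opus blobs, while the Python prototypes in this commit treat the incoming bytes as raw 16 kHz int16 PCM (or WAV). One way a server could normalize such a blob before VAD and transcription is sketched below; this is an assumption, not part of the commit, and it requires pydub with ffmpeg and a header-complete recording rather than an arbitrary mid-stream chunk:

    # Sketch: decode a browser recording blob to 16 kHz mono int16 PCM.
    from io import BytesIO
    import numpy as np
    from pydub import AudioSegment

    def blob_to_pcm16(blob: bytes):
        segment = AudioSegment.from_file(BytesIO(blob), format="webm")
        segment = segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
        return segment.frame_rate, np.array(segment.get_array_of_samples(), dtype=np.int16)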
playground/testapp/main.py ADDED
@@ -0,0 +1,292 @@
1
+ import fastapi
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
+ from silero_vad import get_speech_timestamps, load_silero_vad
6
+ import whisperx
7
+ import edge_tts
8
+ import gc
9
+ import logging
10
+ import time
11
+ import os
12
+ from openai import AsyncOpenAI
13
+ import asyncio
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
17
+
18
+ # Configure FastAPI
19
+ app = fastapi.FastAPI()
20
+
21
+ # Load Silero VAD model
22
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
23
+ logging.info(f'Using device: {device}')
24
+ vad_model = load_silero_vad().to(device)
25
+ logging.info('Loaded Silero VAD model')
26
+
27
+ # Load WhisperX model
28
+ whisper_model = whisperx.load_model("tiny", device, compute_type="float16")
29
+ logging.info('Loaded WhisperX model')
30
+
31
+ OPENAI_API_KEY = "sk-proj-gcrtuxd5qzaRYT82Ii3eT3BlbkFJpVQHBc9ZJrmSksLbQc3C"
32
+ if not OPENAI_API_KEY:
33
+ logging.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
34
+ raise ValueError("OpenAI API key not found.")
35
+ logging.info('Initialized OpenAI client')
36
+ aclient = AsyncOpenAI(api_key=OPENAI_API_KEY) # Corrected import
37
+
38
+ # TTS Voice
39
+ TTS_VOICE = "en-GB-SoniaNeural"
40
+
41
+ # Function to check voice activity using Silero VAD
42
+ def check_vad(audio_data, sample_rate):
43
+ logging.info('Checking voice activity')
44
+ target_sample_rate = 16000
45
+ if sample_rate != target_sample_rate:
46
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
47
+ audio_tensor = resampler(torch.from_numpy(audio_data))
48
+ else:
49
+ audio_tensor = torch.from_numpy(audio_data)
50
+ audio_tensor = audio_tensor.to(device)
51
+
52
+ speech_timestamps = get_speech_timestamps(audio_tensor, vad_model, sampling_rate=target_sample_rate)
53
+ logging.info(f'Found {len(speech_timestamps)} speech timestamps')
54
+ return len(speech_timestamps) > 0
55
+
56
+ # Async function to transcribe audio using WhisperX
57
+ def transcript_sync(audio_data, sample_rate):
58
+ logging.info('Transcribing audio')
59
+ target_sample_rate = 16000
60
+ if sample_rate != target_sample_rate:
61
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
62
+ audio_data = resampler(torch.from_numpy(audio_data)).numpy()
63
+ else:
64
+ audio_data = audio_data
65
+
66
+ batch_size = 16 # Adjust as needed
67
+ result = whisper_model.transcribe(audio_data, batch_size=batch_size)
68
+ text = result["segments"][0]["text"] if len(result["segments"]) > 0 else ""
69
+ logging.info(f'Transcription result: {text}')
70
+ del result
71
+ gc.collect()
72
+ if device == 'cuda':
73
+ torch.cuda.empty_cache()
74
+ return text
75
+
76
+ async def transcript(audio_data, sample_rate):
77
+ loop = asyncio.get_running_loop()
78
+ text = await loop.run_in_executor(None, transcript_sync, audio_data, sample_rate)
79
+ return text
80
+
81
+ # Async function to get streaming response from OpenAI API
82
+ async def llm(text):
83
+ logging.info('Getting response from OpenAI API')
84
+ response = await aclient.chat.completions.create(model="gpt-4", # Updated to a more recent model
85
+ messages=[
86
+ {"role": "system", "content": "You respond to the following transcript from the conversation that you are having with the user."},
87
+ {"role": "user", "content": text}
88
+ ],
89
+ stream=True,
90
+ temperature=0.7,
91
+ top_p=0.9)
92
+ async for chunk in response:
93
+ yield chunk.choices[0].delta.content
94
+
95
+ # Async function to perform TTS using Edge-TTS
96
+ async def tts_streaming(text_stream):
97
+ logging.info('Performing TTS')
98
+ buffer = ""
99
+ punctuation = {'.', '!', '?'}
100
+ for text_chunk in text_stream:
101
+ if text_chunk is not None:
102
+ buffer += text_chunk
103
+ # Check for sentence completion
104
+ sentences = []
105
+ start = 0
106
+ for i, char in enumerate(buffer):
107
+ if char in punctuation:
108
+ sentences.append(buffer[start:i+1].strip())
109
+ start = i+1
110
+ buffer = buffer[start:]
111
+
112
+ for sentence in sentences:
113
+ if sentence:
114
+ communicate = edge_tts.Communicate(sentence, TTS_VOICE)
115
+ async for chunk in communicate.stream():
116
+ if chunk["type"] == "audio":
117
+ yield chunk["data"]
118
+ # Process any remaining text
119
+ if buffer.strip():
120
+ communicate = edge_tts.Communicate(buffer.strip(), TTS_VOICE)
121
+ async for chunk in communicate.stream():
122
+ if chunk["type"] == "audio":
123
+ yield chunk["data"]
124
+
125
+ class Conversation:
126
+ def __init__(self):
127
+ self.mode = 'idle'
128
+ self.chunk_queue = []
129
+ self.transcription = ''
130
+ self.in_transcription = False
131
+ self.previous_no_vad_audio = None
132
+ self.llm_task = None
133
+ self.transcription_task = None
134
+ self.stop_signal = False
135
+ self.sample_rate = 16000 # default sample rate
136
+ self.instream = None
137
+
138
+ async def process_audio(self, audio_chunk):
139
+ sample_rate, audio_data = audio_chunk
140
+ self.sample_rate = sample_rate
141
+ audio_data = np.array(audio_data, dtype=np.float32)
142
+
143
+ # convert to mono if necessary
144
+ if audio_data.ndim > 1:
145
+ audio_data = np.mean(audio_data, axis=1)
146
+
147
+ # check for voice activity
148
+ vad = check_vad(audio_data, sample_rate)
149
+
150
+ if vad:
151
+ logging.info(f'Voice activity detected in mode: {self.mode}')
152
+ if self.mode == 'idle':
153
+ self.mode = 'listening'
154
+ elif self.mode == 'speaking':
155
+ # Stop llm and tts tasks
156
+ if self.llm_task and not self.llm_task.done():
157
+ logging.info('Stopping LLM and TTS tasks')
158
+ self.stop_signal = True
159
+ await self.llm_task
160
+ self.mode = 'listening'
161
+
162
+ if self.mode == 'listening':
163
+ if self.previous_no_vad_audio is not None:
164
+ self.chunk_queue.append(self.previous_no_vad_audio)
165
+ self.previous_no_vad_audio = None
166
+ # Accumulate audio chunks
167
+ self.chunk_queue.append(audio_data)
168
+
169
+ # Start transcription task if not already running
170
+ if not self.in_transcription:
171
+ self.in_transcription = True
172
+ self.transcription_task = asyncio.create_task(self.transcript_loop())
173
+
174
+ else:
175
+ logging.info(f'No voice activity detected in mode: {self.mode}')
176
+ if self.mode == 'listening':
177
+ # Add the last chunk to queue
178
+ self.chunk_queue.append(audio_data)
179
+
180
+ # Change mode to processing
181
+ self.mode = 'processing'
182
+
183
+ # Wait for transcription to complete
184
+ while self.in_transcription:
185
+ await asyncio.sleep(0.1)
186
+
187
+ # Check if transcription is complete
188
+ if len(self.chunk_queue) == 0:
189
+ # Start LLM and TTS tasks
190
+ if not self.llm_task or self.llm_task.done():
191
+ self.stop_signal = False
192
+ self.llm_task = self.llm_and_tts()
193
+ self.mode = 'responding'
194
+
195
+ if self.mode == 'responding':
196
+ async for audio_chunk in self.llm_task:
197
+ if self.instream is None:
198
+ self.instream = audio_chunk
199
+ else:
200
+ self.instream = np.concatenate((self.instream, audio_chunk))
201
+ # Send audio to output stream
202
+ yield self.instream
203
+
204
+ # Cleanup
205
+ self.llm_task = None
206
+ self.transcription = ''
207
+ self.mode = 'idle'
208
+ self.instream = None
209
+
210
+ # Store previous audio chunk with no voice activity
211
+ self.previous_no_vad_audio = audio_data
212
+
213
+ async def transcript_loop(self):
214
+ while True:
215
+ if len(self.chunk_queue) > 0:
216
+ accumulated_audio = np.concatenate(self.chunk_queue)
217
+ total_samples = len(accumulated_audio)
218
+ total_duration = total_samples / self.sample_rate
219
+
220
+ if total_duration > 3.0 and self.in_transcription == True:
221
+ first_two_seconds_samples = int(2.0 * self.sample_rate)
222
+ first_two_seconds_audio = accumulated_audio[:first_two_seconds_samples]
223
+ transcribed_text = await transcript(first_two_seconds_audio, self.sample_rate)
224
+ self.transcription += transcribed_text
225
+ remaining_audio = accumulated_audio[first_two_seconds_samples:]
226
+ self.chunk_queue = [remaining_audio]
227
+ else:
228
+ transcribed_text = await transcript(accumulated_audio, self.sample_rate)
229
+ self.transcription += transcribed_text
230
+ self.chunk_queue = []
231
+ self.in_transcription = False
232
+ else:
233
+ await asyncio.sleep(0.1)
234
+
235
+ if len(self.chunk_queue) == 0 and self.mode in ['idle', 'processing']:
236
+ self.in_transcription = False
237
+ break
238
+
239
+ async def llm_and_tts(self):
240
+ logging.info('Handling LLM and TTS')
241
+ async for text_chunk in llm(self.transcription):
242
+ if self.stop_signal:
243
+ logging.info('LLM and TTS task stopped')
244
+ break
245
+ async for audio_chunk in tts_streaming([text_chunk]):
246
+ if self.stop_signal:
247
+ logging.info('LLM and TTS task stopped during TTS')
248
+ break
249
+ yield np.frombuffer(audio_chunk, dtype=np.int16)
250
+
251
+ @app.websocket('/ws')
252
+ async def websocket_endpoint(websocket: fastapi.WebSocket):
253
+ await websocket.accept()
254
+ logging.info('WebSocket connection established')
255
+ conversation = Conversation()
256
+ audio_buffer = []
257
+ buffer_duration = 0.5 # 500ms
258
+ try:
259
+ while True:
260
+ audio_chunk_bytes = await websocket.receive_bytes()
261
+ if audio_chunk_bytes is None:
262
+ break
263
+
264
+ audio_chunk = (conversation.sample_rate, np.frombuffer(audio_chunk_bytes, dtype=np.int16))
265
+ audio_buffer.append(audio_chunk[1])
266
+
267
+ # Calculate the duration of the buffered audio
268
+ total_samples = sum(len(chunk) for chunk in audio_buffer)
269
+ total_duration = total_samples / conversation.sample_rate
270
+
271
+ if total_duration >= buffer_duration:
272
+ # Concatenate buffered audio chunks
273
+ buffered_audio = np.concatenate(audio_buffer)
274
+ audio_buffer = [] # Reset buffer
275
+
276
+ # Process the buffered audio
277
+ async for audio_data in conversation.process_audio((conversation.sample_rate, buffered_audio)):
278
+ if audio_data is not None:
279
+ await websocket.send_bytes(audio_data.tobytes())
280
+ except Exception as e:
281
+ logging.error(f'WebSocket error: {e}')
282
+ finally:
283
+ logging.info('WebSocket connection closed')
284
+ await websocket.close()
285
+
286
+ @app.get('/')
287
+ def index():
288
+ return fastapi.responses.FileResponse('index.html')
289
+
290
+ if __name__ == '__main__':
291
+ import uvicorn
292
+ uvicorn.run(app, host='0.0.0.0', port=8000)
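main.py interrupts an in-flight response by setting stop_signal and letting the llm_and_tts async generator notice it. A minimal sketch of that cooperative-cancellation pattern, with illustrative names that are not from the commit:

    # Sketch: cooperatively stopping an async generator, mirroring stop_signal.
    import asyncio

    async def audio_stream(stop: asyncio.Event):
        for i in range(100):
            if stop.is_set():          # cooperative check, like self.stop_signal
                break
            await asyncio.sleep(0.1)   # stands in for LLM/TTS work
            yield i

    async def main():
        stop = asyncio.Event()
        stream = audio_stream(stop)
        async for chunk in stream:
            if chunk == 3:             # e.g. new voice activity detected
                stop.set()
                break
        await stream.aclose()          # finalize the generator explicitly

    asyncio.run(main())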
playground/testapp/test.ipynb ADDED
@@ -0,0 +1,478 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import fastapi\n",
10
+ "import numpy as np\n",
11
+ "import torch\n",
12
+ "import torchaudio\n",
13
+ "from silero_vad import get_speech_timestamps, load_silero_vad\n",
14
+ "import whisperx\n",
15
+ "import edge_tts\n",
16
+ "import gc\n",
17
+ "import logging\n",
18
+ "import time\n",
19
+ "from openai import OpenAI\n",
20
+ "import threading\n",
21
+ "import asyncio\n",
22
+ "\n",
23
+ "# Configure logging\n",
24
+ "logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')\n",
25
+ "\n",
26
+ "# Configure FastAPI\n",
27
+ "app = fastapi.FastAPI()\n",
28
+ "\n",
29
+ "# Load Silero VAD model\n",
30
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
31
+ "logging.info(f'Using device: {device}')\n",
32
+ "vad_model = load_silero_vad().to(device) # Ensure the model is on the correct device\n",
33
+ "logging.info('Loaded Silero VAD model')\n",
34
+ "\n",
35
+ "# Load WhisperX model\n",
36
+ "whisper_model = whisperx.load_model(\"tiny\", device, compute_type=\"float16\")\n",
37
+ "logging.info('Loaded WhisperX model')\n",
38
+ "\n",
39
+ "# OpenAI API Key from environment variable for security\n",
40
+ "OPENAI_API_KEY = \"sk-proj-gcrtuxd5qzaRYT82Ii3eT3BlbkFJpVQHBc9ZJrmSksLbQc3C\" # os.getenv(\"OPENAI_API_KEY\")\n",
41
+ "if not OPENAI_API_KEY:\n",
42
+ " logging.error(\"OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.\")\n",
43
+ " raise ValueError(\"OpenAI API key not found.\")\n",
44
+ "\n",
45
+ "# Initialize OpenAI client\n",
46
+ "openai_client = OpenAI(api_key=OPENAI_API_KEY)\n",
47
+ "logging.info('Initialized OpenAI client')\n",
48
+ "\n",
49
+ "# TTS Voice\n",
50
+ "TTS_VOICE = \"en-GB-SoniaNeural\"\n",
51
+ "\n",
52
+ "# Function to check voice activity using Silero VAD\n",
53
+ "def check_vad(audio_data, sample_rate):\n",
54
+ " logging.info('Checking voice activity')\n",
55
+ " # Resample to 16000 Hz if necessary\n",
56
+ " target_sample_rate = 16000\n",
57
+ " if sample_rate != target_sample_rate:\n",
58
+ " resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)\n",
59
+ " audio_tensor = resampler(torch.from_numpy(audio_data))\n",
60
+ " else:\n",
61
+ " audio_tensor = torch.from_numpy(audio_data)\n",
62
+ " audio_tensor = audio_tensor.to(device)\n",
63
+ "\n",
64
+ " # Log audio data details\n",
65
+ " logging.info(f'Audio tensor shape: {audio_tensor.shape}, dtype: {audio_tensor.dtype}, device: {audio_tensor.device}')\n",
66
+ "\n",
67
+ " # Get speech timestamps\n",
68
+ " speech_timestamps = get_speech_timestamps(audio_tensor, vad_model, sampling_rate=target_sample_rate)\n",
69
+ " logging.info(f'Found {len(speech_timestamps)} speech timestamps')\n",
70
+ " return len(speech_timestamps) > 0\n",
71
+ "\n",
72
+ "# Function to transcribe audio using WhisperX\n",
73
+ "def transcript(audio_data, sample_rate):\n",
74
+ " logging.info('Transcribing audio')\n",
75
+ " # Resample to 16000 Hz if necessary\n",
76
+ " target_sample_rate = 16000\n",
77
+ " if sample_rate != target_sample_rate:\n",
78
+ " resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)\n",
79
+ " audio_data = resampler(torch.from_numpy(audio_data)).numpy()\n",
80
+ " else:\n",
81
+ " audio_data = audio_data\n",
82
+ "\n",
83
+ " # Transcribe\n",
84
+ " batch_size = 16 # Adjust as needed\n",
85
+ " result = whisper_model.transcribe(audio_data, batch_size=batch_size)\n",
86
+ " text = result[\"segments\"][0][\"text\"] if len(result[\"segments\"]) > 0 else \"\"\n",
87
+ " logging.info(f'Transcription result: {text}')\n",
88
+ " # Clear GPU memory\n",
89
+ " del result\n",
90
+ " gc.collect()\n",
91
+ " if device == 'cuda':\n",
92
+ " torch.cuda.empty_cache()\n",
93
+ " return text\n",
94
+ "\n",
95
+ "# Function to get streaming response from OpenAI API\n",
96
+ "def llm(text):\n",
97
+ " logging.info('Getting response from OpenAI API')\n",
98
+ " response = openai_client.chat.completions.create(\n",
99
+ " model=\"gpt-4o\", # Updated to a more recent model\n",
100
+ " messages=[\n",
101
+ " {\"role\": \"system\", \"content\": \"You respond to the following transcript from the conversation that you are having with the user.\"},\n",
102
+ " {\"role\": \"user\", \"content\": text} \n",
103
+ " ],\n",
104
+ " stream=True,\n",
105
+ " temperature=0.7, # Optional: Adjust as needed\n",
106
+ " top_p=0.9, # Optional: Adjust as needed\n",
107
+ " )\n",
108
+ " for chunk in response:\n",
109
+ " yield chunk.choices[0].delta.content\n",
110
+ "\n",
111
+ "# Function to perform TTS per sentence using Edge-TTS\n",
112
+ "def tts_streaming(text_stream):\n",
113
+ " logging.info('Performing TTS')\n",
114
+ " buffer = \"\"\n",
115
+ " punctuation = {'.', '!', '?'}\n",
116
+ " for text_chunk in text_stream:\n",
117
+ " if text_chunk is not None:\n",
118
+ " buffer += text_chunk\n",
119
+ " # Check for sentence completion\n",
120
+ " sentences = []\n",
121
+ " start = 0\n",
122
+ " for i, char in enumerate(buffer):\n",
123
+ " if (char in punctuation):\n",
124
+ " sentences.append(buffer[start:i+1].strip())\n",
125
+ " start = i+1\n",
126
+ " buffer = buffer[start:]\n",
127
+ "\n",
128
+ " for sentence in sentences:\n",
129
+ " if sentence:\n",
130
+ " communicate = edge_tts.Communicate(sentence, TTS_VOICE)\n",
131
+ " for chunk in communicate.stream_sync():\n",
132
+ " if chunk[\"type\"] == \"audio\":\n",
133
+ " yield chunk[\"data\"]\n",
134
+ " # Process any remaining text\n",
135
+ " if buffer.strip():\n",
136
+ " communicate = edge_tts.Communicate(buffer.strip(), TTS_VOICE)\n",
137
+ " for chunk in communicate.stream_sync():\n",
138
+ " if chunk[\"type\"] == \"audio\":\n",
139
+ " yield chunk[\"data\"]\n",
140
+ "\n",
141
+ "# Function to handle LLM and TTS\n",
142
+ "def llm_and_tts(transcribed_text):\n",
143
+ " logging.info('Handling LLM and TTS')\n",
144
+ " # Get streaming response from LLM\n",
145
+ " for text_chunk in llm(transcribed_text):\n",
146
+ " if state.get('stop_signal'):\n",
147
+ " logging.info('LLM and TTS task stopped')\n",
148
+ " break\n",
149
+ " # Get audio data from TTS\n",
150
+ " for audio_chunk in tts_streaming([text_chunk]):\n",
151
+ " if state.get('stop_signal'):\n",
152
+ " logging.info('LLM and TTS task stopped during TTS')\n",
153
+ " break\n",
154
+ " yield np.frombuffer(audio_chunk, dtype=np.int16)\n",
155
+ "\n",
156
+ "state = {\n",
157
+ " 'mode': 'idle',\n",
158
+ " 'chunk_queue': [],\n",
159
+ " 'transcription': '',\n",
160
+ " 'in_transcription': False,\n",
161
+ " 'previous_no_vad_audio': [],\n",
162
+ " 'llm_task': None,\n",
163
+ " 'instream': None,\n",
164
+ " 'stop_signal': False,\n",
165
+ " 'args': {\n",
166
+ " 'sample_rate': 16000,\n",
167
+ " 'chunk_size': 0.5, # seconds\n",
168
+ " 'transcript_chunk_size': 2, # seconds\n",
169
+ " }\n",
170
+ "}\n",
171
+ "\n",
172
+ "def transcript_loop():\n",
173
+ " while True:\n",
174
+ " if len(state['chunk_queue']) > 0:\n",
175
+ " accumulated_audio = np.concatenate(state['chunk_queue'])\n",
176
+ " total_samples = sum(len(chunk) for chunk in state['chunk_queue'])\n",
177
+ " total_duration = total_samples / state['args']['sample_rate']\n",
178
+ " \n",
179
+ " # Run transcription on the first 2 seconds if len > 3 seconds\n",
180
+ " if total_duration > 3.0 and state['in_transcription'] == True:\n",
181
+ " first_two_seconds_samples = int(2.0 * state['args']['sample_rate'])\n",
182
+ " first_two_seconds_audio = accumulated_audio[:first_two_seconds_samples]\n",
183
+ " transcribed_text = transcript(first_two_seconds_audio, state['args']['sample_rate'])\n",
184
+ " state['transcription'] += transcribed_text\n",
185
+ " remaining_audio = accumulated_audio[first_two_seconds_samples:]\n",
186
+ " state['chunk_queue'] = [remaining_audio]\n",
187
+ " else: # Run transcription on the accumulated audio\n",
188
+ " transcribed_text = transcript(accumulated_audio, state['args']['sample_rate'])\n",
189
+ " state['transcription'] += transcribed_text\n",
190
+ " state['chunk_queue'] = []\n",
191
+ " state['in_transcription'] = False\n",
192
+ " else:\n",
193
+ " time.sleep(0.1)\n",
194
+ "\n",
195
+ " if len(state['chunk_queue']) == 0 and state['mode'] == any(['idle', 'processing']):\n",
196
+ " state['in_transcription'] = False\n",
197
+ " break\n",
198
+ "\n",
199
+ "def process_audio(audio_chunk):\n",
200
+ " # returns output audio\n",
201
+ " \n",
202
+ " sample_rate, audio_data = audio_chunk\n",
203
+ " audio_data = np.array(audio_data, dtype=np.float32)\n",
204
+ " \n",
205
+ " # convert to mono if necessary\n",
206
+ " if audio_data.ndim > 1:\n",
207
+ " audio_data = np.mean(audio_data, axis=1)\n",
208
+ "\n",
209
+ " mode = state['mode']\n",
210
+ " chunk_queue = state['chunk_queue']\n",
211
+ " transcription = state['transcription']\n",
212
+ " in_transcription = state['in_transcription']\n",
213
+ " previous_no_vad_audio = state['previous_no_vad_audio']\n",
214
+ " llm_task = state['llm_task']\n",
215
+ " instream = state['instream']\n",
216
+ " stop_signal = state['stop_signal']\n",
217
+ " args = state['args']\n",
218
+ " \n",
219
+ " args['sample_rate'] = sample_rate\n",
220
+ " \n",
221
+ " # check for voice activity\n",
222
+ " vad = check_vad(audio_data, sample_rate)\n",
223
+ " \n",
224
+ " if vad:\n",
225
+ " logging.info(f'Voice activity detected in mode: {mode}')\n",
226
+ " if mode == 'idle':\n",
227
+ " mode = 'listening'\n",
228
+ " elif mode == 'speaking':\n",
229
+ " # Stop llm and tts tasks\n",
230
+ " if llm_task and llm_task.is_alive():\n",
231
+ " # Implement task cancellation logic if possible\n",
232
+ " logging.info('Stopping LLM and TTS tasks')\n",
233
+ " # Since we cannot kill threads directly, we need to handle this in the tasks\n",
234
+ " stop_signal = True\n",
235
+ " llm_task.join()\n",
236
+ " mode = 'listening'\n",
237
+ "\n",
238
+ " if mode == 'listening':\n",
239
+ " if previous_no_vad_audio is not None:\n",
240
+ " chunk_queue.append(previous_no_vad_audio)\n",
241
+ " previous_no_vad_audio = None\n",
242
+ " # Accumulate audio chunks\n",
243
+ " chunk_queue.append(audio_data)\n",
244
+ " \n",
245
+ " # Start transcription thread if not already running\n",
246
+ " if not in_transcription:\n",
247
+ " in_transcription = True\n",
248
+ " transcription_task = threading.Thread(target=transcript_loop)\n",
249
+ " transcription_task.start()\n",
250
+ " \n",
251
+ " elif mode == 'speaking':\n",
252
+ " # Continue accumulating audio chunks\n",
253
+ " chunk_queue.append(audio_data)\n",
254
+ " else:\n",
255
+ " logging.info(f'No voice activity detected in mode: {mode}')\n",
256
+ " if mode == 'listening':\n",
257
+ " # Add the last chunk to queue\n",
258
+ " chunk_queue.append(audio_data)\n",
259
+ " \n",
260
+ " # Change mode to processing\n",
261
+ " mode = 'processing'\n",
262
+ " \n",
263
+ " # Wait for transcription to complete\n",
264
+ " while in_transcription:\n",
265
+ " time.sleep(0.1)\n",
266
+ " \n",
267
+ " # Check if transcription is complete\n",
268
+ " if len(chunk_queue) == 0:\n",
269
+ " # Start LLM and TTS tasks\n",
270
+ " if not llm_task or not llm_task.is_alive():\n",
271
+ " stop_signal = False\n",
272
+ " llm_task = threading.Thread(target=llm_and_tts, args=(transcription))\n",
273
+ " llm_task.start()\n",
274
+ " \n",
275
+ " if mode == 'processing':\n",
276
+ " # Wait for LLM and TTS tasks to start yielding audio\n",
277
+ " if llm_task and llm_task.is_alive():\n",
278
+ " mode = 'responding'\n",
279
+ " \n",
280
+ " if mode == 'responding':\n",
281
+ " for audio_chunk in llm_task:\n",
282
+ " if instream is None:\n",
283
+ " instream = audio_chunk\n",
284
+ " else:\n",
285
+ " instream = np.concatenate((instream, audio_chunk))\n",
286
+ " \n",
287
+ " # Send audio to output stream\n",
288
+ " yield instream\n",
289
+ " \n",
290
+ " # Cleanup\n",
291
+ " llm_task = None\n",
292
+ " transcription = ''\n",
293
+ " mode = 'idle'\n",
294
+ " \n",
295
+ " # Updaate state\n",
296
+ " state['mode'] = mode\n",
297
+ " state['chunk_queue'] = chunk_queue\n",
298
+ " state['transcription'] = transcription\n",
299
+ " state['in_transcription'] = in_transcription\n",
300
+ " state['previous_no_vad_audio'] = previous_no_vad_audio\n",
301
+ " state['llm_task'] = llm_task\n",
302
+ " state['instream'] = instream\n",
303
+ " state['stop_signal'] = stop_signal\n",
304
+ " state['args'] = args\n",
305
+ " \n",
306
+ " # Store previous audio chunk with no voice activity\n",
307
+ " previous_no_vad_audio = audio_data\n",
308
+ " \n",
309
+ " # Update state\n",
310
+ " state['mode'] = mode\n",
311
+ " state['chunk_queue'] = chunk_queue\n",
312
+ " state['transcription'] = transcription\n",
313
+ " state['in_transcription'] = in_transcription\n",
314
+ " state['previous_no_vad_audio'] = previous_no_vad_audio\n",
315
+ " state['llm_task'] = llm_task\n",
316
+ " state['instream'] = instream\n",
317
+ " state['stop_signal'] = stop_signal\n",
318
+ " state['args'] = args"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": null,
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": [
327
+ "# 1. Load audio.mp3\n",
328
+ "# 2. Split audio into chunks\n",
329
+ "# 3. Process each chunk inside a loop\n",
330
+ "\n",
331
+ "# Split audio into chunks of 500 ms or less\n",
332
+ "from pydub import AudioSegment\n",
333
+ "audio_segment = AudioSegment.from_file('audio.mp3')\n",
334
+ "chunks = [chunk for chunk in audio_segment[::500]]\n",
335
+ "chunks[0]\n",
336
+ "chunks = [(chunk.frame_rate, np.array(chunk.get_array_of_samples(), dtype=np.int16)) for chunk in chunks]\n",
337
+ "\n",
338
+ "output_audio = []\n",
339
+ "# Process each chunk\n",
340
+ "for chunk in chunks:\n",
341
+ " for audio_chunk in process_audio(chunk):\n",
342
+ " output_audio.append(audio_chunk)"
343
+ ]
344
+ },
345
+ {
346
+ "cell_type": "code",
347
+ "execution_count": null,
348
+ "metadata": {},
349
+ "outputs": [],
350
+ "source": [
351
+ "output_audio"
352
+ ]
353
+ },
354
+ {
355
+ "cell_type": "code",
356
+ "execution_count": null,
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "import asyncio\n",
361
+ "import websockets\n",
362
+ "from pydub import AudioSegment\n",
363
+ "import numpy as np\n",
364
+ "import simpleaudio as sa\n",
365
+ "\n",
366
+ "# Constants\n",
367
+ "AUDIO_FILE = 'audio.mp3' # Input audio file\n",
368
+ "CHUNK_DURATION_MS = 250 # Duration of each chunk in milliseconds\n",
369
+ "WEBSOCKET_URI = 'ws://localhost:8000/ws' # WebSocket endpoint\n",
370
+ "\n",
371
+ "async def send_audio_chunks(uri):\n",
372
+ " # Load audio file using pydub\n",
373
+ " audio = AudioSegment.from_file(AUDIO_FILE)\n",
374
+ "\n",
375
+ " # Ensure audio is mono and 16kHz\n",
376
+ " if audio.channels > 1:\n",
377
+ " audio = audio.set_channels(1)\n",
378
+ " if audio.frame_rate != 16000:\n",
379
+ " audio = audio.set_frame_rate(16000)\n",
380
+ " if audio.sample_width != 2: # 2 bytes for int16\n",
381
+ " audio = audio.set_sample_width(2)\n",
382
+ "\n",
383
+ " # Split audio into chunks\n",
384
+ " chunks = [audio[i:i+CHUNK_DURATION_MS] for i in range(0, len(audio), CHUNK_DURATION_MS)]\n",
385
+ "\n",
386
+ " # Store received audio data\n",
387
+ " received_audio_data = b''\n",
388
+ "\n",
389
+ " async with websockets.connect(uri) as websocket:\n",
390
+ " print(\"Connected to server.\")\n",
391
+ " for idx, chunk in enumerate(chunks):\n",
392
+ " # Get raw audio data\n",
393
+ " raw_data = chunk.raw_data\n",
394
+ "\n",
395
+ " # Send audio chunk to server\n",
396
+ " await websocket.send(raw_data)\n",
397
+ " print(f\"Sent chunk {idx+1}/{len(chunks)}\")\n",
398
+ "\n",
399
+ " # Receive response (non-blocking)\n",
400
+ " try:\n",
401
+ " response = await asyncio.wait_for(websocket.recv(), timeout=0.1)\n",
402
+ " if isinstance(response, bytes):\n",
403
+ " received_audio_data += response\n",
404
+ " print(f\"Received audio data of length {len(response)} bytes\")\n",
405
+ " except asyncio.TimeoutError:\n",
406
+ " pass # No response received yet\n",
407
+ "\n",
408
+ " # Simulate real-time by waiting for chunk duration\n",
409
+ " await asyncio.sleep(CHUNK_DURATION_MS / 1000.0)\n",
410
+ "\n",
411
+ " # Send a final empty message to indicate end of transmission\n",
412
+ " await websocket.send(b'')\n",
413
+ " print(\"Finished sending audio. Waiting for responses...\")\n",
414
+ "\n",
415
+ " # Receive any remaining responses\n",
416
+ " while True:\n",
417
+ " try:\n",
418
+ " response = await asyncio.wait_for(websocket.recv(), timeout=1)\n",
419
+ " if isinstance(response, bytes):\n",
420
+ " received_audio_data += response\n",
421
+ " print(f\"Received audio data of length {len(response)} bytes\")\n",
422
+ " except asyncio.TimeoutError:\n",
423
+ " print(\"No more responses. Closing connection.\")\n",
424
+ " break\n",
425
+ "\n",
426
+ " print(\"Connection closed.\")\n",
427
+ "\n",
428
+ " # Save received audio data to a file or play it\n",
429
+ " if received_audio_data:\n",
430
+ " # Convert bytes to numpy array\n",
431
+ " audio_array = np.frombuffer(received_audio_data, dtype=np.int16)\n",
432
+ "\n",
433
+ " # Play audio using simpleaudio\n",
434
+ " play_obj = sa.play_buffer(audio_array, 1, 2, 16000)\n",
435
+ " play_obj.wait_done()\n",
436
+ "\n",
437
+ " # Optionally, save to a WAV file\n",
438
+ " output_audio = AudioSegment(\n",
439
+ " data=received_audio_data,\n",
440
+ " sample_width=2, # 2 bytes for int16\n",
441
+ " frame_rate=16000,\n",
442
+ " channels=1\n",
443
+ " )\n",
444
+ " output_audio.export(\"output_response.wav\", format=\"wav\")\n",
445
+ " print(\"Saved response audio to 'output_response.wav'\")\n",
446
+ " else:\n",
447
+ " print(\"No audio data received.\")\n",
448
+ "\n",
449
+ "def main():\n",
450
+ " asyncio.run(send_audio_chunks(WEBSOCKET_URI))\n",
451
+ "\n",
452
+ "if __name__ == '__main__':\n",
453
+ " main()"
454
+ ]
455
+ }
456
+ ],
457
+ "metadata": {
458
+ "kernelspec": {
459
+ "display_name": ".venv",
460
+ "language": "python",
461
+ "name": "python3"
462
+ },
463
+ "language_info": {
464
+ "codemirror_mode": {
465
+ "name": "ipython",
466
+ "version": 3
467
+ },
468
+ "file_extension": ".py",
469
+ "mimetype": "text/x-python",
470
+ "name": "python",
471
+ "nbconvert_exporter": "python",
472
+ "pygments_lexer": "ipython3",
473
+ "version": "3.10.12"
474
+ }
475
+ },
476
+ "nbformat": 4,
477
+ "nbformat_minor": 2
478
+ }
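The notebooks and scripts above assume a working environment; roughly the following third-party packages are needed (a best-guess list, nothing is pinned by this commit):

    # Assumed environment (package names are a guess; the commit pins nothing):
    # pip install fastapi uvicorn numpy torch torchaudio silero-vad whisperx \
    #             edge-tts openai pydub websockets simpleaudio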
playground/testapp/test.py ADDED
@@ -0,0 +1,257 @@
1
+ import fastapi
2
+ import numpy as np
3
+ import torch
4
+ import torchaudio
5
+ from silero_vad import get_speech_timestamps, load_silero_vad
6
+ import whisperx
7
+ import edge_tts
8
+ import gc
9
+ import logging
10
+ import time
11
+ import os
12
+ from openai import OpenAI
13
+ import asyncio
14
+ from pydub import AudioSegment
15
+ from io import BytesIO
16
+ import threading
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
20
+
21
+ # Configure FastAPI
22
+ app = fastapi.FastAPI()
23
+
24
+ # Load Silero VAD model
25
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
26
+ logging.info(f'Using device: {device}')
27
+ vad_model = load_silero_vad().to(device)
28
+ logging.info('Loaded Silero VAD model')
29
+
30
+ # Load WhisperX model
31
+ whisper_model = whisperx.load_model("tiny", device, compute_type="float16")
32
+ logging.info('Loaded WhisperX model')
33
+
34
+ OPENAI_API_KEY = "sk-proj-gcrtuxd5qzaRYT82Ii3eT3BlbkFJpVQHBc9ZJrmSksLbQc3C"
35
+ if not OPENAI_API_KEY:
36
+ logging.error("OpenAI API key not found. Please set the OPENAI_API_KEY environment variable.")
37
+ raise ValueError("OpenAI API key not found.")
38
+ logging.info('Initialized OpenAI client')
39
+ llm_client = OpenAI(api_key=OPENAI_API_KEY) # Corrected import
40
+
41
+ # TTS Voice
42
+ TTS_VOICE = "en-GB-SoniaNeural"
43
+
44
+ # Function to check voice activity using Silero VAD
45
+ def check_vad(audio_data, sample_rate):
46
+ logging.info('Checking voice activity')
47
+ target_sample_rate = 16000
48
+ if sample_rate != target_sample_rate:
49
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
50
+ audio_tensor = resampler(torch.from_numpy(audio_data))
51
+ else:
52
+ audio_tensor = torch.from_numpy(audio_data)
53
+ audio_tensor = audio_tensor.to(device)
54
+
55
+ speech_timestamps = get_speech_timestamps(audio_tensor, vad_model, sampling_rate=target_sample_rate)
56
+ logging.info(f'Found {len(speech_timestamps)} speech timestamps')
57
+ return len(speech_timestamps) > 0
58
+
59
+ # Async function to transcribe audio using WhisperX
60
+ def transcribe(audio_data, sample_rate):
61
+ logging.info('Transcribing audio')
62
+ target_sample_rate = 16000
63
+ if sample_rate != target_sample_rate:
64
+ resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
65
+ audio_data = resampler(torch.from_numpy(audio_data)).numpy()
66
+ else:
67
+ audio_data = audio_data
68
+
69
+ batch_size = 16 # Adjust as needed
70
+ result = whisper_model.transcribe(audio_data, batch_size=batch_size)
71
+ text = result["segments"][0]["text"] if len(result["segments"]) > 0 else ""
72
+ logging.info(f'Transcription result: {text}')
73
+ del result
74
+ gc.collect()
75
+ if device == 'cuda':
76
+ torch.cuda.empty_cache()
77
+ return text
78
+
79
+ # Function to convert text to speech using Edge TTS and stream the audio
80
+ def tts_streaming(text_stream):
81
+ logging.info('Performing TTS')
82
+ buffer = ""
83
+ punctuation = {'.', '!', '?'}
84
+ for text_chunk in text_stream:
85
+ if text_chunk is not None:
86
+ buffer += text_chunk
87
+ # Check for sentence completion
88
+ sentences = []
89
+ start = 0
90
+ for i, char in enumerate(buffer):
91
+ if char in punctuation:
92
+ sentences.append(buffer[start:i+1].strip())
93
+ start = i+1
94
+ buffer = buffer[start:]
95
+
96
+ for sentence in sentences:
97
+ if sentence:
98
+ communicate = edge_tts.Communicate(sentence, TTS_VOICE)
99
+ for chunk in communicate.stream_sync():
100
+ if chunk["type"] == "audio":
101
+ yield chunk["data"]
102
+ # Process any remaining text
103
+ if buffer.strip():
104
+ communicate = edge_tts.Communicate(buffer.strip(), TTS_VOICE)
105
+ for chunk in communicate.stream_sync():
106
+ if chunk["type"] == "audio":
107
+ yield chunk["data"]
108
+
109
+ # Function to perform language model completion using OpenAI API
110
+ def llm(text):
111
+ logging.info('Getting response from OpenAI API')
112
+ response = llm_client.chat.completions.create(
113
+ model="gpt-4o", # Updated to a more recent model
114
+ messages=[
115
+ {"role": "system", "content": "You respond to the following transcript from the conversation that you are having with the user."},
116
+ {"role": "user", "content": text}
117
+ ],
118
+ stream=True,
119
+ temperature=0.7,
120
+ top_p=0.9
121
+ )
122
+ for chunk in response:
123
+ yield chunk.choices[0].delta.content
124
+
125
+ class Conversation:
126
+ def __init__(self):
127
+ self.mode = 'idle' # idle, listening, speaking
128
+ self.audio_stream = []
129
+ self.valid_chunk_queue = []
130
+ self.first_valid_chunk = None
131
+ self.last_valid_chunks = []
132
+ self.valid_chunk_transcriptions = ''
133
+ self.in_transcription = False
134
+ self.llm_n_tts_task = None
135
+ self.stop_signal = False
136
+ self.sample_rate = 0
137
+ self.out_audio_stream = []
138
+ self.chunk_buffer = 0.5 # seconds
139
+
140
+ def llm_n_tts(self):
141
+ for text_chunk in llm(self.transcription):
142
+ if self.stop_signal:
143
+ break
144
+ for audio_chunk in tts_streaming([text_chunk]):
145
+ if self.stop_signal:
146
+ break
147
+ self.out_audio_stream.append(np.frombuffer(audio_chunk, dtype=np.int16))
148
+
149
+ def process_audio_chunk(self, audio_chunk):
150
+ # Construct audio stream
151
+ audio_data = AudioSegment.from_file(BytesIO(audio_chunk), format="wav")
152
+ audio_data = np.array(audio_data.get_array_of_samples())
153
+ self.sample_rate = audio_data.frame_rate
154
+
155
+ # Check for voice activity
156
+ vad = check_vad(audio_data, self.sample_rate)
157
+
158
+ if vad: # Voice activity detected
159
+ if self.first_valid_chunk is not None:
160
+ self.valid_chunk_queue.append(self.first_valid_chunk)
161
+ self.first_valid_chunk = None
162
+ self.valid_chunk_queue.append(audio_chunk)
163
+
164
+ if len(self.valid_chunk_queue) > 2:
165
+ # i.e. 3 chunks: 1 non valid chunk + 2 valid chunks
166
+ # this is to ensure that the speaker is speaking
167
+ if self.mode == 'idle':
168
+ self.mode = 'listening'
169
+ elif self.mode == 'speaking':
170
+ # Stop llm and tts
171
+ if self.llm_n_tts_task is not None:
172
+ self.stop_signal = True
173
+ self.llm_n_tts_task.join() # wait for the running LLM/TTS thread to stop
174
+ self.stop_signal = False
175
+ self.mode = 'listening'
176
+
177
+ else: # No voice activity
178
+ if self.mode == 'listening':
179
+ self.last_valid_chunks.append(audio_chunk)
180
+
181
+ if len(self.last_valid_chunks) > 2:
182
+ # i.e. 2 chunks where the speaker stopped speaking, but we account for natural pauses
183
+ # so on the 1.5th second of no voice activity, we append the first 2 of the last valid chunks to the valid chunk queue
184
+ # stop listening and start speaking
185
+ self.valid_chunk_queue.extend(self.last_valid_chunks[:2])
186
+ self.last_valid_chunks = []
187
+
188
+ while len(self.valid_chunk_queue) > 0:
189
+ time.sleep(0.1)
190
+
191
+ self.mode = 'speaking'
192
+ self.llm_n_tts_task = threading.Thread(target=self.llm_n_tts)
193
+ self.llm_n_tts_task.start()
194
+
195
+ def transcribe_loop(self):
196
+ while True:
197
+ if self.mode == 'listening':
198
+ if len(self.valid_chunk_queue) > 0:
199
+ accumulated_chunks = np.concatenate(self.valid_chunk_queue)
200
+ total_duration = len(accumulated_chunks) / self.sample_rate
201
+
202
+ if total_duration >= 3.0 and self.in_transcription == True:
203
+ # i.e. we have at least 3 seconds of audio so we can start transcribing to reduce latency
204
+ first_2s_audio = accumulated_chunks[:int(2 * self.sample_rate)]
205
+ transcribed_text = transcribe(first_2s_audio, self.sample_rate)
206
+ self.valid_chunk_transcriptions += transcribed_text
207
+ self.valid_chunk_queue = [accumulated_chunks[int(2 * self.sample_rate):]]
208
+
209
+ if self.mode == any(['idle', 'speaking']):
210
+ # i.e. the request to stop transcription has been made
211
+ # so process the remaining audio
212
+ transcribed_text = transcribe(accumulated_chunks, self.sample_rate)
213
+ self.valid_chunk_transcriptions += transcribed_text
214
+ self.valid_chunk_queue = []
215
+ else:
216
+ time.sleep(0.1)
217
+
218
+ def stream_out_audio(self):
219
+ while True:
220
+ if len(self.out_audio_stream) > 0:
221
+ yield AudioSegment(data=self.out_audio_stream.pop(0), sample_width=2, frame_rate=self.sample_rate, channels=1).raw_data
222
+
223
+ @app.websocket("/ws")
224
+ async def websocket_endpoint(websocket: fastapi.WebSocket):
225
+ # Accept connection
226
+ await websocket.accept()
227
+
228
+ # Initialize conversation
229
+ conversation = Conversation()
230
+
231
+ # Start conversation threads
232
+ transcribe_thread = threading.Thread(target=conversation.transcribe_loop)
233
+ transcribe_thread.start()
234
+
235
+ # Process audio chunks
236
+ chunk_buffer_size = conversation.chunk_buffer
237
+ while True:
238
+ try:
239
+ audio_chunk = await websocket.receive_bytes()
240
+ conversation.process_audio_chunk(audio_chunk)
241
+
242
+ if conversation.mode == 'speaking':
243
+ for audio_chunk in conversation.stream_out_audio():
244
+ await websocket.send_bytes(audio_chunk)
245
+ else:
246
+ await websocket.send_bytes(b'')
247
+ except Exception as e:
248
+ logging.error(e)
249
+ break
250
+
251
+ @app.get("/")
252
+ async def index():
253
+ return fastapi.responses.FileResponse("index.html")
254
+
255
+ if __name__ == '__main__':
256
+ import uvicorn
257
+ uvicorn.run(app, host='0.0.0.0', port=8000)
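Finally, a quick way to sanity-check the Silero VAD path against the reference audio committed under playground/refs/ (a sketch; it assumes audio.npy holds a 16 kHz mono float waveform, which the commit does not document):

    # Sketch: standalone VAD smoke test against the committed reference audio.
    import numpy as np
    import torch
    from silero_vad import load_silero_vad, get_speech_timestamps

    model = load_silero_vad()
    wav = torch.from_numpy(np.load('playground/refs/audio.npy')).float()  # assumed 16 kHz mono
    print(get_speech_timestamps(wav, model, sampling_rate=16000))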