# main.py
import asyncio
import base64
import io
import logging
import os
from threading import Thread, Event  # Event is used for cooperative TTS cancellation
from typing import Optional
import time  # For timeout checks

import soundfile as sf
import torch
import uvicorn
import whisper
from fastapi import FastAPI, File, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from parler_tts import ParlerTTSForConditionalGeneration, ParlerTTSStreamer
from transformers import AutoTokenizer, GenerationConfig  # Keep transformers.GenerationConfig
import google.generativeai as genai
import numpy as np
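# Assumed (not pinned) dependencies for the imports above; adjust to your environment.
# Whisper additionally needs the ffmpeg binary on PATH to decode uploaded audio.
#   pip install fastapi uvicorn openai-whisper torch soundfile numpy transformers \
#       google-generativeai git+https://github.com/huggingface/parler-tts.git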
# --- Configuration ---
WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "tiny")
TTS_MODEL_NAME = "ai4bharat/indic-parler-tts"
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")  # Never hard-code API keys; supply via environment
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
attn_implementation = "flash_attention_2" if torch.cuda.is_available() else "eager"
torch_dtype_tts = torch.bfloat16 if DEVICE == "cuda" and torch.cuda.is_bf16_supported() else (torch.float16 if DEVICE == "cuda" else torch.float32)
torch_dtype_whisper = torch.float16 if DEVICE == "cuda" else torch.float32

TTS_DEFAULT_PARAMS = {
    "do_sample": True,
    "temperature": 1.0,
    "top_k": 50,
    "top_p": 0.95,
    "min_new_tokens": 5,  # Reduced for quicker start with streamer
    # "max_new_tokens": 256,  # Optional global cap
}
# --- Logging ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- FastAPI App Initialization ---
app = FastAPI(title="Conversational AI Chatbot with Enhanced Stream Abortion")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# --- Global Model Variables ---
whisper_model = None
gemini_model_instance = None
tts_model = None
tts_tokenizer = None
# The GenerationConfig object is built from TTS_DEFAULT_PARAMS inside the functions below
# (it could also be stored globally), using transformers.GenerationConfig.
# --- Model Loading & API Configuration ---
@app.on_event("startup")  # startup hook (assumed wiring) so models load when the app boots
async def load_resources():
    global whisper_model, tts_model, tts_tokenizer, gemini_model_instance
    logger.info(f"Loading local models. Whisper on {DEVICE} with {torch_dtype_whisper}, TTS on {DEVICE} with {torch_dtype_tts}")
    try:
        logger.info(f"Loading Whisper model: {WHISPER_MODEL_SIZE}")
        whisper_model = whisper.load_model(WHISPER_MODEL_SIZE, device=DEVICE)
        logger.info("Whisper model loaded successfully.")

        logger.info(f"Loading IndicParler-TTS model: {TTS_MODEL_NAME}")
        tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_NAME, attn_implementation=attn_implementation).to(DEVICE, dtype=torch_dtype_tts)
        tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_NAME)
        if tts_tokenizer and tts_tokenizer.pad_token_id is not None:
            TTS_DEFAULT_PARAMS["pad_token_id"] = tts_tokenizer.pad_token_id
        # ParlerTTS signals the end of generation with its own special tokens, so eos_token_id
        # (a text-model concept) is intentionally not set here.
        logger.info(f"IndicParler-TTS model loaded. Default generation params: {TTS_DEFAULT_PARAMS}")

        if not GEMINI_API_KEY:
            logger.warning("GEMINI_API_KEY not found. LLM functionality will be limited.")
        else:
            try:
                genai.configure(api_key=GEMINI_API_KEY)
                gemini_model_instance = genai.GenerativeModel(GEMINI_MODEL_NAME)
                logger.info(f"Gemini API configured with model: {GEMINI_MODEL_NAME}")
            except Exception as e:
                logger.error(f"Failed to configure Gemini API: {e}", exc_info=True)
                gemini_model_instance = None
    except Exception as e:
        logger.error(f"Error loading models: {e}", exc_info=True)
    logger.info("Local models and API configurations loaded.")
# --- Helper Functions ---
async def transcribe_audio_bytes(audio_bytes: bytes) -> str:
    if not whisper_model:
        raise RuntimeError("Whisper model not loaded.")
    temp_audio_path = f"temp_audio_main_{os.urandom(4).hex()}.wav"
    try:
        with open(temp_audio_path, "wb") as f:
            f.write(audio_bytes)
        result = whisper_model.transcribe(temp_audio_path, fp16=(DEVICE == "cuda" and torch_dtype_whisper == torch.float16))
        transcribed_text = result["text"].strip()
        logger.info(f"Transcription: {transcribed_text}")
        return transcribed_text
    except Exception as e:
        logger.error(f"Error during transcription: {e}", exc_info=True)
        return ""
    finally:
        if os.path.exists(temp_audio_path):
            try:
                os.remove(temp_audio_path)
            except Exception as e_del:
                logger.error(f"Error deleting temp audio file {temp_audio_path}: {e_del}")
async def generate_gemini_response(text: str) -> str:
    if not gemini_model_instance:
        logger.error("Gemini model instance not available.")
        return "Sorry, the language model is currently unavailable."
    try:
        full_prompt = f"User: {text}\nAssistant:"
        loop = asyncio.get_event_loop()
        response = await loop.run_in_executor(None, gemini_model_instance.generate_content, full_prompt)
        response_text = "I'm sorry, I couldn't generate a response for that."
        if hasattr(response, 'text') and response.text:  # Simple text responses
            response_text = response.text.strip()
        elif response.parts:  # Part-based access (gemini-1.5-flash and pro)
            response_text = "".join(part.text for part in response.parts).strip()
        elif response.candidates and response.candidates[0].content.parts:  # Older access path
            response_text = response.candidates[0].content.parts[0].text.strip()
        else:
            safety_feedback = ""
            if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
                safety_feedback = f" Safety Feedback: {response.prompt_feedback}"
            elif response.candidates and hasattr(response.candidates[0], 'finish_reason') and response.candidates[0].finish_reason != "STOP":
                safety_feedback = f" Finish Reason: {response.candidates[0].finish_reason}"
            logger.warning(f"Gemini response might be empty or blocked.{safety_feedback}")
        logger.info(f"Gemini LLM Response: {response_text}")
        return response_text
    except Exception as e:
        logger.error(f"Error during Gemini LLM generation: {e}", exc_info=True)
        return "Sorry, I encountered an error trying to respond."
async def synthesize_speech_streaming(text: str, description: str = "A clear, female voice speaking in English.", play_steps_in_s: float = 0.4, cancellation_event: Optional[Event] = None):
    if not tts_model or not tts_tokenizer:
        logger.error("TTS model or tokenizer not loaded.")
        yield b""
        return
    if not text or not text.strip():
        logger.warning("TTS input text is empty. Yielding empty audio.")
        yield b""
        return
    streamer = None
    thread = None
    try:
        logger.info(f"Starting TTS streaming with ParlerTTSStreamer for: \"{text[:50]}...\"")
        # For ParlerTTS the output sampling rate lives on the model's audio encoder config.
        if hasattr(tts_model.config, 'audio_encoder') and hasattr(tts_model.config.audio_encoder, 'sampling_rate'):
            sampling_rate = tts_model.config.audio_encoder.sampling_rate
        else:
            logger.warning("Could not find tts_model.config.audio_encoder.sampling_rate, defaulting to 24000")
            sampling_rate = 24000  # A common default for ParlerTTS if not found
        try:
            frame_rate = getattr(tts_model.config.audio_encoder, 'frame_rate', 100)
        except AttributeError:
            logger.warning("frame_rate not found in tts_model.config.audio_encoder. Using default of 100 Hz for play_steps calculation.")
            frame_rate = 100
        play_steps = max(1, int(frame_rate * play_steps_in_s))
        logger.info(f"Streamer params: sampling_rate={sampling_rate}, frame_rate={frame_rate}, play_steps_in_s={play_steps_in_s}, play_steps={play_steps}")

        streamer = ParlerTTSStreamer(tts_model, device=DEVICE, play_steps=play_steps)
        description_inputs = tts_tokenizer(description, return_tensors="pt")
        prompt_inputs = tts_tokenizer(text, return_tensors="pt")

        gen_config_dict = TTS_DEFAULT_PARAMS.copy()
        # ParlerTTS's generate() takes generation parameters as individual kwargs (as in the
        # streamer examples) rather than a GenerationConfig object here. pad_token_id is set
        # if the tokenizer has one; eos_token_id is not, since ParlerTTS does not end
        # generation the way text models do.
        if tts_tokenizer.pad_token_id is not None:
            gen_config_dict["pad_token_id"] = tts_tokenizer.pad_token_id

        thread_generation_kwargs = {
            "input_ids": description_inputs.input_ids.to(DEVICE),
            "prompt_input_ids": prompt_inputs.input_ids.to(DEVICE),
            "attention_mask": description_inputs.attention_mask.to(DEVICE) if hasattr(description_inputs, 'attention_mask') else None,
            "streamer": streamer,
            **gen_config_dict,  # Spread the generation parameters
        }
        if thread_generation_kwargs["attention_mask"] is None:
            del thread_generation_kwargs["attention_mask"]

        def _generate_in_thread():
            try:
                logger.info("TTS generation thread started.")
                with torch.no_grad():
                    tts_model.generate(**thread_generation_kwargs)
                logger.info("TTS generation thread finished model.generate().")
            except Exception as e_thread:
                logger.error(f"Error in TTS generation thread: {e_thread}", exc_info=True)
            finally:
                if streamer:
                    streamer.end()
                logger.info("TTS generation thread called streamer.end().")

        thread = Thread(target=_generate_in_thread)
        thread.daemon = True
        thread.start()
        loop = asyncio.get_event_loop()
        while True:
            if cancellation_event and cancellation_event.is_set():
                logger.info("TTS streaming cancelled by event.")
                break
            try:
                # Run the blocking streamer.__next__() in an executor
                audio_chunk_tensor = await loop.run_in_executor(None, streamer.__next__)
                if audio_chunk_tensor is None:
                    logger.info("Streamer yielded None explicitly, ending stream.")
                    break
                # The streamer may yield empty tensors; skip them rather than ending the stream.
                # End-of-stream is signalled by StopIteration (or an explicit None), not by a
                # method on the streamer.
                if not isinstance(audio_chunk_tensor, torch.Tensor) or audio_chunk_tensor.numel() == 0:
                    await asyncio.sleep(0.01)  # Small sleep if empty but not done
                    continue
                audio_chunk_np = audio_chunk_tensor.cpu().to(torch.float32).numpy().squeeze()
                if audio_chunk_np.size == 0:
                    continue
                audio_chunk_int16 = np.clip(audio_chunk_np * 32767, -32768, 32767).astype(np.int16)
                yield audio_chunk_int16.tobytes()
            except StopIteration:
                logger.info("Streamer finished (StopIteration).")
                break
            except Exception as e_stream_iter:
                logger.error(f"Error iterating streamer: {e_stream_iter}", exc_info=True)
                break
logger.info(f"Finished TTS streaming iteration for: \"{text[:50]}...\"") | |
except Exception as e: | |
logger.error(f"Error in synthesize_speech_streaming function: {e}", exc_info=True) | |
yield b"" | |
finally: | |
logger.info("Exiting synthesize_speech_streaming. Ensuring streamer is ended and thread is joined.") | |
if streamer: | |
streamer.end() | |
if thread and thread.is_alive(): | |
logger.info("Waiting for TTS generation thread to complete in finally block...") | |
final_join_start_time = time.time() | |
thread.join(timeout=2.0) | |
if thread.is_alive(): | |
logger.warning(f"TTS generation thread still alive after {time.time() - final_join_start_time:.2f}s in finally block.") | |
# --- FastAPI HTTP Endpoints ---
@app.post("/speech-to-text")  # route path assumed; adjust to match your client
async def speech_to_text_endpoint(file: UploadFile = File(...)):
    if not whisper_model:
        return JSONResponse(content={"error": "Whisper model not loaded"}, status_code=503)
    try:
        audio_bytes = await file.read()
        transcribed_text = await transcribe_audio_bytes(audio_bytes)
        return {"transcription": transcribed_text}
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/llm")  # route path assumed; adjust to match your client
async def llm_endpoint(payload: dict):
    if not gemini_model_instance:
        return JSONResponse(content={"error": "Gemini LLM not configured or API key missing"}, status_code=503)
    try:
        text = payload.get("text")
        if not text:
            return JSONResponse(content={"error": "No text provided"}, status_code=400)
        response = await generate_gemini_response(text)
        return {"response": response}
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=500)
@app.post("/text-to-speech")  # route path assumed; adjust to match your client
async def text_to_speech_endpoint(payload: dict):
    if not tts_model or not tts_tokenizer:
        return JSONResponse(content={"error": "TTS model/tokenizer not loaded"}, status_code=503)
    try:
        text = payload.get("text")
        description = payload.get("description", "A clear, female voice speaking in English.")
        if not text:
            return JSONResponse(content={"error": "No text provided"}, status_code=400)
        description_inputs = tts_tokenizer(description, return_tensors="pt")
        prompt_inputs = tts_tokenizer(text, return_tensors="pt")

        # Use a GenerationConfig object for clarity and consistency
        gen_config_dict = TTS_DEFAULT_PARAMS.copy()
        if tts_tokenizer.pad_token_id is not None:
            gen_config_dict["pad_token_id"] = tts_tokenizer.pad_token_id
        # eos_token_id is intentionally not set; ParlerTTS does not use a standard text EOS.
        generation_config_obj = GenerationConfig(**gen_config_dict)

        with torch.no_grad():
            generation = tts_model.generate(
                input_ids=description_inputs.input_ids.to(DEVICE),
                prompt_input_ids=prompt_inputs.input_ids.to(DEVICE),
                attention_mask=description_inputs.attention_mask.to(DEVICE) if hasattr(description_inputs, 'attention_mask') else None,
                generation_config=generation_config_obj,  # Pass the config object
            ).cpu().to(torch.float32).numpy().squeeze()

        audio_io = io.BytesIO()
        scaled_generation = np.clip(generation * 32767, -32768, 32767).astype(np.int16)
        current_sampling_rate = tts_model.config.audio_encoder.sampling_rate if hasattr(tts_model.config, 'audio_encoder') else 24000
        sf.write(audio_io, scaled_generation, samplerate=current_sampling_rate, format='WAV', subtype='PCM_16')
        audio_io.seek(0)
        audio_bytes = audio_io.read()
        if not audio_bytes:
            return JSONResponse(content={"error": "TTS failed to generate audio"}, status_code=500)
        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
        return {"audio_base64": audio_base64, "format": "wav", "sample_rate": current_sampling_rate}
    except Exception as e:
        logger.error(f"TTS endpoint error: {e}", exc_info=True)
        return JSONResponse(content={"error": str(e)}, status_code=500)
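# Example client call for the endpoint above (the /text-to-speech path is the assumed one
# from the decorator; 'requests' is not a dependency of this server):
#
#   import base64, requests
#   r = requests.post("http://localhost:8000/text-to-speech",
#                     json={"text": "Hello!", "description": "A clear, female voice speaking in English."})
#   with open("reply.wav", "wb") as f:
#       f.write(base64.b64decode(r.json()["audio_base64"]))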
# --- WebSocket Endpoint for Real-time Conversation ---
@app.websocket("/ws/conversation")  # path matches the client JavaScript served below
async def conversation_websocket(websocket: WebSocket):
    await websocket.accept()
    logger.info(f"WebSocket connection accepted from: {websocket.client}")
    tts_cancellation_event = Event()  # For this specific connection
    try:
        while True:
            if websocket.client_state.name != 'CONNECTED':  # Client disconnected before receive
                logger.info(f"WebSocket client {websocket.client} disconnected before receive.")
                break
            audio_data = await websocket.receive_bytes()
            logger.info(f"Received {len(audio_data)} bytes of user audio data from {websocket.client}.")
            if not audio_data:
                logger.warning(f"Received empty audio data from user {websocket.client}.")
                continue

            transcribed_text = await transcribe_audio_bytes(audio_data)
            if not transcribed_text:
                logger.warning(f"Transcription failed for {websocket.client}.")
                await websocket.send_text("SYSTEM_ERROR: Transcription failed.")
                continue
            await websocket.send_text(f"USER_TRANSCRIPT: {transcribed_text}")

            llm_response_text = await generate_gemini_response(transcribed_text)
            if not llm_response_text or "Sorry, I encountered an error" in llm_response_text or "unavailable" in llm_response_text:
                logger.warning(f"LLM (Gemini) failed for {websocket.client}: {llm_response_text}")
                await websocket.send_text(f"SYSTEM_ERROR: LLM failed. ({llm_response_text})")
                continue
            await websocket.send_text(f"ASSISTANT_RESPONSE_TEXT: {llm_response_text}")

            tts_description = "A clear, female voice speaking in English."
            current_sampling_rate = tts_model.config.audio_encoder.sampling_rate if hasattr(tts_model.config, 'audio_encoder') else 24000
            audio_params_msg = f"TTS_STREAM_START:{{\"sample_rate\": {current_sampling_rate}, \"channels\": 1, \"bit_depth\": 16}}"
            await websocket.send_text(audio_params_msg)
            logger.info(f"Sent to client {websocket.client}: {audio_params_msg}")

            chunk_count = 0
            tts_cancellation_event.clear()  # Reset event for the new TTS task
            async for audio_chunk_bytes in synthesize_speech_streaming(llm_response_text, tts_description, cancellation_event=tts_cancellation_event):
                if not audio_chunk_bytes:
                    logger.debug(f"Received empty bytes from streaming generator for {websocket.client}; may be end of stream or a generator error.")
                    continue
                try:
                    if websocket.client_state.name != 'CONNECTED':
                        logger.warning(f"Client {websocket.client} disconnected during TTS stream. Aborting TTS.")
                        tts_cancellation_event.set()  # Signal TTS thread to stop
                        break
                    await websocket.send_bytes(audio_chunk_bytes)
                    chunk_count += 1
                except Exception as send_err:
                    logger.warning(f"Error sending audio chunk to {websocket.client}: {send_err}. Client likely disconnected.")
                    tts_cancellation_event.set()  # Signal TTS thread to stop
                    break

            if not tts_cancellation_event.is_set():  # Only send END if not cancelled
                logger.info(f"Sent {chunk_count} TTS audio chunks to client {websocket.client}.")
                await websocket.send_text("TTS_STREAM_END")
                logger.info(f"Sent TTS_STREAM_END to client {websocket.client}.")
            else:
                logger.info(f"TTS stream for {websocket.client} was cancelled. Sent {chunk_count} chunks before cancellation.")
    except WebSocketDisconnect:
        logger.info(f"WebSocket connection closed by client {websocket.client}.")
        tts_cancellation_event.set()  # Signal any active TTS to stop
    except Exception as e:
        logger.error(f"Error in WebSocket conversation with {websocket.client}: {e}", exc_info=True)
        tts_cancellation_event.set()  # Signal any active TTS to stop
        try:
            if websocket.client_state.name == 'CONNECTED':
                await websocket.send_text(f"SYSTEM_ERROR: An unexpected error occurred: {str(e)}")
        except Exception:
            pass
    finally:
        logger.info(f"Cleaning up WebSocket connection for {websocket.client}.")
        tts_cancellation_event.set()  # Ensure the event is set on any exit path
        if websocket.client_state.name in ('CONNECTED', 'CONNECTING'):
            try:
                await websocket.close()
            except Exception:
                pass
        logger.info(f"WebSocket connection resources cleaned up for {websocket.client}.")
# --- HTML Client Page & Main Execution ---
@app.get("/")  # assumed root route for serving the demo page below
async def get_home():
html_content = """ | |
<!DOCTYPE html> | |
<html> | |
<head> | |
<title>Conversational AI Chatbot (Streaming)</title> | |
<style> | |
body { font-family: Arial, sans-serif; margin: 20px; background-color: #f4f4f4; color: #333; } | |
#chatbox { width: 80%; max-width: 600px; margin: auto; background-color: #fff; padding: 20px; box-shadow: 0 0 10px rgba(0,0,0,0.1); border-radius: 8px; } | |
.message { padding: 10px; margin-bottom: 10px; border-radius: 5px; } | |
.user { background-color: #e1f5fe; text-align: right; } | |
.assistant { background-color: #f1f8e9; } | |
.system { background-color: #ffebee; color: #c62828; font-style: italic;} | |
#audioPlayerContainer { margin-top: 10px; } | |
#audioPlayer { display: none; width: 100%; } | |
button { padding: 10px 15px; background-color: #007bff; color: white; border: none; border-radius: 5px; cursor: pointer; margin-top:10px; } | |
button:disabled { background-color: #ccc; } | |
#status { margin-top: 10px; font-style: italic; color: #666; } | |
#transcriptionArea, #llmResponseArea { margin-top: 10px; padding: 5px; border: 1px solid #eee; background: #fafafa; word-wrap: break-word;} | |
</style> | |
</head> | |
<body> | |
<div id="chatbox"> | |
<h2>Real-time AI Chatbot (Streaming TTS)</h2> | |
<div id="messages"></div> | |
<div id="transcriptionArea"><strong>You (transcribed):</strong> <span id="userTranscript">...</span></div> | |
<div id="llmResponseArea"><strong>Assistant (text):</strong> <span id="assistantTranscript">...</span></div> | |
<button id="startRecordButton">Start Recording</button> | |
<button id="stopRecordButton" disabled>Stop Recording</button> | |
<p id="status">Status: Idle</p> | |
<div id="audioPlayerContainer"> | |
<audio id="audioPlayer" controls></audio> | |
</div> | |
</div> | |
<script> | |
const startRecordButton = document.getElementById('startRecordButton'); | |
const stopRecordButton = document.getElementById('stopRecordButton'); | |
const audioPlayer = document.getElementById('audioPlayer'); | |
const messagesDiv = document.getElementById('messages'); | |
const statusDiv = document.getElementById('status'); | |
const userTranscriptSpan = document.getElementById('userTranscript'); | |
const assistantTranscriptSpan = document.getElementById('assistantTranscript'); | |
let websocket; | |
let mediaRecorder; | |
let userAudioChunks = []; | |
let assistantAudioBufferQueue = []; | |
let audioContext; | |
let expectedSampleRate; | |
let ttsStreaming = false; | |
let audioPlaying = false; | |
let sourceNode = null; | |
function initAudioContext() { | |
if (!audioContext || audioContext.state === 'closed') { | |
try { | |
audioContext = new (window.AudioContext || window.webkitAudioContext)(); | |
console.log("AudioContext initialized or re-initialized."); | |
} catch (e) { | |
console.error("Web Audio API is not supported in this browser.", e); | |
addMessage("Error: Web Audio API not supported. Cannot play streamed audio.", "system"); | |
audioContext = null; | |
} | |
} | |
} | |
function connectWebSocket() { | |
const protocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:'; | |
const wsUrl = `${protocol}//${window.location.host}/ws/conversation`; | |
websocket = new WebSocket(wsUrl); | |
websocket.binaryType = 'arraybuffer'; | |
websocket.onopen = () => { | |
statusDiv.textContent = 'Status: Connected. Ready to record.'; | |
startRecordButton.disabled = false; | |
initAudioContext(); | |
}; | |
websocket.onmessage = (event) => { | |
if (event.data instanceof ArrayBuffer) { | |
if (ttsStreaming && audioContext && expectedSampleRate) { | |
const pcmDataInt16 = new Int16Array(event.data); | |
if (pcmDataInt16.length > 0) { | |
assistantAudioBufferQueue.push(pcmDataInt16); | |
playNextChunkFromQueue(); | |
} | |
} else { | |
console.warn("Received ArrayBuffer data but not in TTS streaming mode or AudioContext not ready."); | |
} | |
} else { | |
const messageText = event.data; | |
if (messageText.startsWith("USER_TRANSCRIPT:")) { | |
const transcript = messageText.substring("USER_TRANSCRIPT:".length).trim(); | |
userTranscriptSpan.textContent = transcript; | |
} else if (messageText.startsWith("ASSISTANT_RESPONSE_TEXT:")) { | |
const llmResponse = messageText.substring("ASSISTANT_RESPONSE_TEXT:".length).trim(); | |
assistantTranscriptSpan.textContent = llmResponse; | |
addMessage(`Assistant: ${llmResponse}`, 'assistant'); | |
} else if (messageText.startsWith("TTS_STREAM_START:")) { | |
ttsStreaming = true; | |
assistantAudioBufferQueue = []; | |
audioPlaying = false; | |
if (sourceNode) { | |
try { sourceNode.stop(); } catch(e) { console.warn("Error stopping previous sourceNode:", e); } | |
sourceNode = null; | |
} | |
audioPlayer.style.display = 'none'; | |
audioPlayer.src = ""; | |
try { | |
const paramsText = messageText.substring("TTS_STREAM_START:".length); | |
const params = JSON.parse(paramsText); | |
expectedSampleRate = params.sample_rate; | |
initAudioContext(); | |
statusDiv.textContent = 'Status: Receiving audio stream...'; | |
addMessage('Assistant (Audio stream starting...)', 'assistant'); | |
} catch (e) { | |
console.error("Could not parse TTS_STREAM_START params:", e); | |
statusDiv.textContent = 'Error: Could not parse audio stream parameters.'; | |
ttsStreaming = false; | |
} | |
} else if (messageText === "TTS_STREAM_END") { | |
ttsStreaming = false; | |
if (!audioPlaying && assistantAudioBufferQueue.length === 0) { | |
statusDiv.textContent = 'Status: Audio stream finished (or was empty).'; | |
} else if (!audioPlaying && assistantAudioBufferQueue.length > 0) { | |
playNextChunkFromQueue(); | |
statusDiv.textContent = 'Status: Audio stream finished. Playing remaining...'; | |
} else { | |
statusDiv.textContent = 'Status: Audio stream finished. Playing remaining...'; | |
} | |
addMessage('Assistant (Audio stream ended)', 'assistant'); | |
} else if (messageText.startsWith("SYSTEM_ERROR:")) { | |
const errorMsg = messageText.substring("SYSTEM_ERROR:".length).trim(); | |
addMessage(`System Error: ${errorMsg}`, 'system'); | |
statusDiv.textContent = `Error: ${errorMsg}`; | |
ttsStreaming = false; | |
assistantAudioBufferQueue = []; | |
} else { | |
addMessage(messageText, 'system'); | |
} | |
} | |
}; | |
websocket.onerror = (error) => { | |
console.error('WebSocket Error:', error); | |
statusDiv.textContent = 'Status: WebSocket error. Try reconnecting.'; | |
addMessage('WebSocket Error. Check console.', 'system'); | |
ttsStreaming = false; | |
}; | |
websocket.onclose = () => { | |
statusDiv.textContent = 'Status: Disconnected. Please refresh to reconnect.'; | |
startRecordButton.disabled = true; | |
stopRecordButton.disabled = true; | |
addMessage('Disconnected from server.', 'system'); | |
ttsStreaming = false; | |
if (audioContext && audioContext.state !== 'closed') { | |
audioContext.close().catch(e => console.warn("Error closing AudioContext:", e)); | |
audioContext = null; | |
console.log("AudioContext closed."); | |
} | |
}; | |
} | |
function playNextChunkFromQueue() { | |
if (audioPlaying || assistantAudioBufferQueue.length === 0 || !audioContext || audioContext.state !== 'running' || !expectedSampleRate) { | |
if (assistantAudioBufferQueue.length === 0 && !ttsStreaming && !audioPlaying) { | |
console.log("Queue empty, not streaming, not playing: Playback complete."); | |
statusDiv.textContent = 'Status: Audio playback complete.'; | |
} | |
return; | |
} | |
audioPlaying = true; | |
const pcmDataInt16 = assistantAudioBufferQueue.shift(); | |
const float32Pcm = new Float32Array(pcmDataInt16.length); | |
for (let i = 0; i < pcmDataInt16.length; i++) { | |
float32Pcm[i] = pcmDataInt16[i] / 32768.0; | |
} | |
const audioBuffer = audioContext.createBuffer(1, float32Pcm.length, expectedSampleRate); | |
audioBuffer.getChannelData(0).set(float32Pcm); | |
sourceNode = audioContext.createBufferSource(); | |
sourceNode.buffer = audioBuffer; | |
sourceNode.connect(audioContext.destination); | |
sourceNode.onended = () => { | |
audioPlaying = false; | |
if (ttsStreaming || assistantAudioBufferQueue.length > 0) { | |
playNextChunkFromQueue(); | |
} else { | |
statusDiv.textContent = 'Status: Audio playback finished.'; | |
console.log("All queued audio chunks played."); | |
} | |
}; | |
sourceNode.start(); | |
statusDiv.textContent = 'Status: Playing audio chunk...'; | |
} | |
function addMessage(text, type) { | |
const messageElement = document.createElement('div'); | |
messageElement.classList.add('message', type); | |
messageElement.textContent = text; | |
messagesDiv.appendChild(messageElement); | |
messagesDiv.scrollTop = messagesDiv.scrollHeight; | |
} | |
startRecordButton.onclick = async () => { | |
if (!websocket || websocket.readyState !== WebSocket.OPEN) { | |
alert("WebSocket is not connected. Please wait or refresh."); | |
return; | |
} | |
if (audioContext && audioContext.state === 'suspended') { | |
audioContext.resume().catch(e => console.error("Error resuming AudioContext:", e)); | |
} | |
initAudioContext(); | |
try { | |
const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); | |
let options = { mimeType: 'audio/webm;codecs=opus' }; | |
if (!MediaRecorder.isTypeSupported(options.mimeType)) { | |
console.warn(`${options.mimeType} is not supported, trying default.`); | |
options = {}; | |
} | |
mediaRecorder = new MediaRecorder(stream, options); | |
userAudioChunks = []; | |
mediaRecorder.ondataavailable = event => { | |
if (event.data.size > 0) userAudioChunks.push(event.data); | |
}; | |
mediaRecorder.onstop = () => { | |
if (userAudioChunks.length === 0) { | |
console.log("No audio data recorded."); | |
statusDiv.textContent = 'Status: No audio data recorded. Try again.'; | |
startRecordButton.disabled = false; | |
stopRecordButton.disabled = true; | |
return; | |
} | |
const audioBlob = new Blob(userAudioChunks, { type: mediaRecorder.mimeType }); | |
if (websocket && websocket.readyState === WebSocket.OPEN) { | |
websocket.send(audioBlob); | |
statusDiv.textContent = 'Status: Audio sent. Waiting for response...'; | |
} else { | |
statusDiv.textContent = 'Status: WebSocket not open. Cannot send audio.'; | |
} | |
userAudioChunks = []; | |
}; | |
mediaRecorder.start(250); | |
startRecordButton.disabled = true; | |
stopRecordButton.disabled = false; | |
statusDiv.textContent = 'Status: Recording...'; | |
userTranscriptSpan.textContent = "..."; | |
assistantTranscriptSpan.textContent = "..."; | |
audioPlayer.style.display = 'none'; | |
audioPlayer.src = ''; | |
assistantAudioBufferQueue = []; | |
if (sourceNode) { try {sourceNode.stop();} catch(e){} sourceNode = null; } | |
} catch (err) { | |
console.error('Error accessing microphone:', err); | |
statusDiv.textContent = 'Status: Error accessing microphone.'; | |
alert('Could not access microphone: ' + err.message); | |
} | |
}; | |
stopRecordButton.onclick = () => { | |
if (mediaRecorder && mediaRecorder.state === "recording") { | |
mediaRecorder.stop(); | |
startRecordButton.disabled = false; | |
stopRecordButton.disabled = true; | |
} | |
}; | |
connectWebSocket(); | |
</script> | |
</body> | |
</html> | |
""" | |
return HTMLResponse(content=html_content) | |
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")
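# Run directly (python main.py) or, equivalently, with the uvicorn CLI:
#   uvicorn main:app --host 0.0.0.0 --port 8000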