# model.py -- Realtime Video + Audio + Subtitles + Emotion Fusion
import os
import time
import threading
import wave
from pathlib import Path
from typing import List, Tuple, Dict

import av
import cv2
import numpy as np
import streamlit as st
import torch

from transformers import BertTokenizer, BertForSequenceClassification

# Custom modules (ensure they exist)
from face_model import FacialEmotionDetector
from voice_det import Voice_Analysis

from streamlit_webrtc import webrtc_streamer, WebRtcMode, VideoTransformerBase, AudioProcessorBase

# ------------------------- Config -------------------------
# Detection cadence: the video processor only runs the detector when its frame
# counter is a multiple of this value.
FRAME_DETECT_EVERY_N = 4   # run YOLO every Nth frame (adjust for CPU)
# Default sample rate (Hz) assumed by AudioRecorder for recorded audio.
AUDIO_SAMPLE_RATE = 48000
# Default output file for recorded audio; note that this is a relative path.
TEMP_AUDIO_PATH = "temp_recordings/live.wav"
# Detector weights are expected to sit next to this source file.
BEST_PT = Path(__file__).parent / "best.pt"

# Page setup; these are the first Streamlit calls the script makes.
st.set_page_config(page_title="AutVid AI — Realtime", layout="wide")
st.title("🧠 AutVid AI — Real-time Video + Audio Emotion")

# ------------------------- Cached model loaders -------------------------
@st.cache_resource
def load_face_model_main():
    """Build the facial-emotion detector from the weights file at ``BEST_PT``.

    Returns:
        A ``FacialEmotionDetector`` instance, or ``None`` when the weights file
        is missing or construction fails. Either failure is reported in the
        Streamlit UI rather than raised.
    """
    detector = None
    if BEST_PT.exists():
        try:
            candidate = FacialEmotionDetector(model_path=str(BEST_PT))
            st.info("FacialEmotionDetector loaded.")
            # Only publish the instance once the whole happy path succeeded.
            detector = candidate
        except Exception as err:
            st.error(f"Failed to load FacialEmotionDetector: {err}")
    else:
        st.warning(f"YOLO weights not found at {BEST_PT.resolve()}. Video detection will show placeholder.")
    return detector

@st.cache_resource
def load_voice_model():
    """Construct the ``Voice_Analysis`` helper.

    Returns:
        The ``Voice_Analysis`` instance, or ``None`` if construction raised
        (the error is shown in the Streamlit UI instead of propagating).
    """
    analyzer = None
    try:
        candidate = Voice_Analysis()
        st.info("Voice_Analysis loaded.")
        analyzer = candidate
    except Exception as err:
        st.error(f"Failed to load Voice_Analysis: {err}")
    return analyzer

@st.cache_resource
def load_text_model():
    """Load the BERT emotion classifier together with its tokenizer.

    Returns:
        ``(tokenizer, model, device, labels)`` on success, where ``labels`` is
        the list of label names ordered by class index. On any failure the
        error is shown in the UI and ``(None, None, None, [])`` is returned.
    """
    checkpoint = "bhadresh-savani/bert-base-go-emotion"
    try:
        tok = BertTokenizer.from_pretrained(checkpoint)
        net = BertForSequenceClassification.from_pretrained(checkpoint)
        net.eval()

        # Prefer the GPU when one is visible to torch.
        target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        net.to(target)

        if hasattr(net.config, "id2label"):
            index_to_name = net.config.id2label
        else:
            # Fall back to the stringified class index as the label name.
            index_to_name = {idx: str(idx) for idx in range(net.config.num_labels)}
        names = [index_to_name[idx] for idx in range(len(index_to_name))]

        st.info("Text model loaded.")
    except Exception as err:
        st.error(f"Failed to load text model: {err}")
        return None, None, None, []
    return tok, net, target, names

# Shared model handles, built through the cached loaders above.
# Each loader reports failure through the UI and returns None (or an empty
# label list) instead of raising, so every name below may be None.
face_model_main = load_face_model_main()
voice_model = load_voice_model()
tokenizer, text_model, device, label_list = load_text_model()

# ------------------------- Text analysis -------------------------
def analyze_text_multilabel(text: str, threshold: float = 0.3) -> Tuple[List[str], Dict[str, float]]:
    """Score ``text`` against every emotion label and pick the confident ones.

    Args:
        text: Input sentence(s); blank or whitespace-only input is skipped.
        threshold: Minimum sigmoid score for a label to be selected.

    Returns:
        ``(selected, scores)`` where ``scores`` maps each label to its sigmoid
        score and ``selected`` lists the labels at or above ``threshold``. If
        no label reaches the threshold, the single best label is selected.
        ``([], {})`` is returned for blank text or when no model is loaded.
    """
    if not text.strip():
        return [], {}
    if text_model is None:
        return [], {}

    batch = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
    with torch.no_grad():
        output = text_model(**batch)
        # Independent per-label scores (sigmoid, not softmax).
        probabilities = torch.sigmoid(output.logits)[0].cpu().numpy()

    scores = {}
    for idx, name in enumerate(label_list):
        scores[name] = float(probabilities[idx])

    selected = [name for name in scores if scores[name] >= threshold]
    if not selected:
        # Nothing cleared the bar: fall back to the top-scoring label.
        selected = [max(scores, key=scores.get)]
    return selected, scores

# ------------------------- WebRTC processors -------------------------
class AudioRecorder(AudioProcessorBase):
    """Buffers incoming WebRTC audio as mono int16 PCM and writes it to WAV.

    Frames are appended from the streamer's processing thread while the
    Streamlit script thread calls ``save_wav`` / ``clear``, so every access to
    ``frames`` and ``sample_rate`` is guarded by ``lock``.

    streamlit-webrtc delivers audio through ``recv`` (sync mode) and
    ``recv_queued`` (async mode); both are routed to ``recv_audio`` so that no
    frame is skipped. ``recv_audio`` itself is kept for existing callers.
    """

    def __init__(self):
        self.frames = []                       # list of 1-D int16 arrays
        self.lock = threading.Lock()
        self.sample_rate = AUDIO_SAMPLE_RATE   # replaced by the frames' own rate once audio arrives

    @staticmethod
    def _to_mono_int16(frame: av.AudioFrame) -> np.ndarray:
        """Convert one audio frame to a 1-D mono int16 array."""
        arr = frame.to_ndarray()
        # Float PCM is conventionally in [-1, 1]; remember this before averaging
        # turns integer data into floats as well.
        is_float_pcm = np.issubdtype(arr.dtype, np.floating)
        if arr.ndim == 2:
            layout = getattr(frame, "layout", None)
            fmt = getattr(frame, "format", None)
            n_channels = len(layout.channels) if layout is not None else arr.shape[0]
            is_planar = bool(fmt.is_planar) if fmt is not None else True
            if not is_planar and n_channels > 1 and arr.size % n_channels == 0:
                # Packed formats come back as (1, samples * channels) with the
                # channels interleaved; de-interleave to (channels, samples).
                arr = arr.reshape(-1, n_channels).T
            arr = arr.mean(axis=0)
        if is_float_pcm:
            arr = np.clip(arr, -1.0, 1.0) * 32767.0
        return arr.astype(np.int16)

    def recv_audio(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Append one frame to the buffer and return it unchanged."""
        mono = self._to_mono_int16(frame)
        rate = getattr(frame, "sample_rate", None)
        with self.lock:
            if rate:
                # Trust the stream over the configured default so the WAV
                # header matches the data actually captured.
                self.sample_rate = int(rate)
            self.frames.append(mono)
        return frame

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        """Per-frame hook used by streamlit-webrtc in synchronous mode."""
        return self.recv_audio(frame)

    async def recv_queued(self, frames: List[av.AudioFrame]) -> List[av.AudioFrame]:
        """Async-mode hook: record every queued frame, not just the latest."""
        return [self.recv_audio(f) for f in frames]

    def save_wav(self, filename: str = TEMP_AUDIO_PATH) -> str:
        """Write the buffered audio to ``filename`` as 16-bit mono WAV.

        Args:
            filename: Destination path; parent directories are created.

        Returns:
            ``filename``.

        Raises:
            ValueError: If no audio has been captured yet.
        """
        parent = os.path.dirname(filename)
        if parent:
            # dirname is "" for a bare file name, which os.makedirs rejects.
            os.makedirs(parent, exist_ok=True)
        with self.lock:
            if not self.frames:
                raise ValueError("No audio captured")
            audio = np.concatenate(self.frames, axis=0).astype(np.int16)
            rate = self.sample_rate
        # File I/O happens outside the lock so capture is not stalled.
        with wave.open(filename, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(rate)
            wf.writeframes(audio.tobytes())
        return filename

    def clear(self):
        """Drop all buffered audio."""
        with self.lock:
            self.frames = []

class VideoProcessor(VideoTransformerBase):
    """Annotates camera frames with detector output, transcript and emotion.

    Frames are processed on the streamer's worker thread, while the Streamlit
    script thread polls ``get_last()``; shared results are guarded by ``lock``.

    The script thread may assign ``transcript_overlay`` (a plain string) to
    control the subtitle text drawn on the frames. When it is empty, the
    processor falls back to ``st.session_state`` on a best-effort basis.
    """

    def __init__(self):
        try:
            self.detector = FacialEmotionDetector(model_path=str(BEST_PT)) if BEST_PT.exists() else None
        except Exception as e:
            # Keep streaming un-annotated video rather than failing the track.
            print("FacialEmotionDetector init error:", e)
            self.detector = None
        self.lock = threading.Lock()
        self.counter = 0
        self.last_annotated = None
        self.last_emotion = None
        self.transcript_overlay = ""

    def _overlay_text(self) -> str:
        """Return the subtitle text to draw (never ``None``)."""
        text = self.transcript_overlay
        if not text:
            try:
                # NOTE(review): session state is tied to the script thread, so
                # this lookup is expected to yield nothing useful on the worker
                # thread; it is kept only as a fallback.
                text = st.session_state.get("transcript_overlay", "")
            except Exception:
                text = ""
        return text if isinstance(text, str) else ""

    def _annotate(self, frame: av.VideoFrame) -> np.ndarray:
        """Return the annotated BGR image for ``frame`` and update shared state."""
        img = frame.to_ndarray(format="bgr24")
        annotated = img.copy()
        emo = None
        detection_ran = False
        self.counter += 1
        try:
            if self.detector and self.counter % FRAME_DETECT_EVERY_N == 0:
                ann, emo = self.detector.detect_emotion(img)
                detection_ran = True
                if ann is not None:
                    annotated = ann
        except Exception as e:
            print("Frame detection error:", e)

        with self.lock:
            # Only a frame on which detection actually ran may change the
            # remembered emotion; skipped frames keep the previous result
            # instead of resetting it to None.
            if detection_ran:
                self.last_emotion = emo
            shown_emotion = self.last_emotion

        # Overlay transcript (last three lines)
        y0 = 30
        for i, line in enumerate(self._overlay_text().split("\n")[-3:]):
            y = y0 + i*25
            cv2.putText(annotated, line, (10, y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255,255,255), 2)

        # Overlay the most recent detection result
        if shown_emotion:
            cv2.putText(annotated, f"Emotion: {shown_emotion}", (10, y0 + 100), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)

        with self.lock:
            self.last_annotated = annotated.copy()

        return annotated

    def recv(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Frame hook used by streamlit-webrtc; returns the annotated frame.

        Overridden directly because the base class's default ``recv`` converts
        the result of ``transform`` with ``VideoFrame.from_ndarray``, which
        does not accept the ``VideoFrame`` that ``transform`` returns here.
        """
        return av.VideoFrame.from_ndarray(self._annotate(frame), format="bgr24")

    def transform(self, frame: av.VideoFrame) -> av.VideoFrame:
        """Legacy entry point; same result as ``recv``."""
        return self.recv(frame)

    def get_last(self):
        """Return ``(last_annotated_bgr_image, last_emotion)``; either may be None."""
        with self.lock:
            return self.last_annotated, self.last_emotion

# ------------------------- Session state -------------------------
# Seed each session key once; values already present survive reruns.
_session_defaults = {
    "video_emotion": None,
    "voice_emotion": None,
    "transcript": "",
    "transcript_overlay": "",
    "text_emotions": [],
}
for k, v in _session_defaults.items():
    if k in st.session_state:
        continue
    st.session_state[k] = v

# ------------------------- UI / Streamer -------------------------
st.sidebar.markdown("## Controls")
# Rebinds the module-level constant to the slider value (range 1-12, step 1,
# initial value = the constant defined in the config section).
# NOTE(review): a video processor created during an earlier script run may not
# observe this rebinding -- confirm how the value reaches the worker.
FRAME_DETECT_EVERY_N = st.sidebar.slider("Run YOLO every N frames", 1, 12, FRAME_DETECT_EVERY_N, 1)
# Both values are consumed by the automatic audio analysis further down.
auto_analyze = st.sidebar.checkbox("Auto analyze audio every interval", value=False)
auto_interval = st.sidebar.slider("Auto analyze interval (s)", 5, 30, 12, 1)

# Two-column layout: the main column is twice as wide as the side column.
col_main, col_side = st.columns([2, 1])

with col_main:
    st.subheader("Live camera (annotated)")
    # Start the send/receive WebRTC component; `ctx` is the handle the rest of
    # the script uses to reach the video and audio processors.
    ctx = webrtc_streamer(
        key="live-av",
        mode=WebRtcMode.SENDRECV,
        video_transformer_factory=VideoProcessor,
        audio_processor_factory=AudioRecorder,
        media_stream_constraints={"video": True, "audio": True},
        async_processing=True,
    )

    st.markdown("---")
    st.write("Live preview from worker:")
    # Snapshot of the most recent annotated frame; rendered once per script
    # run, so it only changes when the script reruns.
    if ctx and ctx.video_transformer:
        annotated_frame, last_emo = ctx.video_transformer.get_last()
        if annotated_frame is not None:
            # The processor stores BGR images; st.image is given RGB.
            st.image(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB), caption=f"Emotion: {last_emo}")

with col_side:
    st.subheader("Live outputs")
    # Reserve the slot for the metrics/transcript now and fill it at the end of
    # this block, so the values shown reflect whatever the buttons below and
    # the video worker have just produced (rather than lagging one rerun).
    outputs_slot = st.container()

    # In SENDRECV mode the recorder instance is exposed as
    # `ctx.audio_processor`; the legacy lookup is kept only as a fallback.
    audio_proc = None
    if ctx:
        audio_proc = getattr(ctx, "audio_processor", None)
        if audio_proc is None:
            audio_proc = getattr(getattr(ctx, "audio_receiver", None), "_processor", None)

    if st.button("Clear audio buffer"):
        if audio_proc is None:
            st.warning("No active audio stream.")
        else:
            try:
                audio_proc.clear()
                st.success("Cleared audio buffer.")
            except Exception as e:
                st.error(f"Clear failed: {e}")

    if st.button("Save & Analyze now"):
        if audio_proc is None:
            st.warning("No active audio stream.")
        else:
            try:
                wav = audio_proc.save_wav(TEMP_AUDIO_PATH)
                audio_proc.clear()
                st.audio(wav)
                if voice_model:
                    res = voice_model.detect(wav)
                    if res:
                        # Keep the label of the highest-scoring prediction.
                        st.session_state.voice_emotion = max(res, key=lambda r: r["score"])["label"]
                    # Guard against a None transcript breaking later string use.
                    st.session_state.transcript = voice_model.subtitles(wav) or ""
                    st.session_state.transcript_overlay = st.session_state.transcript
                st.success("Saved and analyzed audio.")
            except Exception as e:
                st.error(f"Save/analyze failed: {e}")

    # Exchange state with the video worker: pull its latest emotion and hand it
    # the subtitle text (the worker thread cannot rely on session state).
    if ctx and ctx.video_transformer:
        _, last_vid_emo = ctx.video_transformer.get_last()
        if last_vid_emo:
            st.session_state.video_emotion = last_vid_emo
        ctx.video_transformer.transcript_overlay = st.session_state.get("transcript_overlay", "") or ""

    with outputs_slot:
        st.metric("Video emotion", st.session_state.get("video_emotion") or "N/A")
        st.metric("Voice emotion", st.session_state.get("voice_emotion") or "N/A")
        st.text_area("Transcript", value=st.session_state.get("transcript", ""), height=160)

# Auto audio analyze loop
def auto_audio_loop(stop_event=None, processor=None, interval=None, results=None, max_idle_intervals=5):
    """Periodically save the buffered audio and analyse it until told to stop.

    Args:
        stop_event: ``threading.Event`` that ends the loop when set. A private
            event is created when omitted.
        processor: Recorder exposing ``save_wav(path)`` and ``clear()``.
            Defaults to the current stream's audio processor.
        interval: Seconds to wait between analyses. Defaults to the sidebar
            setting ``auto_interval``.
        results: Mapping that receives ``voice_emotion``, ``transcript`` and
            ``transcript_overlay``. Defaults to ``st.session_state``; pass a
            plain dict when running on a background thread.
        max_idle_intervals: Number of consecutive intervals without any
            captured audio after which the loop exits (the stream is assumed
            to be gone).
    """
    if stop_event is None:
        stop_event = threading.Event()
    if processor is None and ctx:
        processor = getattr(ctx, "audio_processor", None)
    if processor is None:
        return
    wait_s = auto_interval if interval is None else interval
    sink = st.session_state if results is None else results
    wav_path = TEMP_AUDIO_PATH.replace(".wav","_auto.wav")

    idle = 0
    # Event.wait doubles as an interruptible sleep: it returns True (ending the
    # loop) as soon as the event is set.
    while not stop_event.wait(wait_s):
        try:
            wav = processor.save_wav(wav_path)
        except ValueError:
            # Nothing captured during this interval.
            idle += 1
            if idle >= max_idle_intervals:
                return
            continue
        except Exception as e:
            print("Auto audio save error:", e)
            continue
        idle = 0
        processor.clear()
        if not voice_model:
            continue
        try:
            res = voice_model.detect(wav)
            if res:
                sink["voice_emotion"] = max(res, key=lambda r: r["score"])["label"]
            txt = voice_model.subtitles(wav) or ""
            sink["transcript"] = txt
            sink["transcript_overlay"] = txt
        except Exception as e:
            print("Auto audio analysis error:", e)


# Keep at most one worker per browser session. The script reruns on every
# interaction, so the running worker is remembered in session state and only
# replaced when its settings changed or it has exited.
_auto_proc = getattr(ctx, "audio_processor", None) if ctx else None
_auto_job = st.session_state.get("_auto_audio_job")

if _auto_job is not None:
    # Publish what the worker produced since the last rerun. The worker writes
    # into a plain dict so it never touches session state from its own thread;
    # the values therefore become visible on the next script run.
    _pending = _auto_job["results"]
    for _key in list(_pending):
        st.session_state[_key] = _pending.pop(_key)

_auto_wanted = bool(auto_analyze) and _auto_proc is not None
_auto_keep = (
    _auto_wanted
    and _auto_job is not None
    and _auto_job["thread"].is_alive()
    and _auto_job["processor"] is _auto_proc
    and _auto_job["interval"] == auto_interval
)

if _auto_job is not None and not _auto_keep:
    _auto_job["stop"].set()
    st.session_state["_auto_audio_job"] = None

if _auto_wanted and not _auto_keep:
    _auto_job = {
        "stop": threading.Event(),
        "processor": _auto_proc,
        "interval": auto_interval,
        "results": {},
    }
    _auto_job["thread"] = threading.Thread(
        target=auto_audio_loop,
        kwargs={
            "stop_event": _auto_job["stop"],
            "processor": _auto_proc,
            "interval": auto_interval,
            "results": _auto_job["results"],
        },
        daemon=True,
    )
    st.session_state["_auto_audio_job"] = _auto_job
    _auto_job["thread"].start()

# ---- Text analysis UI ----
st.markdown("---")
st.subheader("Text Emotion (BERT multi-label)")
# Pre-filled with the latest transcript so recognised speech can be analysed.
text_in = st.text_area("Enter text to analyze", value=st.session_state.get("transcript", ""), height=140)
thresh = st.slider("Confidence threshold", 0.1, 0.9, 0.3, 0.05)
if st.button("Analyze text"):
    chosen, scores = analyze_text_multilabel(text_in, threshold=thresh)
    # Remember the selected labels so the fusion step can use them.
    st.session_state.text_emotions = chosen
    if scores:
        # Show every label's score, highest first, rounded to 4 decimals.
        st.json({k: round(v,4) for k,v in sorted(scores.items(), key=lambda x: x[1], reverse=True)})
    if chosen:
        st.success(f"Predicted (≥{thresh:.2f}): {', '.join(chosen)}")

# ---- Multimodal Fusion ----
st.markdown("---")
st.subheader("Multimodal Fusion")
# Shows the per-modality weights; the same numbers are hard-coded in fuse().
st.write("Video 0.5, Voice 0.3, Text 0.2")

def fuse(video_emotion, voice_emotion, text_emotions):
    """Combine the per-modality emotions into one weighted score table.

    Video contributes 0.5 and voice 0.3 to their respective label; the text
    weight of 0.2 is split evenly across all text labels. Falsy inputs are
    skipped, and labels shared between modalities accumulate.

    Args:
        video_emotion: Label from the video detector, or a falsy value.
        voice_emotion: Label from the voice model, or a falsy value.
        text_emotions: Sequence of labels from the text model (may be empty
            or ``None``).

    Returns:
        Dict mapping each label to its accumulated weight; empty when no
        modality supplied a label.
    """
    weights = {"video":0.5, "voice":0.3, "text":0.2}
    totals = {}

    def credit(label, amount):
        # Accumulate so that a label seen in several modalities adds up.
        totals[label] = totals.get(label, 0) + amount

    if video_emotion:
        credit(video_emotion, weights["video"])
    if voice_emotion:
        credit(voice_emotion, weights["voice"])
    if not text_emotions:
        return totals

    per_label = weights["text"] / max(1, len(text_emotions))
    for label in text_emotions:
        credit(label, per_label)
    return totals

# On demand, fuse whatever modality results are currently stored in session
# state and show the winning label together with the full score table.
if st.button("Fuse now"):
    breakdown = fuse(st.session_state.get("video_emotion"), st.session_state.get("voice_emotion"), st.session_state.get("text_emotions", []))
    if breakdown:
        # The dominant emotion is the label with the largest accumulated weight.
        dom = max(breakdown, key=breakdown.get)
        st.success(f"Dominant emotion: {dom}")
        st.json({k: round(v,3) for k,v in breakdown.items()})
    else:
        # fuse() returns an empty dict when no modality has produced a label.
        st.warning("No modalities available yet.")