# model.py -- Realtime Video + Audio + Subtitles + Emotion Fusion
import os
import time
import threading
import wave
from pathlib import Path
from typing import List, Tuple, Dict

import av
import cv2
import numpy as np
import streamlit as st
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Custom modules (ensure they exist)
from face_model import FacialEmotionDetector
from voice_det import Voice_Analysis

from streamlit_webrtc import webrtc_streamer, WebRtcMode, VideoTransformerBase, AudioProcessorBase

# ------------------------- Config -------------------------
FRAME_DETECT_EVERY_N = 4  # run YOLO every Nth frame (adjust for CPU)
AUDIO_SAMPLE_RATE = 48000
TEMP_AUDIO_PATH = "temp_recordings/live.wav"
BEST_PT = Path(__file__).parent / "best.pt"
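# NOTE: best.pt is assumed to hold the YOLO face-emotion weights that
# FacialEmotionDetector expects; keep it next to this script, otherwise the
# loader below warns and video detection falls back to a placeholder.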

st.set_page_config(page_title="AutVid AI - Realtime", layout="wide")
st.title("🧠 AutVid AI - Real-time Video + Audio Emotion")

# ------------------------- Cached model loaders -------------------------
@st.cache_resource
def load_face_model_main():
    if not BEST_PT.exists():
        st.warning(f"YOLO weights not found at {BEST_PT.resolve()}. Video detection will show placeholder.")
        return None
    try:
        det = FacialEmotionDetector(model_path=str(BEST_PT))
        st.info("FacialEmotionDetector loaded.")
        return det
    except Exception as e:
        st.error(f"Failed to load FacialEmotionDetector: {e}")
        return None

@st.cache_resource
def load_voice_model():
    try:
        vm = Voice_Analysis()
        st.info("Voice_Analysis loaded.")
        return vm
    except Exception as e:
        st.error(f"Failed to load Voice_Analysis: {e}")
        return None

@st.cache_resource
def load_text_model():
    try:
        model_name = "bhadresh-savani/bert-base-go-emotion"
        tok = BertTokenizer.from_pretrained(model_name)
        mdl = BertForSequenceClassification.from_pretrained(model_name)
        mdl.eval()
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        mdl.to(device)
        id2label = mdl.config.id2label if hasattr(mdl.config, "id2label") else {i: str(i) for i in range(mdl.config.num_labels)}
        label_list = [id2label[i] for i in range(len(id2label))]
        st.info("Text model loaded.")
        return tok, mdl, device, label_list
    except Exception as e:
        st.error(f"Failed to load text model: {e}")
        return None, None, None, []

face_model_main = load_face_model_main()
voice_model = load_voice_model()
tokenizer, text_model, device, label_list = load_text_model()
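
# Note: the go-emotion checkpoint is a multi-label GoEmotions classifier, so the
# text-analysis helper below applies a per-label sigmoid rather than a softmax.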

# ------------------------- Text analysis -------------------------
def analyze_text_multilabel(text: str, threshold: float = 0.3) -> Tuple[List[str], Dict[str, float]]:
    if not text.strip() or text_model is None:
        return [], {}
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
    with torch.no_grad():
        logits = text_model(**enc).logits
    probs = torch.sigmoid(logits)[0].cpu().numpy()
    scores = {label_list[i]: float(probs[i]) for i in range(len(label_list))}
    chosen = [lbl for lbl, p in scores.items() if p >= threshold]
    if not chosen:
        chosen = [max(scores, key=scores.get)]
    return chosen, scores
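
# Illustrative call (label names and scores are hypothetical; the real ones come
# from the model's id2label mapping):
#   analyze_text_multilabel("I am so happy to see you!", threshold=0.3)
#   -> (["joy"], {"joy": 0.87, "admiration": 0.12, ...})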

# ------------------------- WebRTC processors -------------------------
class AudioRecorder(AudioProcessorBase):
    def __init__(self):
        self.frames = []
        self.lock = threading.Lock()
        self.sample_rate = AUDIO_SAMPLE_RATE

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # streamlit_webrtc calls recv() for every incoming audio frame.
        arr = frame.to_ndarray()
        # Downmix multi-channel audio to mono int16 before buffering.
        mono = np.mean(arr, axis=0).astype(np.int16) if arr.ndim == 2 else arr.astype(np.int16)
        with self.lock:
            self.frames.append(mono)
        return frame

    def save_wav(self, filename: str = TEMP_AUDIO_PATH) -> str:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with self.lock:
            if not self.frames:
                raise ValueError("No audio captured")
            audio = np.concatenate(self.frames, axis=0).astype(np.int16)
        with wave.open(filename, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self.sample_rate)
            wf.writeframes(audio.tobytes())
        return filename

    def clear(self):
        with self.lock:
            self.frames = []
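
# save_wav() produces a single mono 16-bit PCM WAV at AUDIO_SAMPLE_RATE;
# Voice_Analysis.detect/subtitles are assumed to accept a plain WAV file path.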

class VideoProcessor(VideoTransformerBase):
    def __init__(self):
        try:
            self.detector = FacialEmotionDetector(model_path=str(BEST_PT)) if BEST_PT.exists() else None
        except Exception:
            self.detector = None
        self.lock = threading.Lock()
        self.counter = 0
        self.last_annotated = None
        self.last_emotion = None
        # Updated from the main script; st.session_state is not readable here.
        self.transcript_overlay = ""

    def transform(self, frame: av.VideoFrame) -> np.ndarray:
        img = frame.to_ndarray(format="bgr24")
        annotated = img.copy()
        emo = None
        self.counter += 1
        try:
            if self.counter % FRAME_DETECT_EVERY_N == 0 and self.detector:
                ann, emo = self.detector.detect_emotion(img)
                if ann is not None:
                    annotated = ann
        except Exception as e:
            print("Frame detection error:", e)
        # Overlay transcript (pushed in by the main script thread)
        transcript = self.transcript_overlay
        y0 = 30
        for i, line in enumerate(transcript.split("\n")[-3:]):
            y = y0 + i * 25
            cv2.putText(annotated, line, (10, y), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
        # Overlay last emotion
        if emo:
            cv2.putText(annotated, f"Emotion: {emo}", (10, y0 + 100), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
        with self.lock:
            self.last_annotated = annotated.copy()
            if emo:
                # keep the most recent non-empty detection
                self.last_emotion = emo
        # The transform() API expects an ndarray return value.
        return annotated

    def get_last(self):
        with self.lock:
            return self.last_annotated, self.last_emotion
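
# transform() runs on the WebRTC worker thread while the Streamlit script polls
# get_last() from the main thread, hence the lock around the shared frame/emotion.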

# ------------------------- Session state -------------------------
for k, v in {
    "video_emotion": None,
    "voice_emotion": None,
    "transcript": "",
    "transcript_overlay": "",
    "text_emotions": [],
}.items():
    if k not in st.session_state:
        st.session_state[k] = v

# ------------------------- UI / Streamer -------------------------
st.sidebar.markdown("## Controls")
FRAME_DETECT_EVERY_N = st.sidebar.slider("Run YOLO every N frames", 1, 12, FRAME_DETECT_EVERY_N, 1)
auto_analyze = st.sidebar.checkbox("Auto analyze audio every interval", value=False)
auto_interval = st.sidebar.slider("Auto analyze interval (s)", 5, 30, 12, 1)

col_main, col_side = st.columns([2, 1])

with col_main:
    st.subheader("Live camera (annotated)")
    ctx = webrtc_streamer(
        key="live-av",
        mode=WebRtcMode.SENDRECV,
        video_transformer_factory=VideoProcessor,
        audio_processor_factory=AudioRecorder,
        media_stream_constraints={"video": True, "audio": True},
        async_processing=True,
    )
    st.markdown("---")
    st.write("Live preview from worker:")
    if ctx and ctx.video_transformer:
        annotated_frame, last_emo = ctx.video_transformer.get_last()
        if annotated_frame is not None:
            st.image(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB), caption=f"Emotion: {last_emo}")

with col_side:
    st.subheader("Live outputs")
    st.metric("Video emotion", st.session_state.get("video_emotion") or "N/A")
    st.metric("Voice emotion", st.session_state.get("voice_emotion") or "N/A")
    st.text_area("Transcript", value=st.session_state.get("transcript", ""), height=160)

    if st.button("Clear audio buffer") and ctx and ctx.audio_processor:
        try:
            ctx.audio_processor.clear()
            st.success("Cleared audio buffer.")
        except Exception as e:
            st.error(f"Clear failed: {e}")

    if st.button("Save & Analyze now") and ctx and ctx.audio_processor:
        proc = ctx.audio_processor
        try:
            wav = proc.save_wav(TEMP_AUDIO_PATH)
            proc.clear()
            st.audio(wav)
            if voice_model:
                res = voice_model.detect(wav)
                if res:
                    st.session_state.voice_emotion = max(res, key=lambda r: r["score"])["label"]
                st.session_state.transcript = voice_model.subtitles(wav)
                st.session_state.transcript_overlay = st.session_state.transcript
            st.success("Saved and analyzed audio.")
        except Exception as e:
            st.error(f"Save/analyze failed: {e}")

# Sync with the video worker: pull the latest emotion and push the transcript
# overlay (the worker cannot read st.session_state itself).
if ctx and ctx.video_transformer:
    ctx.video_transformer.transcript_overlay = st.session_state.get("transcript_overlay", "")
    _, last_vid_emo = ctx.video_transformer.get_last()
    if last_vid_emo:
        st.session_state.video_emotion = last_vid_emo

# Auto audio analyze loop
def auto_audio_loop():
    # NOTE: writing to st.session_state from a plain background thread is
    # best-effort; Streamlit may ignore it without a script run context.
    while True:
        if auto_analyze and ctx and ctx.audio_processor:
            try:
                proc = ctx.audio_processor
                wav = proc.save_wav(TEMP_AUDIO_PATH.replace(".wav", "_auto.wav"))
                proc.clear()
                if voice_model:
                    res = voice_model.detect(wav)
                    if res:
                        st.session_state.voice_emotion = max(res, key=lambda r: r["score"])["label"]
                    txt = voice_model.subtitles(wav)
                    st.session_state.transcript = txt
                    st.session_state.transcript_overlay = txt
            except Exception:
                pass
        time.sleep(auto_interval)

# Start the background loop once per session; Streamlit reruns this script on
# every interaction, which would otherwise spawn a new thread each time.
if "auto_audio_thread_started" not in st.session_state:
    st.session_state["auto_audio_thread_started"] = True
    threading.Thread(target=auto_audio_loop, daemon=True).start()

# ---- Text analysis UI ----
st.markdown("---")
st.subheader("Text Emotion (BERT multi-label)")
text_in = st.text_area("Enter text to analyze", value=st.session_state.get("transcript", ""), height=140)
thresh = st.slider("Confidence threshold", 0.1, 0.9, 0.3, 0.05)
if st.button("Analyze text"):
    chosen, scores = analyze_text_multilabel(text_in, threshold=thresh)
    st.session_state.text_emotions = chosen
    if scores:
        st.json({k: round(v, 4) for k, v in sorted(scores.items(), key=lambda x: x[1], reverse=True)})
    if chosen:
        st.success(f"Predicted (≥{thresh:.2f}): {', '.join(chosen)}")

# ---- Multimodal Fusion ----
st.markdown("---")
st.subheader("Multimodal Fusion")
st.write("Fusion weights: video 0.5, voice 0.3, text 0.2")

def fuse(video_emotion, voice_emotion, text_emotions):
    w = {"video": 0.5, "voice": 0.3, "text": 0.2}
    s = {}
    if video_emotion:
        s[video_emotion] = s.get(video_emotion, 0) + w["video"]
    if voice_emotion:
        s[voice_emotion] = s.get(voice_emotion, 0) + w["voice"]
    if text_emotions:
        share = w["text"] / max(1, len(text_emotions))
        for t in text_emotions:
            s[t] = s.get(t, 0) + share
    return s
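
# Worked example (labels are hypothetical): fuse("happy", "neutral", ["joy", "happy"])
# splits the 0.2 text weight over the two text labels (0.1 each), giving
# {"happy": 0.6, "neutral": 0.3, "joy": 0.1}, so "happy" is reported as dominant.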

if st.button("Fuse now"):
    breakdown = fuse(st.session_state.get("video_emotion"), st.session_state.get("voice_emotion"), st.session_state.get("text_emotions", []))
    if breakdown:
        dom = max(breakdown, key=breakdown.get)
        st.success(f"Dominant emotion: {dom}")
        st.json({k: round(v, 3) for k, v in breakdown.items()})
    else:
        st.warning("No modalities available yet.")