Spaces:

coding009
/

computer-vision

Running

File size: 44,748 Bytes

966dfa2

"""
VisionAI — Object Detection & Human Pose Estimation using YOLO
Semester Project

Key features:
  • weapon_detection.pt  — custom weapon model (bundled)
  • Pose Threat Analysis  — classifies each detected person's pose as:
        🟢 NORMAL       — relaxed / standing / walking
        🟡 SUSPICIOUS   — crouching / leaning / unusual angle
        🔴 THREATENING  — raised arms / aggressive / weapon + person together
  • FPS-based video scanning (choose how many frames/sec to analyse)
  • Works on HuggingFace free tier (CPU-safe)
"""

import cv2
import json
import math
import tempfile
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import gradio as gr
from ultralytics import YOLO

# The `spaces` package is optional. When it cannot be imported, a stand-in
# class with the same name provides a GPU decorator that changes nothing,
# so the `@spaces.GPU` decorations further down keep working.
try:
    import spaces
except ImportError:
    class spaces:
        """Fallback exposing a pass-through ``GPU`` decorator."""

        @staticmethod
        def GPU(fn):
            """Return *fn* unchanged."""
            return fn

# ══════════════════════════════════════════════════════════════════
#  MODEL LOADING
# ══════════════════════════════════════════════════════════════════
print("=" * 60)
print("[VisionAI] Loading models ...")


def _load(path, label):
    """
    Best-effort model loader.

    Builds a YOLO model from *path* and prints a status line tagged with
    *label*. Any exception raised while loading (or reporting) is printed
    and turned into a ``None`` return so start-up can continue without
    that model.
    """
    try:
        model = YOLO(path)
        print(f"  ✅  {label} ({path})")
    except Exception as err:
        # Deliberately broad: a missing/corrupt weight file must not abort start-up.
        print(f"  ⚠️  {label} skipped — {err}")
        return None
    return model

# Every model is loaded once, at import time; a failed load leaves None.
MODEL_OD = _load("yolo11m.pt", "Object Detection")
MODEL_POSE = _load("yolo11m-pose.pt", "Pose Estimation")
MODEL_SEG = _load("yolo11m-seg.pt", "Segmentation")
MODEL_CLS = _load("yolo11m-cls.pt", "Classification")
MODEL_OBB = _load("yolo11m-obb.pt", "OBB Detection")
MODEL_WEAPON = _load("weapon_detection.pt", "Weapon Detection ★")

# Ordered task registry: only tasks whose model actually loaded get an entry.
MODELS = {
    task: model
    for task, model in (
        ("object_detection", MODEL_OD),
        ("pose", MODEL_POSE),
        ("segmentation", MODEL_SEG),
        ("classification", MODEL_CLS),
        ("obb", MODEL_OBB),
        ("weapon", MODEL_WEAPON),
    )
    if model
}

# Human-readable label for each task key.
TASK_DISPLAY = {
    "object_detection": "🔍 Object Detection",
    "pose": "🦴 Pose Estimation",
    "segmentation": "🎭 Segmentation",
    "classification": "🏷️ Classification",
    "obb": "📦 OBB Detection",
    "weapon": "🔫 Weapon Detection",
}

# Loaded tasks from the overlay-capable set (classification is not in it).
OVERLAY_TASKS = [
    name
    for name in ("object_detection", "pose", "segmentation", "obb", "weapon")
    if name in MODELS
]
ALL_TASKS = list(MODELS)

print(f"[VisionAI] ✅ {len(MODELS)} models loaded: {ALL_TASKS}")
print("=" * 60)


# ══════════════════════════════════════════════════════════════════
#  POSE THREAT ANALYSER
#  COCO 17 keypoints:
#   0-nose  1-left_eye  2-right_eye  3-left_ear  4-right_ear
#   5-left_shoulder  6-right_shoulder
#   7-left_elbow     8-right_elbow
#   9-left_wrist    10-right_wrist
#  11-left_hip      12-right_hip
#  13-left_knee     14-right_knee
#  15-left_ankle    16-right_ankle
# ══════════════════════════════════════════════════════════════════

# Threat level identifiers, from least to most severe.
THREAT_NORMAL = "NORMAL"
THREAT_SUSPICIOUS = "SUSPICIOUS"
THREAT_THREATENING = "THREATENING"

# Display colour per threat level, stored as (R, G, B).
THREAT_COLOR = {
    THREAT_NORMAL: (34, 197, 94),        # green
    THREAT_SUSPICIOUS: (234, 179, 8),    # yellow
    THREAT_THREATENING: (239, 68, 68),   # red
}

# Status emoji per threat level.
THREAT_EMOJI = {
    THREAT_NORMAL: "🟢",
    THREAT_SUSPICIOUS: "🟡",
    THREAT_THREATENING: "🔴",
}

def _kp(kps, idx):
    """
    Look up keypoint *idx* in *kps* and return ``(x, y, visible)``.

    An index past the end of *kps* yields ``(0, 0, False)``. Otherwise the
    coordinates are returned as floats and ``visible`` is True only when
    both of them are greater than 1.
    """
    if idx < len(kps):
        point = kps[idx]
        px = float(point[0])
        py = float(point[1])
        return px, py, (px > 1 and py > 1)
    return 0, 0, False

def _angle(a, b, c):
    """
    Return the angle in degrees at vertex *b* between rays b→a and b→c.

    A tiny epsilon is added to the product of the two ray lengths so a
    zero-length ray does not divide by zero, and the cosine is clamped to
    [-1, 1] before ``acos``.
    """
    bx, by = b[0], b[1]
    ray_a = (a[0] - bx, a[1] - by)
    ray_c = (c[0] - bx, c[1] - by)
    dot = ray_a[0] * ray_c[0] + ray_a[1] * ray_c[1]
    norm = (math.hypot(ray_a[0], ray_a[1]) * math.hypot(ray_c[0], ray_c[1])) + 1e-6
    cosine = max(-1, min(1, dot / norm))
    return math.degrees(math.acos(cosine))

def analyse_pose_threat(kps, weapon_in_frame=False):
    """
    Score one person's pose and map the score to a threat level.

    kps: list of [x, y] for the 17 COCO keypoints (pixel coordinates).
         Keypoints that _kp reports as not visible are skipped by every rule.
    weapon_in_frame: when True, 4 points are added because a weapon was
         detected in the same frame.

    Returns (threat_level, reason_string, score):
        threat_level  — THREAT_THREATENING (score >= 6),
                        THREAT_SUSPICIOUS (score >= 2) or THREAT_NORMAL
        reason_string — comma-separated cues that fired, or "relaxed posture"
        score         — the accumulated integer score

    Note: the 20 px and 120 px thresholds are absolute pixel distances, so
    they are sensitive to the frame resolution.
    """
    # ── Extract the joints the rules below actually use ──
    ls_x, ls_y, ls_v = _kp(kps, 5)    # left shoulder
    rs_x, rs_y, rs_v = _kp(kps, 6)    # right shoulder
    le_x, le_y, le_v = _kp(kps, 7)    # left elbow
    re_x, re_y, re_v = _kp(kps, 8)    # right elbow
    lw_x, lw_y, lw_v = _kp(kps, 9)    # left wrist
    rw_x, rw_y, rw_v = _kp(kps, 10)   # right wrist
    _, lh_y, lh_v = _kp(kps, 11)      # left hip
    _, rh_y, rh_v = _kp(kps, 12)      # right hip
    _, lk_y, lk_v = _kp(kps, 13)      # left knee
    _, rk_y, rk_v = _kp(kps, 14)      # right knee
    la_x, la_y, la_v = _kp(kps, 15)   # left ankle
    ra_x, ra_y, ra_v = _kp(kps, 16)   # right ankle

    reasons = []
    score = 0   # accumulated threat score

    # ── 1. ARMS RAISED (wrists above shoulders) ──
    # y decreases upward in image coordinates, hence the "<".
    arms_raised = 0
    if lw_v and ls_v and lw_y < ls_y - 20:
        arms_raised += 1
    if rw_v and rs_v and rw_y < rs_y - 20:
        arms_raised += 1
    if arms_raised == 2:
        score += 3
        reasons.append("both arms raised")
    elif arms_raised == 1:
        score += 1
        reasons.append("one arm raised")

    # ── 2. ARMS EXTENDED (wrist far from the shoulder midpoint horizontally) ──
    if ls_v and rs_v:
        body_cx = (ls_x + rs_x) / 2
        if lw_v and abs(lw_x - body_cx) > 120:
            score += 1
            reasons.append("left arm extended")
        if rw_v and abs(rw_x - body_cx) > 120:
            score += 1
            reasons.append("right arm extended")

    # ── 3. ELBOW ANGLE (acute = punching / striking pose) ──
    if lw_v and le_v and ls_v:
        ang = _angle((ls_x, ls_y), (le_x, le_y), (lw_x, lw_y))
        if ang < 70:
            score += 2
            reasons.append(f"left arm bent aggressively ({ang:.0f}°)")
    if rw_v and re_v and rs_v:
        ang = _angle((rs_x, rs_y), (re_x, re_y), (rw_x, rw_y))
        if ang < 70:
            score += 2
            reasons.append(f"right arm bent aggressively ({ang:.0f}°)")

    # ── 4. CROUCHING (knee close to hip height relative to the hip–ankle span) ──
    # Either leg may trigger the cue (the left leg is often occluded), but it
    # is only counted once.
    legs = (
        (lh_y, lh_v, lk_y, lk_v, la_y, la_v),
        (rh_y, rh_v, rk_y, rk_v, ra_y, ra_v),
    )
    for hip_y, hip_v, knee_y, knee_v, ankle_y, ankle_v in legs:
        if not (hip_v and knee_v and ankle_v):
            continue
        leg_h = abs(hip_y - ankle_y) + 1e-6   # epsilon avoids division by zero
        if (knee_y - hip_y) / leg_h < 0.15:
            score += 1
            reasons.append("crouching posture")
            break

    # ── 5. LEANING / TILTED BODY (shoulder line slope) ──
    if ls_v and rs_v:
        shoulder_tilt = abs(ls_y - rs_y) / (abs(ls_x - rs_x) + 1e-6)
        if shoulder_tilt > 0.45:
            score += 1
            reasons.append(f"body tilted ({shoulder_tilt:.2f})")

    # ── 6. WEAPON IN SAME FRAME ──
    if weapon_in_frame:
        score += 4
        reasons.append("weapon detected nearby")

    # ── 7. WIDE STANCE (ankle spread vs shoulder width) ──
    if la_v and ra_v and ls_v and rs_v:
        shoulder_w = abs(ls_x - rs_x) + 1e-6
        stance_w = abs(la_x - ra_x)
        if stance_w / shoulder_w > 1.8:
            score += 1
            reasons.append("wide aggressive stance")

    # ── Map score → threat level ──
    if score >= 6:
        level = THREAT_THREATENING
    elif score >= 2:
        level = THREAT_SUSPICIOUS
    else:
        level = THREAT_NORMAL

    reason_str = ", ".join(reasons) if reasons else "relaxed posture"
    return level, reason_str, score


# ══════════════════════════════════════════════════════════════════
#  OVERLAY DRAWING
# ══════════════════════════════════════════════════════════════════
def draw_threat_overlay(frame_bgr, persons):
    """
    Draw a threat status badge per person plus an overall banner.

    frame_bgr: BGR image (numpy array); it is copied, not modified.
    persons:   list of dicts with keys: bbox, threat, reason, score
               (may be empty, in which case only the copy is returned).
    Returns the annotated BGR frame.

    All text is ASCII-only: OpenCV's Hershey fonts cannot render emoji and
    would draw them as question marks.
    """
    out = frame_bgr.copy()
    for p in persons:
        x1, y1, x2, y2 = (int(v) for v in p["bbox"])
        threat = p["threat"]
        bgr = THREAT_COLOR[threat][::-1]   # stored as (R,G,B); cv2 wants BGR

        # Bounding box border
        cv2.rectangle(out, (x1, y1), (x2, y2), bgr, 2)

        # Label badge: above the box, or just inside it when the box touches
        # the top edge and the badge would otherwise fall outside the frame.
        label = threat
        (tw, th), _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 0.6, 1)
        badge_top = y1 - th - 8
        if badge_top < 0:
            badge_top = max(y1, 0)
        cv2.rectangle(out, (x1, badge_top), (x1 + tw + 8, badge_top + th + 8), bgr, -1)
        cv2.putText(out, label, (x1 + 4, badge_top + th + 4),
                    cv2.FONT_HERSHEY_DUPLEX, 0.6, (255, 255, 255), 1, cv2.LINE_AA)

        # Reason sub-label (smaller, below box), capped at 50 characters
        reason_short = p["reason"][:50]
        cv2.putText(out, reason_short, (x1 + 2, y2 + 16),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.42, bgr, 1, cv2.LINE_AA)

    # ── Overall frame status banner (top-left of frame) ──
    if persons:
        # The person with the highest score decides the banner colour/text.
        worst = max(persons, key=lambda p: p["score"])
        w_threat = worst["threat"]
        w_bgr = THREAT_COLOR[w_threat][::-1]
        banner = f"OVERALL: {w_threat}  ({len(persons)} person(s) detected)"
        (bw, bh), _ = cv2.getTextSize(banner, cv2.FONT_HERSHEY_DUPLEX, 0.7, 1)
        cv2.rectangle(out, (0, 0), (bw + 16, bh + 12), w_bgr, -1)
        cv2.putText(out, banner, (8, bh + 4),
                    cv2.FONT_HERSHEY_DUPLEX, 0.7, (255, 255, 255), 1, cv2.LINE_AA)

    return out


def run_combined_analysis(frame_np, conf, iou, img_size):
    """
    Run Weapon Detection + Pose Estimation on one frame and score each person.

    frame_np: RGB image as a numpy array (what PIL / Gradio hand over).
    conf, iou, img_size: forwarded to the models' predict() calls.

    Returns (annotated PIL RGB image, analysis dict). The dict has the keys
    persons_detected, weapon_detected, weapons, persons and overall_threat.
    """
    # Ultralytics interprets numpy input as BGR and r.plot() returns BGR, so
    # convert once and stay in BGR until the final conversion back to RGB.
    frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR)

    # ── Step 1: Weapon detection ──
    weapon_dets = []
    if MODEL_WEAPON:
        w_res = MODEL_WEAPON.predict(source=frame_bgr, conf=conf, iou=iou,
                                     imgsz=img_size, verbose=False)
        for r in w_res:
            if r.boxes is None:
                continue
            for box in r.boxes:
                weapon_dets.append({
                    "label": MODEL_WEAPON.names[int(box.cls)],
                    "confidence": round(float(box.conf), 3),
                    "bbox": [round(v, 1) for v in box.xyxy[0].tolist()],
                })
    weapon_in_frame = bool(weapon_dets)

    # ── Step 2: Pose estimation ──
    persons = []
    anno_bgr = frame_bgr.copy()
    if MODEL_POSE:
        p_res = MODEL_POSE.predict(source=frame_bgr, conf=conf, iou=iou,
                                   imgsz=img_size, verbose=False)
        for r in p_res:
            # Skeleton overlay (BGR); made contiguous so cv2 can draw on it.
            anno_bgr = np.ascontiguousarray(r.plot())
            if r.boxes is None or r.keypoints is None:
                continue
            for i, box in enumerate(r.boxes):
                if MODEL_POSE.names[int(box.cls)] != "person":
                    continue
                kps = r.keypoints.xy[i].tolist()
                threat, reason, score = analyse_pose_threat(kps, weapon_in_frame)
                persons.append({
                    "id":     i,
                    "bbox":   [round(v, 1) for v in box.xyxy[0].tolist()],
                    "threat": threat,
                    "reason": reason,
                    "score":  score,
                    "keypoints_count": sum(1 for k in kps if k[0] > 1 and k[1] > 1),
                })

    # ── Step 3: Draw weapon boxes on top ──
    # ASCII-only label: cv2's Hershey fonts draw emoji as question marks.
    for wd in weapon_dets:
        x1, y1, x2, y2 = (int(v) for v in wd["bbox"])
        cv2.rectangle(anno_bgr, (x1, y1), (x2, y2), (0, 0, 220), 3)
        lbl = f"{wd['label']} {wd['confidence']:.0%}"
        cv2.putText(anno_bgr, lbl, (x1, y1 - 6),
                    cv2.FONT_HERSHEY_DUPLEX, 0.6, (0, 0, 220), 1)

    # ── Step 4: Draw threat overlays ──
    anno_bgr = draw_threat_overlay(anno_bgr, persons)

    # Back to RGB PIL
    out_pil = Image.fromarray(cv2.cvtColor(anno_bgr, cv2.COLOR_BGR2RGB))

    # The frame's overall level is the most severe level of any person.
    severity = [THREAT_NORMAL, THREAT_SUSPICIOUS, THREAT_THREATENING]
    if persons:
        overall = max((p["threat"] for p in persons), key=severity.index)
    else:
        overall = THREAT_NORMAL

    analysis = {
        "persons_detected": len(persons),
        "weapon_detected":  weapon_in_frame,
        "weapons":          weapon_dets,
        "persons":          persons,
        "overall_threat":   overall,
    }
    return out_pil, analysis


# ══════════════════════════════════════════════════════════════════
#  CORE HELPERS (single-model path)
# ══════════════════════════════════════════════════════════════════
def predict(model, frame_np, conf, iou, img_size):
    """Call ``model.predict`` on one frame with quiet output and labels/confidences enabled."""
    options = dict(
        source=frame_np,
        conf=conf,
        iou=iou,
        imgsz=img_size,
        verbose=False,
        show_labels=True,
        show_conf=True,
    )
    return model.predict(**options)

def extract_dets(results, task, model):
    """
    Flatten prediction results into a list of JSON-friendly dicts.

    results: iterable of result objects returned by the model.
    task:    task key; "classification" reads the top-5 probabilities,
             "pose" additionally attaches the keypoints of each detection.
    model:   object whose ``names`` maps class index → label.

    Classification entries have keys label/confidence; every other task
    yields id/label/confidence/bbox (plus keypoints for pose). Results that
    carry their detections in ``obb`` instead of ``boxes`` (oriented-box
    models) are read from there, with bbox being the axis-aligned xyxy box.
    """
    dets = []
    for r in results:
        if task == "classification":
            if r.probs is not None:
                for idx, c in zip(r.probs.top5, r.probs.top5conf.tolist()):
                    dets.append({"label": model.names[idx], "confidence": round(float(c), 3)})
            continue

        boxes = r.boxes
        if boxes is None:
            # Oriented-box results leave `boxes` empty and fill `obb` instead.
            boxes = getattr(r, "obb", None)
        if boxes is None:
            continue

        for i, box in enumerate(boxes):
            d = {
                "id": i,
                "label": model.names[int(box.cls)],
                "confidence": round(float(box.conf), 3),
                "bbox": [round(v, 1) for v in box.xyxy[0].tolist()],
            }
            if task == "pose" and r.keypoints is not None:
                kps = r.keypoints.xy[i].tolist()
                d["keypoints"] = [[round(x, 1), round(y, 1)] for x, y in kps]
            dets.append(d)
    return dets

def to_pil(results):
    """
    Render the first result as a PIL image, or return None when there is none.

    The plotted array has its channel axis reversed before being handed to
    ``Image.fromarray``.
    """
    pending = iter(results)
    try:
        first = next(pending)
    except StopIteration:
        return None
    plotted = first.plot()
    return Image.fromarray(plotted[..., ::-1])

def resize_frame(frame, src_w, src_h, max_side=640):
    """
    Fit a src_w x src_h frame inside *max_side* with even output dimensions.

    frame:    image array, or None to only compute the output geometry.
    src_w/h:  source size in pixels.
    max_side: longest allowed side; smaller sources are never upscaled.

    Returns (frame_or_None, out_w, out_h, scale). When a frame is given, the
    returned frame always has exactly the reported out_w x out_h size:
    it is resized when downscaling, and trimmed by one row/column when the
    source is small enough but has an odd dimension.
    """
    scale = min(max_side / max(src_w, src_h), 1.0)
    if scale < 1.0:
        # Round down to even sizes; never let an extreme aspect ratio
        # collapse a side to 0, which cv2.resize cannot handle.
        out_w = max(int(src_w * scale) & ~1, 2)
        out_h = max(int(src_h * scale) & ~1, 2)
        if frame is not None:
            frame = cv2.resize(frame, (out_w, out_h))
        return frame, out_w, out_h, scale

    out_w, out_h = src_w & ~1, src_h & ~1
    if frame is not None and (out_w != src_w or out_h != src_h):
        # Odd source size: drop the last row/column so the frame matches the
        # even size reported to the caller.
        frame = np.ascontiguousarray(frame[:out_h, :out_w])
    return frame, out_w, out_h, 1.0

def _frame_interval(src_fps, scan_fps):
    """
    Return how many source frames to step between analysed frames.

    The scan rate is capped at the source rate, the ratio is rounded with
    ``round`` and the result is never less than 1.
    """
    effective_fps = src_fps if scan_fps > src_fps else scan_fps
    step = round(src_fps / effective_fps)
    return step if step > 1 else 1


# ══════════════════════════════════════════════════════════════════
#  INFERENCE FUNCTIONS
# ══════════════════════════════════════════════════════════════════

# ── COMBINED IMAGE (Pose + OD + Weapon + Threat) ──────────────────
@spaces.GPU
def infer_combined_image(image, conf, iou, img_size):
    """
    Run the combined analysis on one uploaded PIL image.

    Returns (annotated image, analysis as indented JSON text), or
    (None, error JSON) when no image was supplied.
    """
    if image is not None:
        rgb_pixels = np.array(image.convert("RGB"))
        annotated, report = run_combined_analysis(rgb_pixels, conf, iou, img_size)
        return annotated, json.dumps(report, indent=2)
    return None, '{"error":"No image"}'


# ── SINGLE MODEL IMAGE ────────────────────────────────────────────
@spaces.GPU
def infer_image(image, task, conf, iou, img_size):
    """
    Run a single model (selected by *task*) on one uploaded PIL image.

    Returns (annotated PIL image, JSON text with task/count/detections).
    When no image is supplied, or the model for *task* did not load at
    start-up, returns (None, JSON text with an "error" key).
    """
    if image is None:
        return None, '{"error":"No image"}'

    model = MODELS.get(task)
    if model is None:
        # The tab may exist even though its weights failed to load.
        return None, json.dumps({"error": f"Model for task '{task}' is not loaded"})

    # Ultralytics treats numpy input as BGR, and to_pil() flips the BGR plot
    # back to RGB — feeding RGB here would swap red and blue in the output.
    img_bgr = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
    results = predict(model, img_bgr, conf, iou, img_size)
    dets = extract_dets(results, task, model)
    out_img = to_pil(results)
    payload = {"task": TASK_DISPLAY[task], "count": len(dets), "detections": dets}
    # ensure_ascii=False keeps the emoji in the task label readable.
    return out_img, json.dumps(payload, indent=2, ensure_ascii=False)


# ── COMBINED VIDEO (Pose Threat per frame) ────────────────────────
@spaces.GPU
def infer_combined_video(video_path, conf, iou, img_size,
                         scan_fps=1, max_frames=300, progress=gr.Progress()):
    """
    Run the combined pose-threat + weapon analysis over a video.

    Only every `interval`-th source frame (derived from *scan_fps*) is
    analysed and written, so the output video contains annotated frames
    only. Processing stops after *max_frames* analysed frames.

    Returns (path to the annotated .mp4, JSON summary text), or
    (None, JSON text with an "error" key) when there is no video, the video
    cannot be opened, or no output writer could be created.
    """
    if video_path is None:
        return None, '{"error":"No video"}'

    cap = cv2.VideoCapture(video_path)
    vw = None
    try:
        if not cap.isOpened():
            return None, '{"error":"Cannot open video"}'

        src_fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        src_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 640
        src_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 480
        total_src = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 1)

        # Analyse one frame every `interval` source frames; the output plays
        # back at the matching reduced frame rate.
        scan_fps = float(scan_fps) if scan_fps else 1.0
        interval = _frame_interval(src_fps, scan_fps)
        out_fps = max(src_fps / interval, 1.0)
        frame_cap = int(max_frames)

        _, out_w, out_h, _ = resize_frame(None, src_w, src_h)

        # Close the temp-file handle right away; only the path is needed.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as fh:
            tmp = fh.name
        # Try H.264 first (smaller + browser-compatible), fall back to mp4v
        vw = cv2.VideoWriter(tmp, cv2.VideoWriter_fourcc(*"avc1"), out_fps, (out_w, out_h))
        if not vw.isOpened():
            vw.release()
            vw = cv2.VideoWriter(tmp, cv2.VideoWriter_fourcc(*"mp4v"), out_fps, (out_w, out_h))
        if not vw.isOpened():
            return None, '{"error":"Cannot create output video"}'

        frame_idx = 0
        proc_count = 0
        threat_counts = {THREAT_NORMAL: 0, THREAT_SUSPICIOUS: 0, THREAT_THREATENING: 0}
        total_weapons = 0
        progress(0, desc="Starting …")

        while proc_count < frame_cap:
            ret, frame = cap.read()
            if not ret:
                break

            # Only process and write frames at the target scan rate
            if frame_idx % interval == 0:
                # The writer silently drops frames whose size differs from
                # the one it was opened with, so always match it (this also
                # covers small sources with an odd width/height).
                if frame.shape[1] != out_w or frame.shape[0] != out_h:
                    frame = cv2.resize(frame, (out_w, out_h))

                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # run_combined_analysis already returns the fully annotated
                # frame, so the pose model is not run a second time here.
                annotated, analysis = run_combined_analysis(frame_rgb, conf, iou, img_size)
                vw.write(cv2.cvtColor(np.array(annotated), cv2.COLOR_RGB2BGR))

                for p in analysis["persons"]:
                    threat_counts[p["threat"]] += 1
                total_weapons += len(analysis["weapons"])

                proc_count += 1
                ot = analysis["overall_threat"]
                progress(min(frame_idx / total_src, 0.99),
                         desc=f"Frame {frame_idx}/{total_src} | {THREAT_EMOJI[ot]} {ot}")

            frame_idx += 1
    finally:
        # Release the capture/writer even if analysis raised midway.
        cap.release()
        if vw is not None:
            vw.release()

    progress(1.0, desc="✓ Done!")

    payload = {
        "source_fps":       round(src_fps, 2),
        "scan_fps":         round(scan_fps, 2),
        "frame_interval":   interval,
        "frames_scanned":   proc_count,
        "total_frames":     frame_idx,
        "resolution":       f"{out_w}x{out_h}",
        "weapon_detections": total_weapons,
        "pose_threat_summary": {
            f"{THREAT_EMOJI[THREAT_NORMAL]} NORMAL":           threat_counts[THREAT_NORMAL],
            f"{THREAT_EMOJI[THREAT_SUSPICIOUS]} SUSPICIOUS":   threat_counts[THREAT_SUSPICIOUS],
            f"{THREAT_EMOJI[THREAT_THREATENING]} THREATENING":  threat_counts[THREAT_THREATENING],
        },
    }
    # ensure_ascii=False keeps the emoji keys readable instead of \u escapes.
    return tmp, json.dumps(payload, indent=2, ensure_ascii=False)


# ── SINGLE MODEL VIDEO ────────────────────────────────────────────
@spaces.GPU
def infer_video(video_path, task, conf, iou, img_size,
                scan_fps=1, max_frames=300, progress=gr.Progress()):
    """
    Run a single model (selected by *task*) over a video.

    Only every `interval`-th source frame (derived from *scan_fps*) is
    analysed and written. Processing stops after *max_frames* analysed
    frames.

    Returns (path to the annotated .mp4, JSON summary text), or
    (None, JSON text with an "error" key) when there is no video, the model
    for *task* is not loaded, the video cannot be opened, or no output
    writer could be created.
    """
    if video_path is None:
        return None, '{"error":"No video"}'

    model = MODELS.get(task)
    if model is None:
        # The tab may exist even though its weights failed to load.
        return None, json.dumps({"error": f"Model for task '{task}' is not loaded"})

    cap = cv2.VideoCapture(video_path)
    vw = None
    try:
        if not cap.isOpened():
            return None, '{"error":"Cannot open video"}'

        src_fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        src_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 640
        src_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 480
        total_src = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 1)

        scan_fps = float(scan_fps) if scan_fps else 1.0
        interval = _frame_interval(src_fps, scan_fps)
        out_fps = max(src_fps / interval, 1.0)
        frame_cap = int(max_frames)

        _, out_w, out_h, _ = resize_frame(None, src_w, src_h)

        # Close the temp-file handle right away; only the path is needed.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as fh:
            tmp = fh.name
        # Try H.264 first (smaller + browser-compatible), fall back to mp4v
        vw = cv2.VideoWriter(tmp, cv2.VideoWriter_fourcc(*"avc1"), out_fps, (out_w, out_h))
        if not vw.isOpened():
            vw.release()
            vw = cv2.VideoWriter(tmp, cv2.VideoWriter_fourcc(*"mp4v"), out_fps, (out_w, out_h))
        if not vw.isOpened():
            return None, '{"error":"Cannot create output video"}'

        frame_idx = 0
        proc_count = 0
        total_dets = 0
        progress(0, desc="Starting …")

        while proc_count < frame_cap:
            ret, frame = cap.read()
            if not ret:
                break

            # Only process and write frames at the target scan rate
            if frame_idx % interval == 0:
                # The writer silently drops frames whose size differs from
                # the one it was opened with, so always match it.
                if frame.shape[1] != out_w or frame.shape[0] != out_h:
                    frame = cv2.resize(frame, (out_w, out_h))

                annotated_bgr = frame   # written as-is if the model yields nothing
                for r in predict(model, frame, conf, iou, img_size):
                    # cv2 frames are BGR and r.plot() also returns BGR, so the
                    # plot can go to the writer without any channel swap.
                    annotated_bgr = r.plot()
                    # Oriented-box results keep detections in `obb`, not `boxes`.
                    boxes = r.boxes if r.boxes is not None else getattr(r, "obb", None)
                    if boxes is not None:
                        total_dets += len(boxes)
                vw.write(annotated_bgr)

                proc_count += 1
                progress(min(frame_idx / total_src, 0.99),
                         desc=f"Frame {frame_idx}/{total_src} | {total_dets} dets")
            frame_idx += 1
    finally:
        # Release the capture/writer even if inference raised midway.
        cap.release()
        if vw is not None:
            vw.release()

    progress(1.0, desc="✓ Done!")
    payload = {
        "task": TASK_DISPLAY[task],
        "source_fps": round(src_fps, 2), "scan_fps": round(scan_fps, 2),
        "frame_interval": interval, "frames_scanned": proc_count,
        "resolution": f"{out_w}x{out_h}", "total_detections": total_dets,
        "avg_detections_per_scanned_frame": round(total_dets / max(proc_count, 1), 2),
    }
    # ensure_ascii=False keeps the emoji in the task label readable.
    return tmp, json.dumps(payload, indent=2, ensure_ascii=False)


# ── WEBCAM — COMBINED (Pose Threat + Weapon live) ─────────────────
@spaces.GPU
def stream_webcam_combined(frame, conf, iou, img_size):
    """
    Annotate one webcam frame with the combined analysis.

    Returns the annotated frame as a numpy array, or None when no frame
    arrived. The analysis dict is discarded.
    """
    if frame is not None:
        annotated, _analysis = run_combined_analysis(frame, conf, iou, img_size)
        return np.array(annotated)
    return None


# ── WEBCAM — SINGLE MODEL ─────────────────────────────────────────
@spaces.GPU
def stream_webcam(frame, task, conf, iou, img_size):
    """
    Annotate one webcam frame with the model selected by *task*.

    frame: RGB numpy frame from the webcam component, or None.
    Returns the annotated RGB frame, None when no frame arrived, or the
    untouched input frame when the model produced no result.
    """
    if frame is None:
        return None
    model = MODELS[task]
    # Ultralytics treats numpy input as BGR and r.plot() returns BGR, so
    # convert on the way in and on the way out; otherwise red and blue end
    # up swapped in the displayed stream.
    frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    results = predict(model, frame_bgr, conf, iou, img_size)
    for r in results:
        return cv2.cvtColor(r.plot(), cv2.COLOR_BGR2RGB)
    return frame


# ══════════════════════════════════════════════════════════════════
#  UI HELPERS
# ══════════════════════════════════════════════════════════════════
def shared_controls(default_conf=0.25):
    """
    Lay out the three inference sliders in a single row.

    Returns (confidence slider, IoU slider, image-size slider); the
    confidence slider starts at *default_conf*.
    """
    with gr.Row():
        conf_slider = gr.Slider(minimum=0.05, maximum=0.95, value=default_conf,
                                step=0.05, label="Confidence")
        iou_slider = gr.Slider(minimum=0.05, maximum=0.95, value=0.45,
                               step=0.05, label="IoU Threshold")
        size_slider = gr.Slider(minimum=320, maximum=1280, value=640,
                                step=32, label="Image Size")
    return conf_slider, iou_slider, size_slider

def video_controls():
    """
    Lay out the video-only controls in a single row.

    Returns (scan-FPS radio group, max-frames slider).
    """
    fps_options = [1, 2, 3, 5, 8, 10, 15, 24]
    with gr.Row():
        fps_radio = gr.Radio(
            label="Scan FPS  ·  frames per second to analyse  ·  higher = thorough but slower",
            choices=fps_options,
            value=5,
            type="value",
        )
        frame_cap = gr.Slider(minimum=50, maximum=600, value=200, step=50,
                              label="Max Frames Cap")
    return fps_radio, frame_cap

# Display order of the tasks; TASK_CHOICES pairs (label, key) for each task
# in that order whose model is loaded.
_order = [
    "object_detection",
    "pose",
    "segmentation",
    "classification",
    "obb",
    "weapon",
]
TASK_CHOICES = [
    (TASK_DISPLAY[key], key)
    for key in _order
    if key in MODELS
]


# ══════════════════════════════════════════════════════════════════
#  CSS
# ══════════════════════════════════════════════════════════════════
# Stylesheet handed to gr.Blocks(css=CSS). Besides the page-wide dark theme
# it styles the classes used by the gr.HTML snippets in this file:
# .hero, .threat-banner, .threat-legend (+ .tl-* colours), .tip, .weapon-note.
CSS = """
body,.gradio-container{
  background:#060c1a!important;color:#e2e8f0!important;
  font-family:'Segoe UI',system-ui,sans-serif
}
.hero{
  background:linear-gradient(135deg,#0d1b2a,#1a2744,#0f3460);
  border-radius:16px;padding:2rem;margin-bottom:1rem;
  border:1px solid #1e3a5f;text-align:center
}
.hero h1{
  font-size:2rem;font-weight:800;
  background:linear-gradient(90deg,#38bdf8,#818cf8,#34d399);
  -webkit-background-clip:text;-webkit-text-fill-color:transparent;margin:0
}
.hero p{color:#94a3b8;margin:.4rem 0 0}
.threat-banner{
  background:linear-gradient(135deg,rgba(99,102,241,.12),rgba(34,211,238,.08));
  border:1px solid rgba(99,102,241,.4);border-radius:12px;
  padding:.85rem 1.25rem;margin-bottom:.75rem;font-size:.9rem
}
.threat-legend{
  display:flex;gap:1rem;flex-wrap:wrap;margin-top:.5rem;font-size:.82rem
}
.tl-normal{color:#22c55e}  .tl-sus{color:#eab308}  .tl-threat{color:#ef4444}
.tip{
  background:rgba(52,211,153,.08);border:1px solid rgba(52,211,153,.3);
  border-radius:8px;padding:.5rem 1rem;color:#6ee7b7;font-size:.84rem;margin-bottom:.5rem
}
.weapon-note{
  background:rgba(239,68,68,.08);border:1px solid rgba(239,68,68,.25);
  border-radius:8px;padding:.5rem 1rem;color:#fca5a5;font-size:.84rem;margin-bottom:.5rem
}
"""


# ══════════════════════════════════════════════════════════════════
#  GRADIO UI
# ══════════════════════════════════════════════════════════════════
# Colour-coded legend of the three threat levels. It is interpolated into
# the f-string banners of the Pose Threat Analysis tab and its webcam
# sub-tab; the tl-* classes are coloured by CSS.
THREAT_LEGEND_HTML = """
<div class="threat-legend">
  <span class="tl-normal">🟢 NORMAL — relaxed / standing / walking</span>
  <span class="tl-sus">🟡 SUSPICIOUS — crouching / leaning / unusual posture</span>
  <span class="tl-threat">🔴 THREATENING — raised arms / aggressive / weapon present</span>
</div>"""

with gr.Blocks(css=CSS, title="VisionAI — Object Detection & Pose Estimation") as app:

    gr.HTML("""
    <div class="hero">
      <h1>🤖 VisionAI — Object Detection & Human Pose Estimation</h1>
      <p>YOLO11 · Pose Threat Analysis · Weapon Detection (weapon_detection.pt) · FPS-based Video Scanning
      <br><small style="color:#64748b">Semester Project — all models pre-loaded at startup</small></p>
    </div>""")

    with gr.Tabs():

        # ════════════════════════════════════════════════════════
        #  TAB 1 — POSE THREAT ANALYSIS (primary feature)
        # ════════════════════════════════════════════════════════
        with gr.Tab("🎯 Pose Threat Analysis"):
            gr.HTML(f"""
            <div class="threat-banner">
              <strong>Pose Threat Analysis</strong> — Runs Pose Estimation + Weapon Detection together.
              Each detected person is classified by posture:
              {THREAT_LEGEND_HTML}
            </div>""")

            with gr.Tabs():

                # IMAGE
                with gr.Tab("📷 Image"):
                    with gr.Row():
                        with gr.Column():
                            ta_img_in = gr.Image(type="pil", label="Upload Image")
                            conf_tai, iou_tai, sz_tai = shared_controls()
                            btn_tai = gr.Button("🎯 Analyse Threat", variant="primary")
                        with gr.Column():
                            ta_img_out  = gr.Image(type="pil", label="Annotated Result")
                            ta_img_json = gr.Code(label="Threat Analysis JSON", language="json")
                    btn_tai.click(infer_combined_image,
                                  [ta_img_in, conf_tai, iou_tai, sz_tai],
                                  [ta_img_out, ta_img_json])

                # VIDEO
                with gr.Tab("🎬 Video"):
                    gr.HTML('<div class="tip">⚡ Pose threat is evaluated on every scanned frame. Use Scan FPS 3–5 on free tier.</div>')
                    with gr.Row():
                        with gr.Column():
                            ta_vid_in = gr.Video(label="Upload Video")
                            conf_tav, iou_tav, sz_tav = shared_controls()
                            fs_tav, mf_tav = video_controls()
                            btn_tav = gr.Button("🎯 Analyse Video Threats", variant="primary")
                        with gr.Column():
                            ta_vid_out  = gr.Video(label="Annotated Output")
                            ta_vid_json = gr.Code(label="Threat Summary JSON", language="json")
                    btn_tav.click(infer_combined_video,
                                  [ta_vid_in, conf_tav, iou_tav, sz_tav, fs_tav, mf_tav],
                                  [ta_vid_out, ta_vid_json])

                # WEBCAM
                with gr.Tab("📡 Live Webcam"):
                    gr.HTML(f"""
                    <div class="threat-banner">
                      📡 <strong>Live Pose Threat Detection</strong> — real-time per-person threat classification.
                      {THREAT_LEGEND_HTML}
                    </div>""")
                    with gr.Row():
                        with gr.Column(scale=1):
                            conf_taw, iou_taw, sz_taw = shared_controls(default_conf=0.30)
                            gr.Markdown("""
**Tips for live accuracy:**
- Stand in full view of camera
- Ensure good lighting
- Image Size 320 = faster on CPU
- Raise both arms to test 🔴 THREATENING
                            """)
                        with gr.Column(scale=2):
                            ta_cam_in  = gr.Image(sources=["webcam"], streaming=True,
                                                  type="numpy", label="Webcam Feed")
                            ta_cam_out = gr.Image(streaming=True,
                                                  label="🎯 Live Threat Analysis")
                    ta_cam_in.stream(stream_webcam_combined,
                                     [ta_cam_in, conf_taw, iou_taw, sz_taw],
                                     [ta_cam_out])

        # ════════════════════════════════════════════════════════
        #  TAB 2 — WEAPON DETECTION
        # ════════════════════════════════════════════════════════
        with gr.Tab("🔫 Weapon Detection"):
            # Banner describing the bundled custom weapon model.
            gr.HTML("""
            <div class="weapon-note">
              🔫 <strong>Custom Weapon Detection Model</strong> (weapon_detection.pt) —
              detects firearms and other weapons. Combined with pose analysis for full threat assessment.
            </div>""")
            with gr.Tabs():
                # ── Still image: inputs on the left, annotated result + JSON on the right ──
                with gr.Tab("📷 Image"):
                    with gr.Row():
                        with gr.Column():
                            wp_in = gr.Image(label="Upload Image", type="pil")
                            # Every weapon sub-tab requests a 0.20 default confidence.
                            conf_wp, iou_wp, sz_wp = shared_controls(default_conf=0.20)
                            btn_wp = gr.Button("🔫 Detect Weapons", variant="primary")
                        with gr.Column():
                            wp_out = gr.Image(label="Result", type="pil")
                            wp_json = gr.Code(language="json", label="Detection JSON")
                    # gr.State("weapon") feeds the fixed task key as a constant input.
                    btn_wp.click(
                        fn=infer_image,
                        inputs=[wp_in, gr.State("weapon"), conf_wp, iou_wp, sz_wp],
                        outputs=[wp_out, wp_json],
                    )

                # ── Uploaded video: same controls plus the two video-specific ones ──
                with gr.Tab("🎬 Video"):
                    with gr.Row():
                        with gr.Column():
                            wpv_in = gr.Video(label="Upload Video")
                            conf_wpv, iou_wpv, sz_wpv = shared_controls(default_conf=0.20)
                            fs_wpv, mf_wpv = video_controls()
                            btn_wpv = gr.Button("🔫 Detect Weapons in Video", variant="primary")
                        with gr.Column():
                            wpv_out = gr.Video(label="Annotated Video")
                            wpv_json = gr.Code(language="json", label="Summary JSON")
                    btn_wpv.click(
                        fn=infer_video,
                        inputs=[
                            wpv_in, gr.State("weapon"),
                            conf_wpv, iou_wpv, sz_wpv,
                            fs_wpv, mf_wpv,
                        ],
                        outputs=[wpv_out, wpv_json],
                    )

                # ── Live webcam: each streamed frame goes through stream_webcam ──
                with gr.Tab("📡 Webcam"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            conf_wpc, iou_wpc, sz_wpc = shared_controls(default_conf=0.20)
                        with gr.Column(scale=2):
                            wpc_in = gr.Image(
                                label="Webcam",
                                sources=["webcam"],
                                type="numpy",
                                streaming=True,
                            )
                            wpc_out = gr.Image(label="🔫 Weapon Detection Live", streaming=True)
                    # The lambda pins the task key; the frame and the sliders come from the UI.
                    wpc_in.stream(
                        fn=lambda f, c, i, s: stream_webcam(f, "weapon", c, i, s),
                        inputs=[wpc_in, conf_wpc, iou_wpc, sz_wpc],
                        outputs=[wpc_out],
                    )

        # ════════════════════════════════════════════════════════
        #  TAB 3 — OBJECT DETECTION
        # ════════════════════════════════════════════════════════
        with gr.Tab("🔍 Object Detection"):
            with gr.Tabs():
                # ── Still image ──
                with gr.Tab("📷 Image"):
                    with gr.Row():
                        with gr.Column():
                            od_in = gr.Image(label="Upload Image", type="pil")
                            conf_od, iou_od, sz_od = shared_controls()
                            btn_od = gr.Button("▶ Run Detection", variant="primary")
                        with gr.Column():
                            od_out = gr.Image(label="Result", type="pil")
                            od_json = gr.Code(language="json", label="JSON")
                    # gr.State("object_detection") feeds the fixed task key as a constant input.
                    btn_od.click(
                        fn=infer_image,
                        inputs=[od_in, gr.State("object_detection"), conf_od, iou_od, sz_od],
                        outputs=[od_out, od_json],
                    )

                # ── Uploaded video ──
                with gr.Tab("🎬 Video"):
                    with gr.Row():
                        with gr.Column():
                            odv_in = gr.Video(label="Upload Video")
                            conf_odv, iou_odv, sz_odv = shared_controls()
                            fs_odv, mf_odv = video_controls()
                            btn_odv = gr.Button("▶ Process Video", variant="primary")
                        with gr.Column():
                            odv_out = gr.Video(label="Annotated Video")
                            odv_json = gr.Code(language="json", label="Summary JSON")
                    btn_odv.click(
                        fn=infer_video,
                        inputs=[
                            odv_in, gr.State("object_detection"),
                            conf_odv, iou_odv, sz_odv,
                            fs_odv, mf_odv,
                        ],
                        outputs=[odv_out, odv_json],
                    )

                # ── Live webcam ──
                with gr.Tab("📡 Webcam"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            conf_odc, iou_odc, sz_odc = shared_controls()
                        with gr.Column(scale=2):
                            odc_in = gr.Image(
                                label="Webcam",
                                sources=["webcam"],
                                type="numpy",
                                streaming=True,
                            )
                            odc_out = gr.Image(label="Live Detection", streaming=True)
                    # The lambda pins the task key; the frame and the sliders come from the UI.
                    odc_in.stream(
                        fn=lambda f, c, i, s: stream_webcam(f, "object_detection", c, i, s),
                        inputs=[odc_in, conf_odc, iou_odc, sz_odc],
                        outputs=[odc_out],
                    )

        # ════════════════════════════════════════════════════════
        #  TAB 4 — POSE ESTIMATION (standalone)
        # ════════════════════════════════════════════════════════
        with gr.Tab("🦴 Pose Estimation"):
            with gr.Tabs():
                # ── Still image ──
                with gr.Tab("📷 Image"):
                    with gr.Row():
                        with gr.Column():
                            pe_in = gr.Image(label="Upload Image", type="pil")
                            conf_pe, iou_pe, sz_pe = shared_controls()
                            btn_pe = gr.Button("▶ Estimate Pose", variant="primary")
                        with gr.Column():
                            pe_out = gr.Image(label="Skeleton Result", type="pil")
                            pe_json = gr.Code(language="json", label="Keypoints JSON")
                    # gr.State("pose") feeds the fixed task key as a constant input.
                    btn_pe.click(
                        fn=infer_image,
                        inputs=[pe_in, gr.State("pose"), conf_pe, iou_pe, sz_pe],
                        outputs=[pe_out, pe_json],
                    )

                # ── Uploaded video ──
                with gr.Tab("🎬 Video"):
                    with gr.Row():
                        with gr.Column():
                            pev_in = gr.Video(label="Upload Video")
                            conf_pev, iou_pev, sz_pev = shared_controls()
                            fs_pev, mf_pev = video_controls()
                            btn_pev = gr.Button("▶ Process Video", variant="primary")
                        with gr.Column():
                            pev_out = gr.Video(label="Annotated Video")
                            pev_json = gr.Code(language="json", label="Summary JSON")
                    btn_pev.click(
                        fn=infer_video,
                        inputs=[
                            pev_in, gr.State("pose"),
                            conf_pev, iou_pev, sz_pev,
                            fs_pev, mf_pev,
                        ],
                        outputs=[pev_out, pev_json],
                    )

                # ── Live webcam ──
                with gr.Tab("📡 Webcam"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            conf_pec, iou_pec, sz_pec = shared_controls()
                        with gr.Column(scale=2):
                            pec_in = gr.Image(
                                label="Webcam",
                                sources=["webcam"],
                                type="numpy",
                                streaming=True,
                            )
                            pec_out = gr.Image(label="Live Skeleton", streaming=True)
                    # The lambda pins the task key; the frame and the sliders come from the UI.
                    pec_in.stream(
                        fn=lambda f, c, i, s: stream_webcam(f, "pose", c, i, s),
                        inputs=[pec_in, conf_pec, iou_pec, sz_pec],
                        outputs=[pec_out],
                    )

        # ════════════════════════════════════════════════════════
        #  TAB 5 — OTHER MODELS
        # ════════════════════════════════════════════════════════
        with gr.Tab("🧩 More Models"):
            with gr.Tabs():
                with gr.Tab("📷 Image"):
                    # Offer only the optional tasks whose model is registered in MODELS;
                    # each choice is a (display label, task key) pair.
                    other_choices = [(TASK_DISPLAY[t], t) for t in
                                     ["segmentation", "classification", "obb"] if t in MODELS]
                    if other_choices:
                        task_om = gr.Radio(choices=other_choices, value=other_choices[0][1],
                                           label="Select Model")
                        with gr.Row():
                            with gr.Column():
                                om_in = gr.Image(type="pil", label="Upload Image")
                                conf_om, iou_om, sz_om = shared_controls()
                                btn_om = gr.Button("▶ Run", variant="primary")
                            with gr.Column():
                                om_out  = gr.Image(type="pil", label="Result")
                                om_json = gr.Code(label="JSON", language="json")
                        # The selected radio value is passed as the task key.
                        btn_om.click(infer_image,
                                     [om_in, task_om, conf_om, iou_om, sz_om],
                                     [om_out, om_json])
                    else:
                        # Without this notice the tab would render completely blank
                        # when none of the optional models is available.
                        gr.Markdown("⚠️ No segmentation, classification or OBB model is loaded, "
                                    "so there is nothing to run here.")

                with gr.Tab("🎬 Video"):
                    # Same availability filter as the Image sub-tab; copied rather than
                    # recomputed, keeping the separate name for backward compatibility.
                    other_choices_v = list(other_choices)
                    if other_choices_v:
                        task_omv = gr.Radio(choices=other_choices_v, value=other_choices_v[0][1],
                                            label="Select Model")
                        with gr.Row():
                            with gr.Column():
                                omv_in = gr.Video(label="Upload Video")
                                conf_omv, iou_omv, sz_omv = shared_controls()
                                fs_omv, mf_omv = video_controls()
                                btn_omv = gr.Button("▶ Process Video", variant="primary")
                            with gr.Column():
                                omv_out  = gr.Video(label="Annotated Video")
                                omv_json = gr.Code(label="Summary JSON", language="json")
                        btn_omv.click(infer_video,
                                      [omv_in, task_omv, conf_omv, iou_omv, sz_omv, fs_omv, mf_omv],
                                      [omv_out, omv_json])
                    else:
                        # Mirror the Image sub-tab: explain the empty state instead of
                        # showing a blank panel.
                        gr.Markdown("⚠️ No segmentation, classification or OBB model is loaded, "
                                    "so there is nothing to run here.")

    # Static footer: a centred credits line, created after all tab sections.
    gr.HTML("""
    <div style="text-align:center;padding:1.5rem;color:#475569;font-size:.82rem;
                margin-top:1rem;border-top:1px solid #1e293b;">
        VisionAI · Object Detection &amp; Human Pose Estimation · YOLO11 · weapon_detection.pt · Semester Project
    </div>""")

# Script entry point: launch the server only when this file is run directly,
# not when it is imported.
if __name__ == "__main__":
    # `app` is presumably the gr.Blocks instance built above — confirm against its definition.
    # Listens on every network interface at port 7860; show_error=True asks
    # Gradio to surface errors to the user.
    app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)