k22056537 committed on
Commit
da26163
·
1 Parent(s): 76adc7f

feat: add optional eye model (YOLO/MobileNet) alongside geometry

Browse files
models/eye_behaviour/eye_classifier.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Swappable eye classifier: geometric only, MobileNetV2 (96x96), or YOLO open/closed (224x224)
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ import cv2
8
+ import numpy as np
9
+
10
+
class EyeClassifier(ABC):
    """Interface for swappable eye-state classifiers.

    Implementations receive BGR eye crops and return an attentiveness
    score in [0, 1], where 1.0 means attentive (eyes open).
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short backend identifier (e.g. "geometric", "mobilenet", "yolo")."""

    @abstractmethod
    def predict_score(self, crops_bgr: list[np.ndarray]) -> float:
        """Score [left_crop, right_crop] BGR crops; 1.0 = attentive (open)."""
22
+
class GeometricOnlyClassifier(EyeClassifier):
    """Fallback backend: no learned model; the geometric EAR/gaze score decides."""

    @property
    def name(self) -> str:
        return "geometric"

    def predict_score(self, crops_bgr: list[np.ndarray]) -> float:
        # Neutral score so the fusion reduces to the geometric signal alone.
        return 1.0
31
+
class MobileNetV2Classifier(EyeClassifier):
    """MobileNetV2-based eye classifier.

    Expects BGR eye crops; each crop is resized to CROP_SIZE x CROP_SIZE
    (96x96) and normalized via ``crop_to_tensor`` (ImageNet statistics).
    """

    def __init__(self, checkpoint_path: str, device: str = "cpu"):
        """Load an EyeAttentionModel state dict onto *device*.

        Heavy imports are deferred so this backend stays optional.
        """
        import torch

        from models.eye_behaviour.eye_attention_model import EyeAttentionModel
        from models.eye_behaviour.eye_crop import crop_to_tensor, CROP_SIZE

        self._crop_to_tensor = crop_to_tensor
        self._crop_size = CROP_SIZE
        self._device = torch.device(device)

        self._model = EyeAttentionModel(pretrained=False).to(self._device)
        # weights_only=True: safe tensor-only deserialization of the state dict.
        self._model.load_state_dict(
            torch.load(checkpoint_path, map_location=self._device, weights_only=True)
        )
        self._model.eval()

    @property
    def name(self) -> str:
        return "mobilenet"

    def predict_score(self, crops_bgr: list[np.ndarray]) -> float:
        """Return the mean open-eye score over the usable BGR crops.

        Crops that are None or zero-size (e.g. an eye clipped at the frame
        edge) are skipped — cv2.resize would raise on them. With no usable
        crops, the neutral 1.0 is returned so geometry alone drives S_eye.
        """
        import torch

        tensors = []
        for crop in crops_bgr:
            if crop is None or crop.size == 0:
                continue  # zero-size crop would make cv2.resize raise
            resized = cv2.resize(
                crop, (self._crop_size, self._crop_size), interpolation=cv2.INTER_AREA
            )
            tensors.append(self._crop_to_tensor(resized))
        if not tensors:
            return 1.0
        batch = torch.stack(tensors).to(self._device)
        with torch.no_grad():
            scores = self._model.predict_score(batch)
        return scores.mean().item()
68
+
class YOLOv11Classifier(EyeClassifier):
    """YOLO classification backend for open/closed eyes.

    Ultralytics resizes inputs to the model's own size (224x224)
    internally, so raw BGR crops can be passed straight in.
    """

    def __init__(self, checkpoint_path: str, device: str = "cpu"):
        """Load a YOLO classification checkpoint.

        Raises ImportError when ultralytics is not installed.
        """
        from ultralytics import YOLO

        self._model = YOLO(checkpoint_path)
        self._device = device

        # Find the class index meaning "eyes open". Match case-insensitively:
        # training datasets often capitalize class names ("Open"), which the
        # exact-match version silently missed.
        names = self._model.names
        self._attentive_idx = None
        for idx, cls_name in names.items():
            if cls_name.lower() in ("open", "attentive"):
                self._attentive_idx = idx
                break
        if self._attentive_idx is None:
            # Fallback heuristic: assume the highest class index is "open".
            self._attentive_idx = max(names.keys())
        print(f"[YOLO] Classes: {names}, attentive_idx={self._attentive_idx}")

    @property
    def name(self) -> str:
        return "yolo"

    def predict_score(self, crops_bgr: list[np.ndarray]) -> float:
        """Return the mean attentive-class probability over usable crops.

        None/zero-size crops are dropped; with nothing to score, the
        neutral 1.0 is returned so geometry alone drives S_eye.
        """
        crops = [c for c in crops_bgr if c is not None and c.size > 0]
        if not crops:
            return 1.0
        results = self._model.predict(crops, device=self._device, verbose=False)
        scores = [float(r.probs.data[self._attentive_idx]) for r in results]
        return sum(scores) / len(scores) if scores else 1.0
98
+
def _is_yolo_checkpoint(path: str) -> bool:
    """Best-effort sniff: does *path* look like an ultralytics YOLO checkpoint?

    Returns False on any load failure (missing file, corrupt data, ...).
    """
    try:
        import torch

        # SECURITY: weights_only=False executes arbitrary pickle code from
        # the file. Only point this at checkpoints you trust.
        data = torch.load(path, map_location="cpu", weights_only=False)
    except Exception:
        return False
    if not isinstance(data, dict):
        return False
    model_obj = data.get("model")
    if model_obj is not None and "Model" in type(model_obj).__name__:
        return True
    return "train_args" in data and "model" in data
114
+
def load_eye_classifier(
    path: str | None = None,
    backend: str = "auto",
    device: str = "cpu",
) -> EyeClassifier:
    """Build an eye classifier for *path* with the requested backend.

    Args:
        path: Checkpoint file, or None for geometry-only scoring.
        backend: "geometric", "mobilenet", "yolo", or "auto" to sniff the
            checkpoint format.
        device: Torch/ultralytics device string (e.g. "cpu", "cuda:0").

    Returns:
        An EyeClassifier; GeometricOnlyClassifier when no model is requested.

    Raises:
        ValueError: Unknown *backend* name (previously a typo fell through
            to auto-detection silently).
        ImportError: A YOLO backend was requested/detected but ultralytics
            is not installed.
    """
    if backend not in ("auto", "geometric", "mobilenet", "yolo"):
        raise ValueError(
            f"Unknown eye backend {backend!r}; expected auto/geometric/mobilenet/yolo"
        )

    if path is None or backend == "geometric":
        return GeometricOnlyClassifier()

    if backend == "yolo":
        try:
            return YOLOv11Classifier(path, device=device)
        except ImportError:
            print("[CLASSIFIER] ultralytics required. pip install ultralytics")
            raise

    if backend == "mobilenet":
        return MobileNetV2Classifier(path, device=device)

    # backend == "auto": sniff the checkpoint, preferring YOLO when it looks
    # like an ultralytics export.
    if _is_yolo_checkpoint(path):
        try:
            return YOLOv11Classifier(path, device=device)
        except ImportError:
            print("[CLASSIFIER] YOLO checkpoint needs ultralytics. pip install ultralytics")
            raise
    try:
        return MobileNetV2Classifier(path, device=device)
    except Exception as exc:
        # Last resort: a YOLO checkpoint the sniffer missed fails the
        # MobileNet weights_only load with an ultralytics-specific error.
        err = str(exc)
        if "Weights only load failed" in err and "ultralytics" in err:
            try:
                return YOLOv11Classifier(path, device=device)
            except ImportError:
                print("[CLASSIFIER] pip install ultralytics for this checkpoint")
                raise
        raise
requirements.txt CHANGED
@@ -4,3 +4,4 @@ opencv-python>=4.8.0
4
  numpy>=1.24.0
5
  torch>=2.0.0
6
  torchvision>=0.15.0
 
 
4
  numpy>=1.24.0
5
  torch>=2.0.0
6
  torchvision>=0.15.0
7
+ # ultralytics # optional: for YOLO open/closed eye classifier
ui/README.md CHANGED
@@ -2,14 +2,21 @@
2
 
3
  Live demo and session view.
4
 
5
- ## Stage 1 (face mesh only)
6
 
7
- - **pipeline.py** — frame → 478 landmarks (no head pose / CNN).
8
- - **live_demo.py** — webcam + mesh overlay (tessellation, contours, eyes, irises).
9
 
10
  From repo root:
11
  ```bash
12
  pip install -r requirements.txt
13
  python ui/live_demo.py
14
  ```
 
 
 
 
 
 
 
15
  `q` = quit, `m` = cycle mesh mode (full / contours / off).
 
2
 
3
  Live demo and session view.
4
 
5
+ ## Stage 2 (face mesh + head pose + eye)
6
 
7
+ - **pipeline.py** — face mesh S_face (head pose) + S_eye (geometry + optional YOLO/MobileNet) + MAR/yawn → focus.
8
+ - **live_demo.py** — webcam + mesh, FOCUSED/NOT FOCUSED, MAR, YAWN, optional eye model.
9
 
10
  From repo root:
11
  ```bash
12
  pip install -r requirements.txt
13
  python ui/live_demo.py
14
  ```
15
+ With YOLO open/closed model (face mesh crops eyes → 224×224 → YOLO):
16
+ ```bash
17
+ pip install ultralytics
18
+ python ui/live_demo.py --eye-model path/to/yolo.pt --eye-backend yolo
19
+ ```
20
+ With MobileNetV2 (96×96 crops): `--eye-model path/to/best_model.pt --eye-backend mobilenet`.
21
+
22
  `q` = quit, `m` = cycle mesh mode (full / contours / off).
ui/live_demo.py CHANGED
@@ -125,10 +125,22 @@ def main():
125
  parser.add_argument("--alpha", type=float, default=0.4, help="S_face weight")
126
  parser.add_argument("--beta", type=float, default=0.6, help="S_eye weight")
127
  parser.add_argument("--threshold", type=float, default=0.55, help="Score >= this = FOCUSED (higher = stricter)")
 
 
 
128
  args = parser.parse_args()
129
 
130
- print("[DEMO] Face mesh + head pose + eye behaviour (Stage 2)")
131
- pipeline = FaceMeshPipeline(max_angle=args.max_angle, alpha=args.alpha, beta=args.beta, threshold=args.threshold)
 
 
 
 
 
 
 
 
 
132
 
133
  cap = cv2.VideoCapture(args.camera)
134
  if not cap.isOpened():
@@ -161,6 +173,11 @@ def main():
161
  draw_contours(frame, lm, w, h)
162
  draw_eyes_and_irises(frame, lm, w, h)
163
  pipeline.head_pose.draw_axes(frame, lm)
 
 
 
 
 
164
 
165
  # Status bar: FOCUSED / NOT FOCUSED; YAWN when mouth open (sleepy)
166
  status = "FOCUSED" if result["is_focused"] else "NOT FOCUSED"
@@ -173,7 +190,8 @@ def main():
173
  cv2.putText(frame, "YAWN", (10, 75), FONT, 0.7, ORANGE, 2, cv2.LINE_AA)
174
  if result["yaw"] is not None:
175
  cv2.putText(frame, f"yaw:{result['yaw']:+.0f} pitch:{result['pitch']:+.0f} roll:{result['roll']:+.0f}", (w - 280, 48), FONT, 0.4, (180, 180, 180), 1, cv2.LINE_AA)
176
- cv2.putText(frame, f"{_MESH_NAMES[mesh_mode]} FPS: {fps:.0f}", (w - 200, 28), FONT, 0.45, WHITE, 1, cv2.LINE_AA)
 
177
  cv2.putText(frame, "q:quit m:mesh", (w - 140, 48), FONT, 0.4, (180, 180, 180), 1, cv2.LINE_AA)
178
 
179
  cv2.imshow("FocusGuard", frame)
 
125
  parser.add_argument("--alpha", type=float, default=0.4, help="S_face weight")
126
  parser.add_argument("--beta", type=float, default=0.6, help="S_eye weight")
127
  parser.add_argument("--threshold", type=float, default=0.55, help="Score >= this = FOCUSED (higher = stricter)")
128
+ parser.add_argument("--eye-model", type=str, default=None, help="Path to eye model (YOLO .pt or MobileNet .pt); omit = geometry only")
129
+ parser.add_argument("--eye-backend", type=str, default="auto", choices=["auto", "mobilenet", "yolo", "geometric"], help="Eye model backend (auto = detect from file)")
130
+ parser.add_argument("--eye-blend", type=float, default=0.5, help="Blend: (1-blend)*geo + blend*model when model loaded")
131
  args = parser.parse_args()
132
 
133
+ eye_mode = " + model" if args.eye_model else " only"
134
+ print("[DEMO] Face mesh + head pose + eye (geometry" + eye_mode + ")")
135
+ pipeline = FaceMeshPipeline(
136
+ max_angle=args.max_angle,
137
+ alpha=args.alpha,
138
+ beta=args.beta,
139
+ threshold=args.threshold,
140
+ eye_model_path=args.eye_model,
141
+ eye_backend=args.eye_backend,
142
+ eye_blend=args.eye_blend,
143
+ )
144
 
145
  cap = cv2.VideoCapture(args.camera)
146
  if not cap.isOpened():
 
173
  draw_contours(frame, lm, w, h)
174
  draw_eyes_and_irises(frame, lm, w, h)
175
  pipeline.head_pose.draw_axes(frame, lm)
176
+ if result.get("left_bbox") and result.get("right_bbox"):
177
+ lx1, ly1, lx2, ly2 = result["left_bbox"]
178
+ rx1, ry1, rx2, ry2 = result["right_bbox"]
179
+ cv2.rectangle(frame, (lx1, ly1), (lx2, ly2), YELLOW, 1)
180
+ cv2.rectangle(frame, (rx1, ry1), (rx2, ry2), YELLOW, 1)
181
 
182
  # Status bar: FOCUSED / NOT FOCUSED; YAWN when mouth open (sleepy)
183
  status = "FOCUSED" if result["is_focused"] else "NOT FOCUSED"
 
190
  cv2.putText(frame, "YAWN", (10, 75), FONT, 0.7, ORANGE, 2, cv2.LINE_AA)
191
  if result["yaw"] is not None:
192
  cv2.putText(frame, f"yaw:{result['yaw']:+.0f} pitch:{result['pitch']:+.0f} roll:{result['roll']:+.0f}", (w - 280, 48), FONT, 0.4, (180, 180, 180), 1, cv2.LINE_AA)
193
+ eye_label = f"eye:{pipeline.eye_classifier.name}" if pipeline.has_eye_model else "eye:geo"
194
+ cv2.putText(frame, f"{_MESH_NAMES[mesh_mode]} {eye_label} FPS: {fps:.0f}", (w - 320, 28), FONT, 0.45, WHITE, 1, cv2.LINE_AA)
195
  cv2.putText(frame, "q:quit m:mesh", (w - 140, 48), FONT, 0.4, (180, 180, 180), 1, cv2.LINE_AA)
196
 
197
  cv2.imshow("FocusGuard", frame)
ui/pipeline.py CHANGED
@@ -1,4 +1,4 @@
1
- # Stage 2: face mesh + head pose (S_face) + eye behaviour (S_eye) -> focus
2
 
3
  import os
4
  import sys
@@ -12,18 +12,39 @@ if _PROJECT_ROOT not in sys.path:
12
  from models.face_mesh.face_mesh import FaceMeshDetector
13
  from models.face_orientation.head_pose import HeadPoseEstimator
14
  from models.eye_behaviour.eye_scorer import EyeBehaviourScorer, compute_mar, MAR_YAWN_THRESHOLD
 
 
15
 
16
 
17
  class FaceMeshPipeline:
18
- # frame -> face mesh -> S_face + S_eye -> focused / not focused
19
-
20
- def __init__(self, max_angle: float = 22.0, alpha: float = 0.4, beta: float = 0.6, threshold: float = 0.55):
 
 
 
 
 
 
 
 
 
21
  self.detector = FaceMeshDetector()
22
  self.head_pose = HeadPoseEstimator(max_angle=max_angle)
23
  self.eye_scorer = EyeBehaviourScorer()
24
  self.alpha = alpha
25
  self.beta = beta
26
  self.threshold = threshold
 
 
 
 
 
 
 
 
 
 
27
 
28
  def process_frame(self, bgr_frame: np.ndarray) -> dict:
29
  landmarks = self.detector.process(bgr_frame)
@@ -40,6 +61,8 @@ class FaceMeshPipeline:
40
  "roll": None,
41
  "mar": None,
42
  "is_yawning": False,
 
 
43
  }
44
 
45
  if landmarks is None:
@@ -51,19 +74,31 @@ class FaceMeshPipeline:
51
  out["yaw"], out["pitch"], out["roll"] = angles
52
  out["s_face"] = self.head_pose.score(landmarks, w, h)
53
 
54
- # Eye behaviour (EAR + gaze) -> S_eye
55
- out["s_eye"] = self.eye_scorer.score(landmarks)
56
-
57
- # Mouth open (MAR) -> yawn / sleepy: force NOT FOCUSED when mouth open
 
 
 
 
 
 
 
 
58
  out["mar"] = compute_mar(landmarks)
59
  out["is_yawning"] = out["mar"] > MAR_YAWN_THRESHOLD
60
 
61
- # Fusion: alpha*S_face + beta*S_eye; if yawning (mouth open) -> not focused
62
  out["raw_score"] = self.alpha * out["s_face"] + self.beta * out["s_eye"]
63
  out["is_focused"] = out["raw_score"] >= self.threshold and not out["is_yawning"]
64
 
65
  return out
66
 
 
 
 
 
67
  def close(self):
68
  self.detector.close()
69
 
 
1
+ # Stage 2: face mesh + head pose (S_face) + eye (geometry + optional model) -> focus
2
 
3
  import os
4
  import sys
 
12
  from models.face_mesh.face_mesh import FaceMeshDetector
13
  from models.face_orientation.head_pose import HeadPoseEstimator
14
  from models.eye_behaviour.eye_scorer import EyeBehaviourScorer, compute_mar, MAR_YAWN_THRESHOLD
15
+ from models.eye_behaviour.eye_crop import extract_eye_crops
16
+ from models.eye_behaviour.eye_classifier import load_eye_classifier, GeometricOnlyClassifier
17
 
18
 
19
  class FaceMeshPipeline:
20
+ # frame -> face mesh -> S_face + S_eye (geo + optional YOLO/MobileNet) -> focused / not focused
21
+
22
+ def __init__(
23
+ self,
24
+ max_angle: float = 22.0,
25
+ alpha: float = 0.4,
26
+ beta: float = 0.6,
27
+ threshold: float = 0.55,
28
+ eye_model_path: str | None = None,
29
+ eye_backend: str = "auto",
30
+ eye_blend: float = 0.5,
31
+ ):
32
  self.detector = FaceMeshDetector()
33
  self.head_pose = HeadPoseEstimator(max_angle=max_angle)
34
  self.eye_scorer = EyeBehaviourScorer()
35
  self.alpha = alpha
36
  self.beta = beta
37
  self.threshold = threshold
38
+ self.eye_blend = eye_blend # 0.5 = 50% geo + 50% model when model loaded
39
+
40
+ self.eye_classifier = load_eye_classifier(
41
+ path=eye_model_path if eye_model_path and os.path.exists(eye_model_path) else None,
42
+ backend=eye_backend,
43
+ device="cpu",
44
+ )
45
+ self._has_eye_model = not isinstance(self.eye_classifier, GeometricOnlyClassifier)
46
+ if self._has_eye_model:
47
+ print(f"[PIPELINE] Eye model: {self.eye_classifier.name}")
48
 
49
  def process_frame(self, bgr_frame: np.ndarray) -> dict:
50
  landmarks = self.detector.process(bgr_frame)
 
61
  "roll": None,
62
  "mar": None,
63
  "is_yawning": False,
64
+ "left_bbox": None,
65
+ "right_bbox": None,
66
  }
67
 
68
  if landmarks is None:
 
74
  out["yaw"], out["pitch"], out["roll"] = angles
75
  out["s_face"] = self.head_pose.score(landmarks, w, h)
76
 
77
+ # Eye: geometry (EAR + gaze) always; optional model (YOLO/MobileNet) on cropped eyes
78
+ s_eye_geo = self.eye_scorer.score(landmarks)
79
+ if self._has_eye_model:
80
+ left_crop, right_crop, left_bbox, right_bbox = extract_eye_crops(bgr_frame, landmarks)
81
+ out["left_bbox"] = left_bbox
82
+ out["right_bbox"] = right_bbox
83
+ s_eye_model = self.eye_classifier.predict_score([left_crop, right_crop])
84
+ out["s_eye"] = (1.0 - self.eye_blend) * s_eye_geo + self.eye_blend * s_eye_model
85
+ else:
86
+ out["s_eye"] = s_eye_geo
87
+
88
+ # Mouth open (MAR) -> yawn: force NOT FOCUSED when mouth open
89
  out["mar"] = compute_mar(landmarks)
90
  out["is_yawning"] = out["mar"] > MAR_YAWN_THRESHOLD
91
 
92
+ # Fusion; yawn overrides
93
  out["raw_score"] = self.alpha * out["s_face"] + self.beta * out["s_eye"]
94
  out["is_focused"] = out["raw_score"] >= self.threshold and not out["is_yawning"]
95
 
96
  return out
97
 
98
+ @property
99
+ def has_eye_model(self) -> bool:
100
+ return self._has_eye_model
101
+
102
  def close(self):
103
  self.detector.close()
104