Spaces:
Sleeping
Sleeping
k22056537 committed on
Commit ·
76adc7f
1
Parent(s): d582dbd
feat: stage 2 head pose, eye behaviour, MAR/yawn, tighter focus; add torch deps
Browse files- models/eye_behaviour/eye_attention_model.py +48 -1
- models/eye_behaviour/eye_crop.py +70 -1
- models/eye_behaviour/eye_scorer.py +167 -1
- models/face_orientation/head_pose.py +114 -1
- requirements.txt +3 -1
- ui/live_demo.py +22 -7
- ui/pipeline.py +46 -4
models/eye_behaviour/eye_attention_model.py
CHANGED
|
@@ -1 +1,48 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# MobileNetV2 eye attention classifier (attentive vs inattentive)

import torch
import torch.nn as nn
import torchvision.models as models


class EyeAttentionModel(nn.Module):
    """Binary eye-attention classifier on a MobileNetV2 backbone.

    forward() returns raw logits of shape (batch, 2); by convention here
    index 1 is the "attentive" class (see predict_score).
    """

    def __init__(
        self,
        pretrained: bool = True,
        dropout1: float = 0.3,
        dropout2: float = 0.2,
    ):
        """Build the model.

        Args:
            pretrained: load ImageNet weights for the backbone when True.
            dropout1: dropout before the 1280->256 projection.
            dropout2: dropout before the final 256->2 classifier.
        """
        super().__init__()

        weights = models.MobileNet_V2_Weights.DEFAULT if pretrained else None
        backbone = models.mobilenet_v2(weights=weights)

        self.features = backbone.features
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout1),
            nn.Linear(1280, 256),  # 1280 = MobileNetV2 final feature width
            nn.ReLU(),
            nn.Dropout(dropout2),
            nn.Linear(256, 2),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (batch, 2) for an input image batch."""
        x = self.features(x)
        x = self.pool(x).flatten(1)
        return self.classifier(x)

    def predict_score(self, x: torch.Tensor) -> torch.Tensor:
        """Return P(attentive) per sample, shape (batch,)."""
        logits = self.forward(x)
        probs = torch.softmax(logits, dim=1)
        return probs[:, 1]

    def freeze_backbone(self):
        """Disable gradients for the whole backbone (head-only training)."""
        for param in self.features.parameters():
            param.requires_grad = False

    def unfreeze_last_blocks(self, n: int = 4):
        """Re-enable gradients for the last *n* backbone blocks (fine-tuning)."""
        total_blocks = len(self.features)
        for i in range(max(0, total_blocks - n), total_blocks):
            for param in self.features[i].parameters():
                param.requires_grad = True
models/eye_behaviour/eye_crop.py
CHANGED
|
@@ -1 +1,70 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Eye region extraction from Face Mesh landmarks

import cv2
import numpy as np

# MediaPipe Face Mesh indices outlining each eye contour.
LEFT_EYE_CONTOUR = [33, 7, 163, 144, 145, 153, 154, 155, 133, 173, 157, 158, 159, 160, 161, 246]
RIGHT_EYE_CONTOUR = [362, 382, 381, 380, 374, 373, 390, 249, 263, 466, 388, 387, 386, 385, 384, 398]

# ImageNet normalization constants (crop_to_tensor matches the CNN's training preprocessing).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

CROP_SIZE = 96


def _bbox_from_landmarks(
    landmarks: np.ndarray,
    indices: list[int],
    frame_w: int,
    frame_h: int,
    expand: float = 0.4,
) -> tuple[int, int, int, int]:
    """Square pixel bbox (x1, y1, x2, y2) around the given landmark indices.

    Landmarks are assumed normalized to [0, 1] in columns 0/1 (Face Mesh
    convention); the box is centered on the points, sized to the larger side
    expanded by *expand*, and clamped to the frame.
    """
    pts = landmarks[indices, :2]
    px = pts[:, 0] * frame_w
    py = pts[:, 1] * frame_h

    x_min, x_max = px.min(), px.max()
    y_min, y_max = py.min(), py.max()
    w = x_max - x_min
    h = y_max - y_min
    cx = (x_min + x_max) / 2
    cy = (y_min + y_max) / 2

    size = max(w, h) * (1 + expand)
    half = size / 2

    x1 = int(max(cx - half, 0))
    y1 = int(max(cy - half, 0))
    x2 = int(min(cx + half, frame_w))
    y2 = int(min(cy + half, frame_h))

    return x1, y1, x2, y2


def extract_eye_crops(
    frame: np.ndarray,
    landmarks: np.ndarray,
    expand: float = 0.4,
    crop_size: int = CROP_SIZE,
) -> tuple[np.ndarray, np.ndarray, tuple, tuple]:
    """Return (left_crop, right_crop, left_bbox, right_bbox).

    Crops are resized to (crop_size, crop_size), BGR, uint8.
    NOTE(review): if a bbox collapses to zero area (face partly off-frame),
    cv2.resize raises — callers should guard against that case.
    """
    h, w = frame.shape[:2]

    left_bbox = _bbox_from_landmarks(landmarks, LEFT_EYE_CONTOUR, w, h, expand)
    right_bbox = _bbox_from_landmarks(landmarks, RIGHT_EYE_CONTOUR, w, h, expand)

    left_crop = frame[left_bbox[1] : left_bbox[3], left_bbox[0] : left_bbox[2]]
    right_crop = frame[right_bbox[1] : right_bbox[3], right_bbox[0] : right_bbox[2]]

    left_crop = cv2.resize(left_crop, (crop_size, crop_size), interpolation=cv2.INTER_AREA)
    right_crop = cv2.resize(right_crop, (crop_size, crop_size), interpolation=cv2.INTER_AREA)

    return left_crop, right_crop, left_bbox, right_bbox


def crop_to_tensor(crop_bgr: np.ndarray):
    """BGR uint8 crop -> CHW float32 torch tensor, ImageNet-normalized."""
    # Local import keeps torch optional for callers that only need the crops.
    import torch

    rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    for c in range(3):
        rgb[:, :, c] = (rgb[:, :, c] - IMAGENET_MEAN[c]) / IMAGENET_STD[c]
    return torch.from_numpy(rgb.transpose(2, 0, 1))
|
models/eye_behaviour/eye_scorer.py
CHANGED
|
@@ -1 +1,167 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# EAR + gaze from landmarks -> S_eye (no model)

import math

import numpy as np

# MediaPipe Face Mesh indices, EAR order [p1..p6]: outer, two upper, inner, two lower.
_LEFT_EYE_EAR = [33, 160, 158, 133, 153, 145]
_RIGHT_EYE_EAR = [362, 385, 387, 263, 373, 380]

# Iris centers require Face Mesh refined landmarks (indices 468-477).
_LEFT_IRIS_CENTER = 468
_RIGHT_IRIS_CENTER = 473

_LEFT_EYE_INNER = 133
_LEFT_EYE_OUTER = 33
_RIGHT_EYE_INNER = 362
_RIGHT_EYE_OUTER = 263

_LEFT_EYE_TOP = 159
_LEFT_EYE_BOTTOM = 145
_RIGHT_EYE_TOP = 386
_RIGHT_EYE_BOTTOM = 374

# Mouth (MAR) — inner lip landmarks
_MOUTH_TOP = 13
_MOUTH_BOTTOM = 14
_MOUTH_LEFT = 78
_MOUTH_RIGHT = 308
_MOUTH_UPPER_1 = 82
_MOUTH_UPPER_2 = 312
_MOUTH_LOWER_1 = 87
_MOUTH_LOWER_2 = 317

MAR_YAWN_THRESHOLD = 0.55  # MAR above this = mouth open (e.g. yawning / sleepy)


def _distance(p1: np.ndarray, p2: np.ndarray) -> float:
    """Euclidean distance between two 2-D points."""
    return float(np.linalg.norm(p1 - p2))


def compute_ear(landmarks: np.ndarray, eye_indices: list[int]) -> float:
    """Eye aspect ratio for one eye; ~0 when closed, ~0.3 when open.

    *eye_indices* is the 6-point EAR set [p1..p6]; returns 0.0 on a
    degenerate (zero-width) eye to avoid division by zero.
    """
    p1 = landmarks[eye_indices[0], :2]
    p2 = landmarks[eye_indices[1], :2]
    p3 = landmarks[eye_indices[2], :2]
    p4 = landmarks[eye_indices[3], :2]
    p5 = landmarks[eye_indices[4], :2]
    p6 = landmarks[eye_indices[5], :2]

    vertical1 = _distance(p2, p6)
    vertical2 = _distance(p3, p5)
    horizontal = _distance(p1, p4)

    if horizontal < 1e-6:
        return 0.0

    return (vertical1 + vertical2) / (2.0 * horizontal)


def compute_avg_ear(landmarks: np.ndarray) -> float:
    """Mean EAR of the left and right eyes."""
    left_ear = compute_ear(landmarks, _LEFT_EYE_EAR)
    right_ear = compute_ear(landmarks, _RIGHT_EYE_EAR)
    return (left_ear + right_ear) / 2.0


def compute_gaze_ratio(landmarks: np.ndarray) -> tuple[float, float]:
    """Return (h_ratio, v_ratio) iris position within the eye, each in [0, 1].

    0.5 means centered gaze; degenerate eye geometry falls back to 0.5.
    """
    left_iris = landmarks[_LEFT_IRIS_CENTER, :2]
    left_inner = landmarks[_LEFT_EYE_INNER, :2]
    left_outer = landmarks[_LEFT_EYE_OUTER, :2]
    left_top = landmarks[_LEFT_EYE_TOP, :2]
    left_bottom = landmarks[_LEFT_EYE_BOTTOM, :2]

    right_iris = landmarks[_RIGHT_IRIS_CENTER, :2]
    right_inner = landmarks[_RIGHT_EYE_INNER, :2]
    right_outer = landmarks[_RIGHT_EYE_OUTER, :2]
    right_top = landmarks[_RIGHT_EYE_TOP, :2]
    right_bottom = landmarks[_RIGHT_EYE_BOTTOM, :2]

    left_h_total = _distance(left_inner, left_outer)
    right_h_total = _distance(right_inner, right_outer)

    if left_h_total < 1e-6 or right_h_total < 1e-6:
        return 0.5, 0.5

    left_h_ratio = _distance(left_outer, left_iris) / left_h_total
    right_h_ratio = _distance(right_outer, right_iris) / right_h_total
    h_ratio = (left_h_ratio + right_h_ratio) / 2.0

    left_v_total = _distance(left_top, left_bottom)
    right_v_total = _distance(right_top, right_bottom)

    if left_v_total < 1e-6 or right_v_total < 1e-6:
        return h_ratio, 0.5

    left_v_ratio = _distance(left_top, left_iris) / left_v_total
    right_v_ratio = _distance(right_top, right_iris) / right_v_total
    v_ratio = (left_v_ratio + right_v_ratio) / 2.0

    return float(np.clip(h_ratio, 0, 1)), float(np.clip(v_ratio, 0, 1))


def compute_mar(landmarks: np.ndarray) -> float:
    """Mouth aspect ratio: high = mouth open (yawning / sleepy)."""
    top = landmarks[_MOUTH_TOP, :2]
    bottom = landmarks[_MOUTH_BOTTOM, :2]
    left = landmarks[_MOUTH_LEFT, :2]
    right = landmarks[_MOUTH_RIGHT, :2]
    upper1 = landmarks[_MOUTH_UPPER_1, :2]
    lower1 = landmarks[_MOUTH_LOWER_1, :2]
    upper2 = landmarks[_MOUTH_UPPER_2, :2]
    lower2 = landmarks[_MOUTH_LOWER_2, :2]

    horizontal = _distance(left, right)
    if horizontal < 1e-6:
        return 0.0
    v1 = _distance(upper1, lower1)
    v2 = _distance(top, bottom)
    v3 = _distance(upper2, lower2)
    return (v1 + v2 + v3) / (2.0 * horizontal)


class EyeBehaviourScorer:
    """Heuristic eye-behaviour score S_eye in [0, 1] from EAR + gaze."""

    def __init__(
        self,
        ear_open: float = 0.30,
        ear_closed: float = 0.16,
        gaze_max_offset: float = 0.28,
    ):
        """Thresholds: EAR >= ear_open scores 1, <= ear_closed scores 0;
        gaze offsets >= gaze_max_offset score 0."""
        self.ear_open = ear_open
        self.ear_closed = ear_closed
        self.gaze_max_offset = gaze_max_offset

    def _ear_score(self, ear: float) -> float:
        """Linear ramp of EAR between the closed and open thresholds."""
        if ear >= self.ear_open:
            return 1.0
        if ear <= self.ear_closed:
            return 0.0
        return (ear - self.ear_closed) / (self.ear_open - self.ear_closed)

    def _gaze_score(self, h_ratio: float, v_ratio: float) -> float:
        """Cosine falloff of the radial gaze offset from center (0.5, 0.5)."""
        h_offset = abs(h_ratio - 0.5)
        v_offset = abs(v_ratio - 0.5)
        offset = math.sqrt(h_offset**2 + v_offset**2)
        t = min(offset / self.gaze_max_offset, 1.0)
        return 0.5 * (1.0 + math.cos(math.pi * t))

    def score(self, landmarks: np.ndarray) -> float:
        """S_eye in [0, 1]; eyes mostly closed short-circuits the gaze term."""
        ear = compute_avg_ear(landmarks)
        ear_s = self._ear_score(ear)
        if ear_s < 0.3:
            # Eyes (nearly) closed: gaze is unreliable, use EAR score alone.
            return ear_s
        h_ratio, v_ratio = compute_gaze_ratio(landmarks)
        gaze_s = self._gaze_score(h_ratio, v_ratio)
        return ear_s * gaze_s

    def detailed_score(self, landmarks: np.ndarray) -> dict:
        """Same as score() but returns all intermediate values (rounded)."""
        ear = compute_avg_ear(landmarks)
        ear_s = self._ear_score(ear)
        h_ratio, v_ratio = compute_gaze_ratio(landmarks)
        gaze_s = self._gaze_score(h_ratio, v_ratio)
        s_eye = ear_s if ear_s < 0.3 else ear_s * gaze_s
        return {
            "ear": round(ear, 4),
            "ear_score": round(ear_s, 4),
            "h_gaze": round(h_ratio, 4),
            "v_gaze": round(v_ratio, 4),
            "gaze_score": round(gaze_s, 4),
            "s_eye": round(s_eye, 4),
        }
|
models/face_orientation/head_pose.py
CHANGED
|
@@ -1 +1,114 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Head pose from 6 Face Mesh landmarks (solvePnP) -> yaw/pitch/roll, S_face

import math

import cv2
import numpy as np

# Face Mesh indices: nose tip, chin, left eye outer, right eye outer,
# left mouth corner, right mouth corner — matching _MODEL_POINTS row order.
_LANDMARK_INDICES = [1, 152, 33, 263, 61, 291]

# Generic 3-D face model (arbitrary mm-ish units, nose tip at origin).
_MODEL_POINTS = np.array(
    [
        [0.0, 0.0, 0.0],
        [0.0, -330.0, -65.0],
        [-225.0, 170.0, -135.0],
        [225.0, 170.0, -135.0],
        [-150.0, -150.0, -125.0],
        [150.0, -150.0, -125.0],
    ],
    dtype=np.float64,
)


class HeadPoseEstimator:
    """Yaw/pitch/roll via solvePnP and a focus score S_face in [0, 1]."""

    def __init__(self, max_angle: float = 30.0, roll_weight: float = 0.5):
        """max_angle: deviation (deg) at which S_face reaches 0;
        roll_weight: how strongly roll contributes to the deviation."""
        self.max_angle = max_angle
        self.roll_weight = roll_weight
        self._camera_matrix = None
        self._frame_size = None
        # Assume an undistorted pinhole camera.
        self._dist_coeffs = np.zeros((4, 1), dtype=np.float64)

    def _get_camera_matrix(self, frame_w: int, frame_h: int) -> np.ndarray:
        """Pinhole intrinsics (focal = frame width), cached per frame size."""
        if self._camera_matrix is not None and self._frame_size == (frame_w, frame_h):
            return self._camera_matrix
        focal_length = float(frame_w)
        cx, cy = frame_w / 2.0, frame_h / 2.0
        self._camera_matrix = np.array(
            [[focal_length, 0, cx], [0, focal_length, cy], [0, 0, 1]],
            dtype=np.float64,
        )
        self._frame_size = (frame_w, frame_h)
        return self._camera_matrix

    def _solve(self, landmarks: np.ndarray, frame_w: int, frame_h: int):
        """Run solvePnP; return (success, rvec, tvec, image_points)."""
        image_points = np.array(
            [
                [landmarks[i, 0] * frame_w, landmarks[i, 1] * frame_h]
                for i in _LANDMARK_INDICES
            ],
            dtype=np.float64,
        )
        camera_matrix = self._get_camera_matrix(frame_w, frame_h)
        success, rvec, tvec = cv2.solvePnP(
            _MODEL_POINTS,
            image_points,
            camera_matrix,
            self._dist_coeffs,
            flags=cv2.SOLVEPNP_ITERATIVE,
        )
        return success, rvec, tvec, image_points

    def estimate(
        self, landmarks: np.ndarray, frame_w: int, frame_h: int
    ) -> tuple[float, float, float] | None:
        """Return (yaw, pitch, roll) in degrees, or None if solvePnP fails."""
        success, rvec, tvec, _ = self._solve(landmarks, frame_w, frame_h)
        if not success:
            return None

        rmat, _ = cv2.Rodrigues(rvec)
        # Angles derived from where the rotated nose/up axes point, which is
        # more stable than decomposing Euler angles directly.
        nose_dir = rmat @ np.array([0.0, 0.0, 1.0])
        face_up = rmat @ np.array([0.0, 1.0, 0.0])

        yaw = math.degrees(math.atan2(nose_dir[0], -nose_dir[2]))
        pitch = math.degrees(math.asin(np.clip(-nose_dir[1], -1.0, 1.0)))
        roll = math.degrees(math.atan2(face_up[0], -face_up[1]))

        return (yaw, pitch, roll)

    def score(self, landmarks: np.ndarray, frame_w: int, frame_h: int) -> float:
        """S_face in [0, 1]: cosine falloff of the combined angular deviation."""
        angles = self.estimate(landmarks, frame_w, frame_h)
        if angles is None:
            return 0.0

        yaw, pitch, roll = angles
        deviation = math.sqrt(yaw**2 + pitch**2 + (self.roll_weight * roll) ** 2)
        t = min(deviation / self.max_angle, 1.0)
        return 0.5 * (1.0 + math.cos(math.pi * t))

    def draw_axes(
        self,
        frame: np.ndarray,
        landmarks: np.ndarray,
        axis_length: float = 50.0,
    ) -> np.ndarray:
        """Draw the projected X/Y/Z head axes from the nose tip (debug overlay)."""
        h, w = frame.shape[:2]
        success, rvec, tvec, image_points = self._solve(landmarks, w, h)
        if not success:
            return frame

        camera_matrix = self._get_camera_matrix(w, h)
        nose = tuple(image_points[0].astype(int))

        axes_3d = np.float64(
            [[axis_length, 0, 0], [0, axis_length, 0], [0, 0, axis_length]]
        )
        projected, _ = cv2.projectPoints(
            axes_3d, rvec, tvec, camera_matrix, self._dist_coeffs
        )

        # X axis red, Y green, Z blue (BGR).
        colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0)]
        for i, color in enumerate(colors):
            pt = tuple(projected[i].ravel().astype(int))
            cv2.line(frame, nose, pt, color, 2)

        return frame
|
requirements.txt
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
-
#
|
| 2 |
mediapipe>=0.10.14
|
| 3 |
opencv-python>=4.8.0
|
| 4 |
numpy>=1.24.0
|
|
|
|
|
|
|
|
|
| 1 |
+
# Face mesh + head pose + eye behaviour (Stage 2); eye CNN needs torch
|
| 2 |
mediapipe>=0.10.14
|
| 3 |
opencv-python>=4.8.0
|
| 4 |
numpy>=1.24.0
|
| 5 |
+
torch>=2.0.0
|
| 6 |
+
torchvision>=0.15.0
|
ui/live_demo.py
CHANGED
|
@@ -119,12 +119,16 @@ def draw_eyes_and_irises(frame, landmarks, w, h):
|
|
| 119 |
|
| 120 |
|
| 121 |
def main():
|
| 122 |
-
parser = argparse.ArgumentParser(description="FocusGuard — Face mesh (Stage
|
| 123 |
parser.add_argument("--camera", type=int, default=0, help="Camera index")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
args = parser.parse_args()
|
| 125 |
|
| 126 |
-
print("[DEMO] Face mesh
|
| 127 |
-
pipeline = FaceMeshPipeline()
|
| 128 |
|
| 129 |
cap = cv2.VideoCapture(args.camera)
|
| 130 |
if not cap.isOpened():
|
|
@@ -156,10 +160,21 @@ def main():
|
|
| 156 |
elif mesh_mode == MESH_CONTOURS:
|
| 157 |
draw_contours(frame, lm, w, h)
|
| 158 |
draw_eyes_and_irises(frame, lm, w, h)
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
cv2.imshow("FocusGuard", frame)
|
| 165 |
|
|
|
|
| 119 |
|
| 120 |
|
| 121 |
def main():
|
| 122 |
+
parser = argparse.ArgumentParser(description="FocusGuard — Face mesh + focus (Stage 2)")
|
| 123 |
parser.add_argument("--camera", type=int, default=0, help="Camera index")
|
| 124 |
+
parser.add_argument("--max-angle", type=float, default=22.0, help="Max head angle for S_face (deg), smaller = tighter")
|
| 125 |
+
parser.add_argument("--alpha", type=float, default=0.4, help="S_face weight")
|
| 126 |
+
parser.add_argument("--beta", type=float, default=0.6, help="S_eye weight")
|
| 127 |
+
parser.add_argument("--threshold", type=float, default=0.55, help="Score >= this = FOCUSED (higher = stricter)")
|
| 128 |
args = parser.parse_args()
|
| 129 |
|
| 130 |
+
print("[DEMO] Face mesh + head pose + eye behaviour (Stage 2)")
|
| 131 |
+
pipeline = FaceMeshPipeline(max_angle=args.max_angle, alpha=args.alpha, beta=args.beta, threshold=args.threshold)
|
| 132 |
|
| 133 |
cap = cv2.VideoCapture(args.camera)
|
| 134 |
if not cap.isOpened():
|
|
|
|
| 160 |
elif mesh_mode == MESH_CONTOURS:
|
| 161 |
draw_contours(frame, lm, w, h)
|
| 162 |
draw_eyes_and_irises(frame, lm, w, h)
|
| 163 |
+
pipeline.head_pose.draw_axes(frame, lm)
|
| 164 |
+
|
| 165 |
+
# Status bar: FOCUSED / NOT FOCUSED; YAWN when mouth open (sleepy)
|
| 166 |
+
status = "FOCUSED" if result["is_focused"] else "NOT FOCUSED"
|
| 167 |
+
status_color = GREEN if result["is_focused"] else RED
|
| 168 |
+
cv2.rectangle(frame, (0, 0), (w, 55), (0, 0, 0), -1)
|
| 169 |
+
cv2.putText(frame, status, (10, 28), FONT, 0.8, status_color, 2, cv2.LINE_AA)
|
| 170 |
+
mar_str = f" MAR:{result['mar']:.2f}" if result.get("mar") is not None else ""
|
| 171 |
+
cv2.putText(frame, f"S_face:{result['s_face']:.2f} S_eye:{result['s_eye']:.2f}{mar_str} score:{result['raw_score']:.2f}", (10, 48), FONT, 0.45, WHITE, 1, cv2.LINE_AA)
|
| 172 |
+
if result.get("is_yawning"):
|
| 173 |
+
cv2.putText(frame, "YAWN", (10, 75), FONT, 0.7, ORANGE, 2, cv2.LINE_AA)
|
| 174 |
+
if result["yaw"] is not None:
|
| 175 |
+
cv2.putText(frame, f"yaw:{result['yaw']:+.0f} pitch:{result['pitch']:+.0f} roll:{result['roll']:+.0f}", (w - 280, 48), FONT, 0.4, (180, 180, 180), 1, cv2.LINE_AA)
|
| 176 |
+
cv2.putText(frame, f"{_MESH_NAMES[mesh_mode]} FPS: {fps:.0f}", (w - 200, 28), FONT, 0.45, WHITE, 1, cv2.LINE_AA)
|
| 177 |
+
cv2.putText(frame, "q:quit m:mesh", (w - 140, 48), FONT, 0.4, (180, 180, 180), 1, cv2.LINE_AA)
|
| 178 |
|
| 179 |
cv2.imshow("FocusGuard", frame)
|
| 180 |
|
ui/pipeline.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# Stage
|
| 2 |
|
| 3 |
import os
|
| 4 |
import sys
|
|
@@ -10,17 +10,59 @@ if _PROJECT_ROOT not in sys.path:
|
|
| 10 |
sys.path.insert(0, _PROJECT_ROOT)
|
| 11 |
|
| 12 |
from models.face_mesh.face_mesh import FaceMeshDetector
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class FaceMeshPipeline:
|
| 16 |
-
# frame -> face mesh ->
|
| 17 |
|
| 18 |
-
def __init__(self):
|
| 19 |
self.detector = FaceMeshDetector()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
def process_frame(self, bgr_frame: np.ndarray) -> dict:
|
| 22 |
landmarks = self.detector.process(bgr_frame)
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def close(self):
|
| 26 |
self.detector.close()
|
|
|
|
| 1 |
+
# Stage 2: face mesh + head pose (S_face) + eye behaviour (S_eye) -> focus
|
| 2 |
|
| 3 |
import os
|
| 4 |
import sys
|
|
|
|
| 10 |
sys.path.insert(0, _PROJECT_ROOT)
|
| 11 |
|
| 12 |
from models.face_mesh.face_mesh import FaceMeshDetector
|
| 13 |
+
from models.face_orientation.head_pose import HeadPoseEstimator
|
| 14 |
+
from models.eye_behaviour.eye_scorer import EyeBehaviourScorer, compute_mar, MAR_YAWN_THRESHOLD
|
| 15 |
|
| 16 |
|
| 17 |
class FaceMeshPipeline:
    """frame -> face mesh -> S_face + S_eye -> focused / not focused.

    Fusion: raw_score = alpha * S_face + beta * S_eye; a frame is focused
    when raw_score >= threshold and the subject is not yawning (MAR gate).
    """

    def __init__(self, max_angle: float = 22.0, alpha: float = 0.4, beta: float = 0.6, threshold: float = 0.55):
        """max_angle: head-angle budget (deg) for S_face; alpha/beta: fusion
        weights for S_face/S_eye; threshold: focus decision boundary."""
        self.detector = FaceMeshDetector()
        self.head_pose = HeadPoseEstimator(max_angle=max_angle)
        self.eye_scorer = EyeBehaviourScorer()
        self.alpha = alpha
        self.beta = beta
        self.threshold = threshold

    def process_frame(self, bgr_frame: np.ndarray) -> dict:
        """Process one BGR frame; returns a dict of scores and signals.

        Keys: landmarks, s_face, s_eye, raw_score, is_focused,
        yaw/pitch/roll (deg or None), mar, is_yawning. When no face is
        detected, all scores stay at their not-focused defaults.
        """
        landmarks = self.detector.process(bgr_frame)
        h, w = bgr_frame.shape[:2]

        out = {
            "landmarks": landmarks,
            "s_face": 0.0,
            "s_eye": 0.0,
            "raw_score": 0.0,
            "is_focused": False,
            "yaw": None,
            "pitch": None,
            "roll": None,
            "mar": None,
            "is_yawning": False,
        }

        if landmarks is None:
            return out

        # Head pose -> S_face, yaw/pitch/roll
        # NOTE(review): score() re-runs solvePnP after estimate(); cheap at
        # webcam rates, but a combined call would avoid the double solve.
        angles = self.head_pose.estimate(landmarks, w, h)
        if angles is not None:
            out["yaw"], out["pitch"], out["roll"] = angles
        out["s_face"] = self.head_pose.score(landmarks, w, h)

        # Eye behaviour (EAR + gaze) -> S_eye
        out["s_eye"] = self.eye_scorer.score(landmarks)

        # Mouth open (MAR) -> yawn / sleepy: force NOT FOCUSED when mouth open
        out["mar"] = compute_mar(landmarks)
        out["is_yawning"] = out["mar"] > MAR_YAWN_THRESHOLD

        # Fusion: alpha*S_face + beta*S_eye; if yawning (mouth open) -> not focused
        out["raw_score"] = self.alpha * out["s_face"] + self.beta * out["s_eye"]
        out["is_focused"] = out["raw_score"] >= self.threshold and not out["is_yawning"]

        return out

    def close(self):
        """Release the underlying face-mesh detector resources."""
        self.detector.close()
|