import asyncio
import base64
import concurrent.futures
import functools
import hashlib
import io
import os
import threading
import warnings

# InsightFace uses np.linalg.lstsq without rcond — suppress the FutureWarning.
warnings.filterwarnings("ignore", category=FutureWarning, module="insightface")
# Suppress PyTorch meta-tensor copy warnings from AdaFace model loading.
warnings.filterwarnings("ignore", category=UserWarning, module="torch.nn.modules.module")

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image, ImageOps
from transformers import AutoImageProcessor, AutoModel, AutoProcessor
from ultralytics import YOLO
import insightface  # noqa: F401
from insightface.app import FaceAnalysis

from src.core.config import (
    MAX_IMAGE_SIZE, MAX_CROPS, YOLO_PERSON_CLASS_ID,
    YOLO_MIN_CROP_PX, YOLO_CONF_THRESHOLD,
    DET_SIZE_PRIMARY, IOU_DEDUP_THRESHOLD,
    MIN_FACE_SIZE, MAX_FACES_PER_IMAGE, FACE_QUALITY_GATE,
    FACE_DIM, ADAFACE_DIM,
    FACE_CROP_THUMB_SIZE, FACE_CROP_QUALITY,
    FACE_CROP_PADDING, ADAFACE_CROP_PADDING,
    INFERENCE_CACHE_SIZE, ENABLE_ADAFACE, HF_TOKEN,
    USE_ONNX_VISION, ONNX_MODELS_DIR, ONNX_USE_INT8,
    ENABLE_MULTI_SCALE_FALLBACK, ENABLE_HORIZONTAL_FLIP,
    USE_SPLIT_FACE_INDEXES, FACE_BLUR_THRESHOLD,
)

# ── ArcFace 5-point reference landmarks (fixed template) ─────────────────────
# Precomputed — eliminates np.linalg.lstsq call per face (10x faster alignment)
_ARCFACE_SRC = np.array([
    [38.2946, 51.6963],
    [73.5318, 51.5014],
    [56.0252, 71.7366],
    [41.5493, 92.3655],
    [70.7299, 92.2041],
], dtype=np.float32)
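# Template semantics: left eye, right eye, nose tip, left mouth corner,
# right mouth corner, in pixel coordinates of the canonical 112x112 crop.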


def _estimate_norm_fast(lmk: np.ndarray, image_size: int = 112) -> np.ndarray:
    """
    Fast affine estimation using cv2.estimateAffinePartial2D instead of
    np.linalg.lstsq. ~10x faster on CPU. Returns a 2x3 affine matrix.
    """
    assert lmk.shape == (5, 2), f"Expected (5,2) landmarks, got {lmk.shape}"
    src = _ARCFACE_SRC * (image_size / 112.0)
    # LMEDS fits all five correspondences without RANSAC subsampling, matching
    # the least-squares behaviour of the lstsq path this replaces.
    tform, _ = cv2.estimateAffinePartial2D(lmk, src, method=cv2.LMEDS)
    if tform is None:
        # Fallback: identity crop — better than crashing.
        tform = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.float32)
    return tform


def _align_face_fast(bgr: np.ndarray, kps: np.ndarray, size: int = 112) -> np.ndarray:
    """Align face crop using the fast affine transform (replaces InsightFace's lstsq path)."""
    M = _estimate_norm_fast(kps, size)
    return cv2.warpAffine(bgr, M, (size, size), flags=cv2.INTER_LINEAR)
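
# Illustrative usage (assuming `face` is an InsightFace detection result):
#   aligned = _align_face_fast(bgr, face.kps)  # face.kps has shape (5, 2)
# `aligned` is a 112x112x3 BGR crop, ready for a recognition model.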


def _resize_pil(img: Image.Image, max_side: int = MAX_IMAGE_SIZE) -> Image.Image:
    w, h = img.size
    if max(w, h) <= max_side:
        return img
    scale = max_side / max(w, h)
    return img.resize((int(w * scale), int(h * scale)), Image.LANCZOS)


def _blur_score(bgr: np.ndarray, x1: int, y1: int, x2: int, y2: int) -> float:
    """Laplacian variance sharpness metric on a face crop. Higher = sharper."""
    crop = bgr[y1:y2, x1:x2]
    if crop.size == 0:
        return 0.0
    gray = cv2.cvtColor(crop, cv2.COLOR_BGR2GRAY)
    gray = cv2.resize(gray, (64, 64))
    return float(cv2.Laplacian(gray, cv2.CV_64F).var())


def _crop_to_b64(img_bgr: np.ndarray, x1: int, y1: int, x2: int, y2: int) -> str:
    H, W = img_bgr.shape[:2]
    w, h = x2 - x1, y2 - y1
    pad_x = int(w * FACE_CROP_PADDING)
    pad_y = int(h * FACE_CROP_PADDING)
    cx1, cy1 = max(0, x1 - pad_x), max(0, y1 - pad_y)
    cx2, cy2 = min(W, x2 + pad_x), min(H, y2 + pad_y)
    crop = img_bgr[cy1:cy2, cx1:cx2]
    if crop.size == 0:
        return ""
    # BGR → RGB; copy() gives PIL a contiguous array (the reversed view is not).
    pil = Image.fromarray(crop[:, :, ::-1].copy()).resize(
        (FACE_CROP_THUMB_SIZE, FACE_CROP_THUMB_SIZE), Image.LANCZOS
    )
    buf = io.BytesIO()
    pil.save(buf, format="JPEG", quality=FACE_CROP_QUALITY)
    return base64.b64encode(buf.getvalue()).decode()


def _face_crop_for_adaface(
    img_bgr: np.ndarray, x1: int, y1: int, x2: int, y2: int
) -> np.ndarray | None:
    H, W = img_bgr.shape[:2]
    w, h = x2 - x1, y2 - y1
    pad_x = int(w * ADAFACE_CROP_PADDING)
    pad_y = int(h * ADAFACE_CROP_PADDING)
    cx1, cy1 = max(0, x1 - pad_x), max(0, y1 - pad_y)
    cx2, cy2 = min(W, x2 + pad_x), min(H, y2 + pad_y)
    crop = img_bgr[cy1:cy2, cx1:cx2]
    if crop.size == 0:
        return None
    rgb = crop[:, :, ::-1].copy()
    pil = Image.fromarray(rgb).resize((112, 112), Image.LANCZOS)
    arr = np.array(pil, dtype=np.float32) / 255.0
    arr = (arr - 0.5) / 0.5
    return arr.transpose(2, 0, 1)  # HWC → CHW
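
# The two-step normalisation above maps uint8 pixels into [-1, 1]:
#   x/255 ∈ [0, 1], then (x - 0.5) / 0.5 ∈ [-1, 1]
# so pixel 0 → -1.0 and pixel 255 → +1.0.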


def _clahe_enhance(bgr: np.ndarray) -> np.ndarray:
    lab = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    l_ch, a_ch, b_ch = cv2.split(lab)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l_eq = clahe.apply(l_ch)
    return cv2.cvtColor(cv2.merge([l_eq, a_ch, b_ch]), cv2.COLOR_LAB2BGR)


def _iou(box_a: list, box_b: list) -> float:
    xa, ya = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    xb, yb = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, xb - xa) * max(0, yb - ya)
    if inter == 0:
        return 0.0
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter)
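
# Worked example: _iou([0, 0, 10, 10], [5, 5, 15, 15])
#   intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175 → IoU ≈ 0.143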


def _dedup_faces(faces_list: list, iou_thresh: float = IOU_DEDUP_THRESHOLD) -> list:
    if not faces_list:
        return []
    faces_list = sorted(faces_list, key=lambda f: float(f.det_score), reverse=True)
    kept = []
    for face in faces_list:
        box = face.bbox.astype(int).tolist()
        if not any(_iou(box, k.bbox.astype(int).tolist()) > iou_thresh for k in kept):
            kept.append(face)
    return kept


# ── Face crop embedding cache (bounded, keyed by crop hash) ──────────────────
# Avoids recomputing ArcFace embeddings for the same face across multiple images
# (e.g. same person appears in 20 photos — only 1 inference call needed)
_FACE_EMBED_CACHE: dict[str, np.ndarray] = {}
_FACE_EMBED_CACHE_MAX = 512
_FACE_EMBED_CACHE_LOCK = threading.Lock()


def _face_cache_get(key: str) -> np.ndarray | None:
    with _FACE_EMBED_CACHE_LOCK:
        return _FACE_EMBED_CACHE.get(key)


def _face_cache_set(key: str, vec: np.ndarray) -> None:
    with _FACE_EMBED_CACHE_LOCK:
        if len(_FACE_EMBED_CACHE) >= _FACE_EMBED_CACHE_MAX:
            # Evict the oldest-inserted entry (dicts preserve insertion order,
            # so this is FIFO rather than strict LRU).
            oldest = next(iter(_FACE_EMBED_CACHE))
            del _FACE_EMBED_CACHE[oldest]
        _FACE_EMBED_CACHE[key] = vec


def _crop_hash(crop_bgr: np.ndarray) -> str:
    """Fast hash of face crop pixels for cache lookup."""
    return hashlib.md5(crop_bgr.tobytes()).hexdigest()
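
# Illustrative round trip (compute_embedding is a hypothetical stand-in for
# whichever model produces the vector):
#   key = _crop_hash(crop)
#   if (vec := _face_cache_get(key)) is None:
#       vec = compute_embedding(crop)
#       _face_cache_set(key, vec)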


class AIModelManager:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Vision stack
        self.onnx_vision = None
        if USE_ONNX_VISION:
            try:
                from src.services.onnx_models import ONNXVisionStack
                self.onnx_vision = ONNXVisionStack(
                    ONNX_MODELS_DIR, use_int8=bool(ONNX_USE_INT8)
                )
                print(f"[AIModelManager] ONNX vision loaded (INT8={ONNX_USE_INT8})")
            except Exception as e:
                print(f"[AIModelManager] ONNX failed ({e}), using PyTorch fallback")
                self.onnx_vision = None
        if self.onnx_vision is None:
            self.siglip_processor = AutoProcessor.from_pretrained(
                "google/siglip-base-patch16-224", use_fast=True
            )
            self.siglip_model = AutoModel.from_pretrained(
                "google/siglip-base-patch16-224"
            ).to(self.device).eval()
            self.dinov2_processor = AutoImageProcessor.from_pretrained(
                "facebook/dinov2-base", use_fast=True
            )
            self.dinov2_model = AutoModel.from_pretrained(
                "facebook/dinov2-base"
            ).to(self.device).eval()
            if self.device == "cuda":
                self.siglip_model = self.siglip_model.half()
                self.dinov2_model = self.dinov2_model.half()

        # YOLO
        self.yolo = YOLO("yolo11n-seg.pt")

        # Face detection + ArcFace
        self.face_app = FaceAnalysis(
            name="buffalo_l",
            providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
            if self.device == "cuda" else ["CPUExecutionProvider"],
        )
        self.face_app.prepare(
            ctx_id=0 if self.device == "cuda" else -1, det_size=DET_SIZE_PRIMARY
        )
        # Warm-up call so the first real request doesn't pay session
        # initialisation costs.
        self.face_app.get(np.zeros((112, 112, 3), dtype=np.uint8))

        # AdaFace
        self.adaface_model = None
        self._load_adaface()

        self._face_lock = threading.Lock()
        self._cache_lock = threading.Lock()
        self._cache: dict[str, list] = {}

        # Thread pool for parallel ArcFace + AdaFace inference.
        # 2 workers = one per model, matches 2 vCPU on HF free tier.
        self._embed_pool = concurrent.futures.ThreadPoolExecutor(
            max_workers=2, thread_name_prefix="embed"
        )

    def _load_adaface(self) -> None:
        if not ENABLE_ADAFACE:
            return
        import sys

        REPO_ID = "minchul/cvlface_adaface_ir50_ms1mv2"
        CACHE_PATH = os.path.expanduser(
            "~/.cvlface_cache/minchul/cvlface_adaface_ir50_ms1mv2"
        )
        try:
            from huggingface_hub import hf_hub_download
            from transformers import AutoModel as _HFAutoModel

            os.makedirs(CACHE_PATH, exist_ok=True)
            hf_hub_download(
                repo_id=REPO_ID, filename="files.txt", token=HF_TOKEN,
                local_dir=CACHE_PATH, local_dir_use_symlinks=False,
            )
            with open(os.path.join(CACHE_PATH, "files.txt")) as f:
                extra = [x.strip() for x in f.read().split("\n") if x.strip()]
            for fname in extra + ["config.json", "wrapper.py", "model.safetensors"]:
                if not os.path.exists(os.path.join(CACHE_PATH, fname)):
                    hf_hub_download(
                        repo_id=REPO_ID, filename=fname, token=HF_TOKEN,
                        local_dir=CACHE_PATH, local_dir_use_symlinks=False,
                    )
            # Instantiate from inside the snapshot dir: the repo's remote code
            # resolves its files relative to the working directory.
            cwd = os.getcwd()
            os.chdir(CACHE_PATH)
            sys.path.insert(0, CACHE_PATH)
            try:
                model = _HFAutoModel.from_pretrained(
                    CACHE_PATH, trust_remote_code=True, token=HF_TOKEN,
                    low_cpu_mem_usage=False,
                )
            finally:
                os.chdir(cwd)
                if CACHE_PATH in sys.path:
                    sys.path.remove(CACHE_PATH)
            self.adaface_model = model.to(self.device).eval()
        except Exception as _ada_err:
            import traceback as _tb
            print(f"[CRITICAL] AdaFace failed to load — system will run at degraded recall: {_ada_err}")
            _tb.print_exc()
            self.adaface_model = None

    # ── FIX 1: AdaFace batch embed (unchanged — already correct) ─────────────
    def _adaface_embed_batch(
        self, face_arrs_chw: list[np.ndarray | None]
    ) -> list[np.ndarray | None]:
        if self.adaface_model is None:
            return [None] * len(face_arrs_chw)
        valid_idx = [i for i, a in enumerate(face_arrs_chw) if a is not None]
        if not valid_idx:
            return [None] * len(face_arrs_chw)
        batch = np.stack([face_arrs_chw[i] for i in valid_idx], axis=0)
        batch = np.ascontiguousarray(batch)
        try:
            t = torch.from_numpy(batch).contiguous().to(self.device)
            if self.device == "cuda":
                t = t.half()
            with torch.no_grad():
                out = self.adaface_model(t)
            emb = out if isinstance(out, torch.Tensor) else out.embedding
            emb = F.normalize(emb.float(), p=2, dim=1).cpu().numpy()
        except Exception as e:
            import traceback
            print(f"[AdaFace ERROR] {e}")
            traceback.print_exc()
            return [None] * len(face_arrs_chw)
        # Scatter embeddings back to their original positions; skipped crops stay None.
        result: list[np.ndarray | None] = [None] * len(face_arrs_chw)
        for out_i, in_i in enumerate(valid_idx):
            result[in_i] = emb[out_i]
        return result

    # ── FIX 2: ArcFace batch embed with crop-hash caching ────────────────────
    def _arcface_embed_batch(
        self, faces: list, bgr: np.ndarray
    ) -> list[np.ndarray]:
        """
        Returns an L2-normalised ArcFace embedding for every face.

        The embeddings were already computed by InsightFace's get() call
        during detection, so this path never re-runs the recognition model;
        it only normalises and caches. The face-crop cache is checked first,
        so the same person appearing in 20 photos resolves to one stored
        vector. (The fast-alignment helpers _estimate_norm_fast /
        _align_face_fast replace InsightFace's np.linalg.lstsq path for any
        caller that does need to re-align and re-embed a crop from raw
        landmarks; they are ~10x faster per face on CPU.)
        """
        results = []
        for face in faces:
            bbox = face.bbox.astype(int)
            x1, y1, x2, y2 = bbox
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(bgr.shape[1], x2), min(bgr.shape[0], y2)
            raw_crop = bgr[y1:y2, x1:x2]
            ch = _crop_hash(raw_crop) if raw_crop.size > 0 else ""
            if ch:
                cached_vec = _face_cache_get(ch)
                if cached_vec is not None:
                    results.append(cached_vec)
                    continue
            vec = (face.embedding.astype(np.float32) if face.embedding is not None
                   else np.zeros(FACE_DIM, dtype=np.float32))
            n = np.linalg.norm(vec)
            vec = vec / n if n > 0 else vec
            if ch:
                _face_cache_set(ch, vec)
            results.append(vec)
        return results

    def _embed_crops_batch(self, crops: list[Image.Image]) -> list[np.ndarray]:
        if not crops:
            return []
        if self.onnx_vision is not None:
            return self.onnx_vision.encode(crops)
        with torch.no_grad():
            sig_in = self.siglip_processor(images=crops, return_tensors="pt", padding=True)
            sig_in = {k: v.to(self.device) for k, v in sig_in.items()}
            if self.device == "cuda":
                sig_in = {k: v.half() if v.dtype == torch.float32 else v for k, v in sig_in.items()}
            sig_out = self.siglip_model.get_image_features(**sig_in)
            # get_image_features may return a tensor or a model-output object
            # depending on the transformers version; normalise to a tensor.
            if hasattr(sig_out, "image_embeds"):
                sig_out = sig_out.image_embeds
            elif hasattr(sig_out, "pooler_output"):
                sig_out = sig_out.pooler_output
            elif hasattr(sig_out, "last_hidden_state"):
                sig_out = sig_out.last_hidden_state[:, 0, :]
            elif isinstance(sig_out, tuple):
                sig_out = sig_out[0]
            sig_vecs = F.normalize(sig_out.float(), p=2, dim=1).cpu()

            dino_in = self.dinov2_processor(images=crops, return_tensors="pt")
            dino_in = {k: v.to(self.device) for k, v in dino_in.items()}
            if self.device == "cuda":
                dino_in = {k: v.half() if v.dtype == torch.float32 else v for k, v in dino_in.items()}
            dino_out = self.dinov2_model(**dino_in)
            # The CLS token of the last hidden state is DINOv2's global descriptor.
            dino_vecs = F.normalize(dino_out.last_hidden_state[:, 0, :].float(), p=2, dim=1).cpu()

        fused = F.normalize(torch.cat([sig_vecs, dino_vecs], dim=1), p=2, dim=1)
        return [fused[i].numpy() for i in range(len(crops))]
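
    # With the base checkpoints loaded in __init__, SigLIP and DINOv2 each emit
    # 768-dim vectors, so the fused object embedding is 768 + 768 = 1536 dims,
    # L2-normalised as a whole.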

    def _run_detection_at_scale(
        self, bgr_enhanced: np.ndarray, scale: tuple
    ) -> list:
        H, W = bgr_enhanced.shape[:2]
        # Preserve aspect ratio when downscaling. The previous code clamped each
        # dim independently, which squashed wide images (e.g. 4032x1816 → 640x640)
        # and produced distorted face crops whose embeddings would not match the
        # same person shot in a normal aspect ratio.
        #
        # NOTE: We keep `input_size` set to the original square `scale`. InsightFace
        # SCRFD internally letterboxes the image into the input_size canvas while
        # preserving aspect ratio — so feeding a (640, 360) image with input_size
        # (640, 640) results in a properly padded 640x640 detector input. The
        # square input_size also matches the ONNX model's expected shape.
        target_max = max(scale[0], scale[1])
        long_side = max(W, H)
        if long_side <= target_max:
            bgr_scaled = bgr_enhanced
            scale_w, scale_h = W, H
        else:
            ratio = target_max / long_side
            scale_w = max(1, int(round(W * ratio)))
            scale_h = max(1, int(round(H * ratio)))
            bgr_scaled = cv2.resize(bgr_enhanced, (scale_w, scale_h))
        try:
            with self._face_lock:
                # input_size must be set inside the lock — setting it outside
                # is a race condition when two inference threads run concurrently,
                # causing the wrong scale to be used and faces to be missed.
                self.face_app.det_model.input_size = scale
                faces_at_scale = self.face_app.get(bgr_scaled)
            # Map boxes and landmarks back to original-image coordinates.
            sx, sy = W / scale_w, H / scale_h
            for f in faces_at_scale:
                if sx != 1.0 or sy != 1.0:
                    f.bbox[0] *= sx
                    f.bbox[1] *= sy
                    f.bbox[2] *= sx
                    f.bbox[3] *= sy
                    if hasattr(f, 'kps') and f.kps is not None:
                        f.kps[:, 0] *= sx
                        f.kps[:, 1] *= sy
            return faces_at_scale
        except Exception:
            # A failure at one scale is non-fatal; callers treat an empty
            # list as "no faces at this scale".
            return []

    def _detect_and_encode_faces(self, img_np: np.ndarray) -> list[dict]:
        """
        Returns face records with BOTH arcface_vector and adaface_vector.

        FIX 3 — ArcFace + AdaFace run in PARALLEL using the thread pool.
        Previously they ran sequentially. On 2 vCPU this gives ~1.5x speedup
        since each model can use a separate core simultaneously.
        """
        if self.face_app is None:
            return []
        try:
            if img_np.dtype != np.uint8:
                img_np = (img_np * 255).astype(np.uint8)
            # PIL delivers RGB; the detector and OpenCV helpers expect BGR.
            bgr = img_np[:, :, ::-1].copy() if img_np.shape[2] == 3 else img_np.copy()
            bgr_enhanced = _clahe_enhance(bgr)
            H, W = bgr.shape[:2]

            all_raw_faces = self._run_detection_at_scale(bgr_enhanced, DET_SIZE_PRIMARY)
            if not all_raw_faces and ENABLE_MULTI_SCALE_FALLBACK:
                for scale in [(1280, 1280), (960, 960)]:
                    more = self._run_detection_at_scale(bgr_enhanced, scale)
                    all_raw_faces.extend(more)
                    if more:
                        break
            if ENABLE_HORIZONTAL_FLIP:
                bgr_flip = cv2.flip(bgr_enhanced, 1)
                # Reuse the aspect-ratio-preserving scaler so flipped detection
                # also avoids the wide-image squash.
                faces_flip = self._run_detection_at_scale(bgr_flip, DET_SIZE_PRIMARY)
                for f in faces_flip:
                    x1, y1, x2, y2 = f.bbox
                    # Mirror coordinates back into the unflipped frame.
                    f.bbox[0], f.bbox[2] = W - x2, W - x1
                    if hasattr(f, 'kps') and f.kps is not None:
                        f.kps[:, 0] = W - f.kps[:, 0]
                all_raw_faces.extend(faces_flip)
            # Restore the primary detector scale under the lock (see the race
            # note in _run_detection_at_scale).
            with self._face_lock:
                self.face_app.det_model.input_size = DET_SIZE_PRIMARY

            faces = _dedup_faces(all_raw_faces)

            filtered_faces = []
            adaface_crops: list[np.ndarray | None] = []
            for face in faces:
                if len(filtered_faces) >= MAX_FACES_PER_IMAGE:
                    break
                bbox_raw = face.bbox.astype(int)
                x1, y1, x2, y2 = bbox_raw
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(bgr.shape[1], x2), min(bgr.shape[0], y2)
                w, h = x2 - x1, y2 - y1
                if w < MIN_FACE_SIZE or h < MIN_FACE_SIZE:
                    continue
                det_score = float(face.det_score) if hasattr(face, "det_score") else 1.0
                if det_score < FACE_QUALITY_GATE or face.embedding is None:
                    continue
                blur = _blur_score(bgr, x1, y1, x2, y2)
                filtered_faces.append((face, x1, y1, x2, y2, w, h, det_score, blur))
                adaface_crops.append(_face_crop_for_adaface(bgr, x1, y1, x2, y2))
            if not filtered_faces:
                return []

            # ── FIX 3: Run ArcFace + AdaFace in PARALLEL ─────────────────────
            # Submit both to the thread pool simultaneously.
            # On 2 vCPU: total time ≈ max(arcface_time, adaface_time)
            # instead of arcface_time + adaface_time.
            face_objs = [f[0] for f in filtered_faces]
            arc_future = self._embed_pool.submit(
                self._arcface_embed_batch, face_objs, bgr
            )
            ada_future = self._embed_pool.submit(
                self._adaface_embed_batch, adaface_crops
            )
            # Wait for both — .result() blocks until each task is done.
            arcface_vecs = arc_future.result()
            adaface_vecs = ada_future.result()

            results = []
            for accepted, (face_tuple, arcface_vec, adaface_vec) in enumerate(
                zip(filtered_faces, arcface_vecs, adaface_vecs)
            ):
                face, x1, y1, x2, y2, w, h, det_score, blur_score = face_tuple
                out = {
                    "type": "face",
                    "face_idx": accepted,
                    "bbox": [int(x1), int(y1), int(w), int(h)],
                    "face_crop": _crop_to_b64(bgr, x1, y1, x2, y2),
                    "det_score": det_score,
                    "face_width_px": int(w),
                    "blur_score": blur_score,
                    "arcface_vector": arcface_vec,
                    "adaface_vector": adaface_vec if adaface_vec is not None
                    else np.zeros(ADAFACE_DIM, dtype=np.float32),
                    "has_adaface": adaface_vec is not None,
                }
                if not USE_SPLIT_FACE_INDEXES:
                    # Fused index: concatenate ArcFace + AdaFace (zero-padded
                    # when AdaFace is unavailable) and re-normalise.
                    if adaface_vec is not None:
                        fused_raw = np.concatenate([arcface_vec, adaface_vec])
                    else:
                        fused_raw = np.concatenate(
                            [arcface_vec, np.zeros(ADAFACE_DIM, dtype=np.float32)]
                        )
                    n2 = np.linalg.norm(fused_raw)
                    out["vector"] = (fused_raw / n2) if n2 > 0 else fused_raw
                else:
                    out["vector"] = arcface_vec
                results.append(out)
            return results
        except Exception as _det_err:
            import traceback as _tb
            print(f"[_detect_and_encode_faces ERROR] shape={getattr(img_np, 'shape', 'N/A')}: {_det_err}")
            _tb.print_exc()
            return []

    # ── Main inference entry point ────────────────────────────────────────────
    def process_image_bytes(
        self, image_bytes: bytes, detect_faces: bool = True
    ) -> list[dict]:
        file_hash = hashlib.md5(image_bytes).hexdigest()
        cache_key = f"{file_hash}_{detect_faces}"
        with self._cache_lock:
            if cache_key in self._cache:
                return list(self._cache[cache_key])

        extracted = []
        original_pil = Image.open(io.BytesIO(image_bytes))
        # Apply EXIF orientation before anything else. Pillow does NOT do this
        # automatically — a portrait phone shot stored as landscape with a
        # rotation tag would feed sideways pixels to the face detector.
        original_pil = ImageOps.exif_transpose(original_pil)
        original_pil = original_pil.convert("RGB")
        img_np = np.array(original_pil)

        faces_found = False
        if detect_faces and self.face_app is not None:
            face_results = self._detect_and_encode_faces(img_np)
            if face_results:
                faces_found = True
                extracted.extend(face_results)

        crops: list[Image.Image] = []
        yolo_results = self.yolo(original_pil, conf=YOLO_CONF_THRESHOLD, verbose=False)
        for r in yolo_results:
            if r.masks is not None:
                for seg_idx, mask_xy in enumerate(r.masks.xy):
                    cls_id = int(r.boxes.cls[seg_idx].item())
                    # Skip person crops when faces were already extracted;
                    # the face vectors cover them.
                    if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
                        continue
                    polygon = np.array(mask_xy, dtype=np.int32)
                    if len(polygon) < 3:
                        continue
                    x, y, w, h = cv2.boundingRect(polygon)
                    if w < YOLO_MIN_CROP_PX or h < YOLO_MIN_CROP_PX:
                        continue
                    crops.append(original_pil.crop((x, y, x + w, y + h)))
                    if len(crops) >= MAX_CROPS:
                        break
            elif r.boxes is not None:
                for box in r.boxes:
                    cls_id = int(box.cls.item())
                    if faces_found and cls_id == YOLO_PERSON_CLASS_ID:
                        continue
                    x1, y1, x2, y2 = box.xyxy[0].tolist()
                    if (x2 - x1) < YOLO_MIN_CROP_PX or (y2 - y1) < YOLO_MIN_CROP_PX:
                        continue
                    crops.append(original_pil.crop((x1, y1, x2, y2)))
                    if len(crops) >= MAX_CROPS:
                        break

        # Embed the full frame plus every object crop in one batch.
        all_crops = [_resize_pil(c, MAX_IMAGE_SIZE) for c in [original_pil] + crops]
        obj_vecs = self._embed_crops_batch(all_crops)
        extracted.extend({"type": "object", "vector": v} for v in obj_vecs)

        with self._cache_lock:
            if len(self._cache) >= INFERENCE_CACHE_SIZE:
                oldest = next(iter(self._cache))
                del self._cache[oldest]
            self._cache[cache_key] = list(extracted)
        return extracted

    async def process_image_bytes_async(
        self, image_bytes: bytes, detect_faces: bool = True
    ) -> list[dict]:
        # get_running_loop() is the correct call inside a coroutine;
        # get_event_loop() is deprecated in that context.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            functools.partial(self.process_image_bytes, image_bytes, detect_faces),
        )
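

# ── Illustrative usage (a minimal sketch, not part of the service) ───────────
# Assumes a local test image; "sample.jpg" is a hypothetical path.
if __name__ == "__main__":
    manager = AIModelManager()
    with open("sample.jpg", "rb") as fh:
        records = manager.process_image_bytes(fh.read())
    for rec in records:
        if rec["type"] == "face":
            print("face", rec["face_idx"], "score", rec["det_score"], rec["vector"].shape)
        else:
            print("object", rec["vector"].shape)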