meaculpitt
/

ScoreVision

@@ -1,5 +1,5 @@
 """
-Score Vision SN44 — Unified miner v3.5 (2026-04-02).
 Dual-model: vehicle (YOLO11m INT8 1280) + person (YOLO26s FP16 960 end2end).
 Vehicle weights loaded from secondary HF repo (meaculpitt/ScoreVision-Vehicle).
 Person weights loaded from primary HF repo (template downloads automatically).
@@ -11,6 +11,7 @@ Vehicle model (vehicle_weights.onnx):
 Person model (person_weights.onnx):
   YOLO26s FP16 960px end2end [1,300,6]. Single class: 0=person.
 Both models run on every image. All detections merged.
 Vehicle cls_id=4 (bus) filtered by validator (out of range for both elements).
@@ -187,6 +188,12 @@ PER_MIN_AREA = 14 * 14
 PER_MAX_ASPECT = 6.0
 PER_MAX_AREA_RATIO = 0.80
 # ── Shared ──────────────────────────────────────────────────────────────────
 WBF_SKIP_THR = 0.0001
@@ -612,82 +619,156 @@ class Miner:
         raw = self.per_session.run(None, {self.per_input_name: inp})[0]
         return self._per_decode(raw, ratio, pl, pt, oh, ow, conf_thresh)
     def _infer_person(self, image_bgr):
         oh, ow = image_bgr.shape[:2]
         t_start = time.monotonic()
-        boxes_orig, confs_orig = self._per_run_pass(image_bgr, PER_CONF_LOW)
-        elapsed = time.monotonic() - t_start
-        has_flip = False
-        if elapsed < PER_RTF_BUDGET / 2:
             flipped = cv2.flip(image_bgr, 1)
             boxes_flip, confs_flip = self._per_run_pass(flipped, PER_CONF_LOW)
-            if len(boxes_flip):
-                boxes_flip[:, 0], boxes_flip[:, 2] = ow - boxes_flip[:, 2], ow - boxes_flip[:, 0]
-            has_flip = True
-        else:
-            logger.warning(f"[person TTA] skipping flip — pass1 took {elapsed*1000:.0f}ms")
-        kept_b, kept_s = [], []
-        if not has_flip or len(boxes_flip) == 0:
-            # No flip available — keep all original boxes
-            for i in range(len(boxes_orig)):
-                kept_b.append(boxes_orig[i])
-                kept_s.append(float(confs_orig[i]))
-        else:
-            # Consensus TTA merge
-            used_flip = set()
-            for i in range(len(boxes_orig)):
-                # Compute IoU with all flip boxes
-                xx1 = np.maximum(boxes_orig[i, 0], boxes_flip[:, 0])
-                yy1 = np.maximum(boxes_orig[i, 1], boxes_flip[:, 1])
-                xx2 = np.minimum(boxes_orig[i, 2], boxes_flip[:, 2])
-                yy2 = np.minimum(boxes_orig[i, 3], boxes_flip[:, 3])
-                inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
-                a1 = (boxes_orig[i, 2] - boxes_orig[i, 0]) * (boxes_orig[i, 3] - boxes_orig[i, 1])
-                a2 = (boxes_flip[:, 2] - boxes_flip[:, 0]) * (boxes_flip[:, 3] - boxes_flip[:, 1])
-                iou = inter / (a1 + a2 - inter + 1e-9)
-                best_j = int(np.argmax(iou))
-                best_iou = float(iou[best_j])
-                if confs_orig[i] >= PER_CONF_HIGH:
-                    # High-conf original: auto-accept, boost with flip match
-                    score = float(confs_orig[i])
-                    if best_iou >= PER_CONSENSUS_IOU:
-                        score = max(score, float(confs_flip[best_j]))
-                        used_flip.add(best_j)
-                    kept_b.append(boxes_orig[i])
-                    kept_s.append(score)
-                else:
-                    # Low-conf original: need confirmation from flip view
-                    if best_iou >= PER_CONSENSUS_IOU:
-                        score = max(float(confs_orig[i]), float(confs_flip[best_j]))
-                        used_flip.add(best_j)
-                        kept_b.append(boxes_orig[i])
-                        kept_s.append(score)
-                    # else: unconfirmed low-conf → drop
-            # Add unmatched high-conf flip boxes
-            for j in range(len(boxes_flip)):
-                if j not in used_flip and confs_flip[j] >= PER_CONF_HIGH:
-                    kept_b.append(boxes_flip[j])
-                    kept_s.append(float(confs_flip[j]))
-        if not kept_b:
             return []
-        kept_b = np.array(kept_b)
-        kept_s = np.array(kept_s)
-        # Sanity filters (consensus merge handles dedup — no WBF needed)
         img_area = float(oh * ow)
-        sane = []
-        for i in range(len(kept_b)):
-            bw = kept_b[i, 2] - kept_b[i, 0]
-            bh = kept_b[i, 3] - kept_b[i, 1]
             if bw < PER_MIN_WH or bh < PER_MIN_WH:
                 continue
             area = bw * bh
@@ -697,23 +778,14 @@ class Miner:
                 continue
             if area / img_area > PER_MAX_AREA_RATIO:
                 continue
-            sane.append(i)
-        if not sane:
-            return []
-        kept_b = kept_b[sane]
-        kept_s = kept_s[sane]
-        out = []
-        for i in range(len(kept_b)):
-            b = kept_b[i]
             out.append(BoundingBox(
                 x1=max(0, min(ow, int(b[0]))),
                 y1=max(0, min(oh, int(b[1]))),
                 x2=max(0, min(ow, int(b[2]))),
                 y2=max(0, min(oh, int(b[3]))),
                 cls_id=0,
-                conf=max(0.0, min(1.0, kept_s[i])),
             ))
         return out

 """
+Score Vision SN44 — Unified miner v3.7 (2026-04-02). SAHI-style tiled person inference.
 Dual-model: vehicle (YOLO11m INT8 1280) + person (YOLO26s FP16 960 end2end).
 Vehicle weights loaded from secondary HF repo (meaculpitt/ScoreVision-Vehicle).
 Person weights loaded from primary HF repo (template downloads automatically).
 Person model (person_weights.onnx):
   YOLO26s FP16 960px end2end [1,300,6]. Single class: 0=person.
+  SAHI-style tiling: full + 2 adaptive tiles + flip TTA, max-conf NMS merge.
 Both models run on every image. All detections merged.
 Vehicle cls_id=4 (bus) filtered by validator (out of range for both elements).
 PER_MAX_ASPECT = 6.0
 PER_MAX_AREA_RATIO = 0.80
+# ── Person tiling config (SAHI-inspired) ────────────────────────────────────
+# NOTE: PER_TILE_OVERLAP is applied to the full image dimension and BOTH tiles
+# extend past the split line by that amount, so the band shared by the two
+# tiles is twice this fraction (0.20 -> 40% of the image width/height).
+PER_TILE_OVERLAP = 0.20          # per-tile extension past the split, as a fraction of the image dim
+PER_TILE_MIN_DIM_RATIO = 1.15   # tile when image dim > model_dim * this (~1104px for 960 model)
+PER_TILE_CONF = 0.40            # conf threshold for tile passes (NMS handles FP); presumably lower than the full-image threshold — TODO confirm against PER_CONF_LOW
+PER_NMS_IOU = 0.50              # NMS IoU for merging across passes (max-conf wins)
 # ── Shared ──────────────────────────────────────────────────────────────────
 WBF_SKIP_THR = 0.0001
         raw = self.per_session.run(None, {self.per_input_name: inp})[0]
         return self._per_decode(raw, ratio, pl, pt, oh, ow, conf_thresh)
+    def _generate_tiles(self, h, w):
+        """Build the list of crop regions used for the person passes.
+
+        The first entry is always the full image ``(0, 0, w, h)``. Two extra
+        tiles are appended only when ``max(h, w)`` exceeds
+        ``max(self.per_h, self.per_w) * PER_TILE_MIN_DIM_RATIO``:
+
+        * ``w >= h`` (landscape or square): a left and a right tile, split at
+          ``w // 2``.
+        * ``w < h`` (portrait): a top and a bottom tile, split at 45% of the
+          height rather than 50%. Stated intent: avoid cutting through people
+          standing in the lower part of the frame.
+
+        Each tile is extended past the split line by
+        ``int(dim * PER_TILE_OVERLAP)`` pixels, so the band shared by the two
+        tiles is twice that wide.
+
+        Args:
+            h: Image height in pixels.
+            w: Image width in pixels.
+
+        Returns:
+            List of ``(x1, y1, x2, y2)`` tuples in image pixel coordinates,
+            containing either one entry (full image) or three (full + 2 tiles).
+        """
+        tiles = [(0, 0, w, h)]  # full image always first
+        # Only tile if image significantly exceeds model input
+        if max(h, w) <= max(self.per_h, self.per_w) * PER_TILE_MIN_DIM_RATIO:
+            return tiles
+        # Overlap is measured against the whole image dimension, not the tile.
+        overlap_px_x = int(w * PER_TILE_OVERLAP)
+        overlap_px_y = int(h * PER_TILE_OVERLAP)
+        if w >= h:
+            # Landscape: 2 side-by-side tiles (left + right), full height
+            mid = w // 2
+            tiles.append((0, 0, mid + overlap_px_x, h))
+            tiles.append((mid - overlap_px_x, 0, w, h))
+        else:
+            # Portrait: 2 stacked tiles (top + bottom), full width
+            # Edge-aware: bias split toward upper portion (people stand at bottom)
+            mid = int(h * 0.45)  # split at 45% height, not 50%
+            tiles.append((0, 0, w, mid + overlap_px_y))
+            tiles.append((0, mid - overlap_px_y, w, h))
+        return tiles
+    def _per_run_tile(self, image_bgr, tile_region, conf_thresh):
+        """Run the person model on one tile crop and return full-image boxes.
+
+        Args:
+            image_bgr: Full image array, sliced as ``[y1:y2, x1:x2]``.
+            tile_region: ``(x1, y1, x2, y2)`` crop bounds in image pixels.
+            conf_thresh: Confidence threshold forwarded to ``_per_run_pass``.
+
+        Returns:
+            ``(boxes, confs)``. ``boxes`` has columns 0/2 offset by the tile's
+            x origin and columns 1/3 by its y origin, i.e. it is expressed in
+            full-image coordinates. When the pass yields nothing, an empty
+            ``(0, 4)`` array and an empty 1-D array are returned.
+        """
+        x1t, y1t, x2t, y2t = tile_region
+        crop = image_bgr[y1t:y2t, x1t:x2t]
+        boxes, confs = self._per_run_pass(crop, conf_thresh)
+        if len(boxes) == 0:
+            return np.empty((0, 4)), np.empty(0)
+        # Shift back to original image coordinates.
+        # The array returned by _per_run_pass is modified in place here.
+        boxes[:, 0] += x1t
+        boxes[:, 1] += y1t
+        boxes[:, 2] += x1t
+        boxes[:, 3] += y1t
+        return boxes, confs
+    @staticmethod
+    def _nms_max_conf(boxes, scores, iou_thr):
+        """Greedy NMS: the highest-scoring box of each overlapping group wins.
+
+        Boxes are visited in descending score order. Every box that has not
+        been suppressed is kept unchanged (coordinates and score are not
+        averaged), and every later box whose IoU with it is ``>= iou_thr`` is
+        suppressed. Author's rationale: unlike WBF, which averages scores,
+        this preserves sharp confidence values for FP scoring.
+
+        Args:
+            boxes: NumPy array of ``(x1, y1, x2, y2)`` rows.
+            scores: NumPy array with one score per row of ``boxes``.
+            iou_thr: IoU at or above which a lower-scoring box is dropped.
+
+        Returns:
+            ``(kept_boxes, kept_scores)`` as NumPy arrays ordered by
+            descending score; an empty ``(0, 4)`` array and an empty 1-D
+            array when ``boxes`` is empty.
+        """
+        if len(boxes) == 0:
+            return np.empty((0, 4)), np.empty(0)
+        # Sort by confidence descending
+        order = np.argsort(-scores)
+        boxes = boxes[order]
+        scores = scores[order]
+        keep_b, keep_s = [], []
+        suppressed = set()
+        for i in range(len(boxes)):
+            if i in suppressed:
+                continue
+            keep_b.append(boxes[i])
+            keep_s.append(scores[i])
+            # Suppress lower-conf overlapping boxes (pairwise scan over the
+            # remaining, not-yet-suppressed boxes)
+            for j in range(i + 1, len(boxes)):
+                if j in suppressed:
+                    continue
+                xx1 = max(boxes[i, 0], boxes[j, 0])
+                yy1 = max(boxes[i, 1], boxes[j, 1])
+                xx2 = min(boxes[i, 2], boxes[j, 2])
+                yy2 = min(boxes[i, 3], boxes[j, 3])
+                # Clamp to 0 so disjoint boxes give zero intersection
+                inter = max(0, xx2 - xx1) * max(0, yy2 - yy1)
+                a1 = (boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
+                a2 = (boxes[j, 2] - boxes[j, 0]) * (boxes[j, 3] - boxes[j, 1])
+                # 1e-9 guards against division by zero for degenerate boxes
+                iou = inter / (a1 + a2 - inter + 1e-9)
+                if iou >= iou_thr:
+                    suppressed.add(j)
+        return np.array(keep_b), np.array(keep_s)
     def _infer_person(self, image_bgr):
+        """Person detection with SAHI-inspired tiled inference.
+        Pipeline:
+        1. Full-image pass (catches large/medium people, low effective resolution)
+        2. 2 tiled passes (higher effective resolution for small/distant people)
+        3. Flip TTA pass if time budget allows
+        4. Max-confidence NMS merge (preserves sharp scores for FP scoring)
+        5. Sanity filters
+        """
         oh, ow = image_bgr.shape[:2]
         t_start = time.monotonic()
+        # Collect all boxes in original pixel coords
+        all_boxes = []   # list of [N, 4] arrays
+        all_confs = []   # list of [N] arrays
+        # Pass 1: full image
+        boxes_full, confs_full = self._per_run_pass(image_bgr, PER_CONF_LOW)
+        if len(boxes_full) > 0:
+            all_boxes.append(boxes_full)
+            all_confs.append(confs_full)
+        elapsed_pass1 = time.monotonic() - t_start
+        # Pass 2-3: tiled passes
+        tiles = self._generate_tiles(oh, ow)
+        if len(tiles) > 1 and elapsed_pass1 < PER_RTF_BUDGET / 4:
+            for tile_region in tiles[1:]:
+                if time.monotonic() - t_start > PER_RTF_BUDGET * 0.6:
+                    break
+                boxes_t, confs_t = self._per_run_tile(
+                    image_bgr, tile_region, PER_TILE_CONF)
+                if len(boxes_t) > 0:
+                    all_boxes.append(boxes_t)
+                    all_confs.append(confs_t)
+        # Pass 4: flip TTA if time allows
+        if time.monotonic() - t_start < PER_RTF_BUDGET / 4:
             flipped = cv2.flip(image_bgr, 1)
             boxes_flip, confs_flip = self._per_run_pass(flipped, PER_CONF_LOW)
+            if len(boxes_flip) > 0:
+                boxes_flip[:, 0], boxes_flip[:, 2] = (
+                    ow - boxes_flip[:, 2], ow - boxes_flip[:, 0])
+                all_boxes.append(boxes_flip)
+                all_confs.append(confs_flip)
+        if not all_boxes:
             return []
+        # Merge all detections with max-confidence NMS
+        merged_b = np.concatenate(all_boxes)
+        merged_s = np.concatenate(all_confs)
+        merged_b, merged_s = self._nms_max_conf(merged_b, merged_s, PER_NMS_IOU)
+        if len(merged_b) == 0:
+            return []
+        # Sanity filters
         img_area = float(oh * ow)
+        out = []
+        for i in range(len(merged_b)):
+            bw = merged_b[i, 2] - merged_b[i, 0]
+            bh = merged_b[i, 3] - merged_b[i, 1]
             if bw < PER_MIN_WH or bh < PER_MIN_WH:
                 continue
             area = bw * bh
                 continue
             if area / img_area > PER_MAX_AREA_RATIO:
                 continue
+            b = merged_b[i]
             out.append(BoundingBox(
                 x1=max(0, min(ow, int(b[0]))),
                 y1=max(0, min(oh, int(b[1]))),
                 x2=max(0, min(ow, int(b[2]))),
                 y2=max(0, min(oh, int(b[3]))),
                 cls_id=0,
+                conf=max(0.0, min(1.0, float(merged_s[i]))),
             ))
         return out