meaculpitt committed on
Commit
65b9551
·
verified ·
1 Parent(s): ef72a08

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +149 -77
miner.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Score Vision SN44 — Unified miner v3.5 (2026-04-02).
3
  Dual-model: vehicle (YOLO11m INT8 1280) + person (YOLO26s FP16 960 end2end).
4
  Vehicle weights loaded from secondary HF repo (meaculpitt/ScoreVision-Vehicle).
5
  Person weights loaded from primary HF repo (template downloads automatically).
@@ -11,6 +11,7 @@ Vehicle model (vehicle_weights.onnx):
11
 
12
  Person model (person_weights.onnx):
13
  YOLO26s FP16 960px end2end [1,300,6]. Single class: 0=person.
 
14
 
15
  Both models run on every image. All detections merged.
16
  Vehicle cls_id=4 (bus) filtered by validator (out of range for both elements).
@@ -187,6 +188,12 @@ PER_MIN_AREA = 14 * 14
187
  PER_MAX_ASPECT = 6.0
188
  PER_MAX_AREA_RATIO = 0.80
189
 
 
 
 
 
 
 
190
  # ── Shared ──────────────────────────────────────────────────────────────────
191
  WBF_SKIP_THR = 0.0001
192
 
@@ -612,82 +619,156 @@ class Miner:
612
  raw = self.per_session.run(None, {self.per_input_name: inp})[0]
613
  return self._per_decode(raw, ratio, pl, pt, oh, ow, conf_thresh)
614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  def _infer_person(self, image_bgr):
 
 
 
 
 
 
 
 
 
616
  oh, ow = image_bgr.shape[:2]
617
-
618
  t_start = time.monotonic()
619
- boxes_orig, confs_orig = self._per_run_pass(image_bgr, PER_CONF_LOW)
620
- elapsed = time.monotonic() - t_start
621
 
622
- has_flip = False
623
- if elapsed < PER_RTF_BUDGET / 2:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
624
  flipped = cv2.flip(image_bgr, 1)
625
  boxes_flip, confs_flip = self._per_run_pass(flipped, PER_CONF_LOW)
626
- if len(boxes_flip):
627
- boxes_flip[:, 0], boxes_flip[:, 2] = ow - boxes_flip[:, 2], ow - boxes_flip[:, 0]
628
- has_flip = True
629
- else:
630
- logger.warning(f"[person TTA] skipping flip β€” pass1 took {elapsed*1000:.0f}ms")
631
-
632
- kept_b, kept_s = [], []
633
 
634
- if not has_flip or len(boxes_flip) == 0:
635
- # No flip available β€” keep all original boxes
636
- for i in range(len(boxes_orig)):
637
- kept_b.append(boxes_orig[i])
638
- kept_s.append(float(confs_orig[i]))
639
- else:
640
- # Consensus TTA merge
641
- used_flip = set()
642
-
643
- for i in range(len(boxes_orig)):
644
- # Compute IoU with all flip boxes
645
- xx1 = np.maximum(boxes_orig[i, 0], boxes_flip[:, 0])
646
- yy1 = np.maximum(boxes_orig[i, 1], boxes_flip[:, 1])
647
- xx2 = np.minimum(boxes_orig[i, 2], boxes_flip[:, 2])
648
- yy2 = np.minimum(boxes_orig[i, 3], boxes_flip[:, 3])
649
- inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
650
- a1 = (boxes_orig[i, 2] - boxes_orig[i, 0]) * (boxes_orig[i, 3] - boxes_orig[i, 1])
651
- a2 = (boxes_flip[:, 2] - boxes_flip[:, 0]) * (boxes_flip[:, 3] - boxes_flip[:, 1])
652
- iou = inter / (a1 + a2 - inter + 1e-9)
653
- best_j = int(np.argmax(iou))
654
- best_iou = float(iou[best_j])
655
-
656
- if confs_orig[i] >= PER_CONF_HIGH:
657
- # High-conf original: auto-accept, boost with flip match
658
- score = float(confs_orig[i])
659
- if best_iou >= PER_CONSENSUS_IOU:
660
- score = max(score, float(confs_flip[best_j]))
661
- used_flip.add(best_j)
662
- kept_b.append(boxes_orig[i])
663
- kept_s.append(score)
664
- else:
665
- # Low-conf original: need confirmation from flip view
666
- if best_iou >= PER_CONSENSUS_IOU:
667
- score = max(float(confs_orig[i]), float(confs_flip[best_j]))
668
- used_flip.add(best_j)
669
- kept_b.append(boxes_orig[i])
670
- kept_s.append(score)
671
- # else: unconfirmed low-conf β†’ drop
672
-
673
- # Add unmatched high-conf flip boxes
674
- for j in range(len(boxes_flip)):
675
- if j not in used_flip and confs_flip[j] >= PER_CONF_HIGH:
676
- kept_b.append(boxes_flip[j])
677
- kept_s.append(float(confs_flip[j]))
678
-
679
- if not kept_b:
680
  return []
681
 
682
- kept_b = np.array(kept_b)
683
- kept_s = np.array(kept_s)
 
 
684
 
685
- # Sanity filters (consensus merge handles dedup β€” no WBF needed)
 
 
 
686
  img_area = float(oh * ow)
687
- sane = []
688
- for i in range(len(kept_b)):
689
- bw = kept_b[i, 2] - kept_b[i, 0]
690
- bh = kept_b[i, 3] - kept_b[i, 1]
691
  if bw < PER_MIN_WH or bh < PER_MIN_WH:
692
  continue
693
  area = bw * bh
@@ -697,23 +778,14 @@ class Miner:
697
  continue
698
  if area / img_area > PER_MAX_AREA_RATIO:
699
  continue
700
- sane.append(i)
701
-
702
- if not sane:
703
- return []
704
- kept_b = kept_b[sane]
705
- kept_s = kept_s[sane]
706
-
707
- out = []
708
- for i in range(len(kept_b)):
709
- b = kept_b[i]
710
  out.append(BoundingBox(
711
  x1=max(0, min(ow, int(b[0]))),
712
  y1=max(0, min(oh, int(b[1]))),
713
  x2=max(0, min(ow, int(b[2]))),
714
  y2=max(0, min(oh, int(b[3]))),
715
  cls_id=0,
716
- conf=max(0.0, min(1.0, kept_s[i])),
717
  ))
718
  return out
719
 
 
1
  """
2
+ Score Vision SN44 — Unified miner v3.7 (2026-04-02). SAHI-style tiled person inference.
3
  Dual-model: vehicle (YOLO11m INT8 1280) + person (YOLO26s FP16 960 end2end).
4
  Vehicle weights loaded from secondary HF repo (meaculpitt/ScoreVision-Vehicle).
5
  Person weights loaded from primary HF repo (template downloads automatically).
 
11
 
12
  Person model (person_weights.onnx):
13
  YOLO26s FP16 960px end2end [1,300,6]. Single class: 0=person.
14
+ SAHI-style tiling: full + 2 adaptive tiles + flip TTA, max-conf NMS merge.
15
 
16
  Both models run on every image. All detections merged.
17
  Vehicle cls_id=4 (bus) filtered by validator (out of range for both elements).
 
188
  PER_MAX_ASPECT = 6.0
189
  PER_MAX_AREA_RATIO = 0.80
190
 
191
+ # ── Person tiling config (SAHI-inspired) ────────────────────────────────────
192
+ PER_TILE_OVERLAP = 0.20 # 20% overlap between tiles
193
+ PER_TILE_MIN_DIM_RATIO = 1.15 # tile when image dim > model_dim * this (~1104px for 960 model)
194
+ PER_TILE_CONF = 0.40 # lower threshold for tile passes (NMS handles FP)
195
+ PER_NMS_IOU = 0.50 # NMS IoU for merging across passes (max-conf wins)
196
+
197
  # ── Shared ──────────────────────────────────────────────────────────────────
198
  WBF_SKIP_THR = 0.0001
199
 
 
619
  raw = self.per_session.run(None, {self.per_input_name: inp})[0]
620
  return self._per_decode(raw, ratio, pl, pt, oh, ow, conf_thresh)
621
 
622
def _generate_tiles(self, h, w):
    """Build the crop regions used for SAHI-inspired tiled person inference.

    The full image is always the first region. Two extra overlapping tiles
    are added only when the longer image side exceeds the longer model
    input side (``self.per_h`` / ``self.per_w``) multiplied by
    ``PER_TILE_MIN_DIM_RATIO``:

    * landscape or square (``w >= h``): a left and a right tile, split at
      ``w // 2``;
    * portrait (``w < h``): a top and a bottom tile, split at 45% of the
      height instead of 50%, which biases the cut toward the upper part of
      the frame (intended to avoid slicing through people standing near the
      bottom edge).

    Each tile extends ``PER_TILE_OVERLAP`` of the split dimension past the
    split line, so the band shared by the two tiles is twice that fraction
    wide.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.

    Returns:
        List of ``(x1, y1, x2, y2)`` tuples in image pixel coordinates,
        with the full image first.
    """
    tiles = [(0, 0, w, h)]  # full image always first

    # Only tile if the image significantly exceeds the model input.
    if max(h, w) <= max(self.per_h, self.per_w) * PER_TILE_MIN_DIM_RATIO:
        return tiles

    if w >= h:
        # Landscape: left + right tiles.
        split = w // 2
        pad = int(w * PER_TILE_OVERLAP)
        # Clamp to the image: a negative start would be read by NumPy
        # slicing as "from the end" and silently select the wrong crop.
        tiles.append((0, 0, min(w, split + pad), h))
        tiles.append((max(0, split - pad), 0, w, h))
    else:
        # Portrait: top + bottom tiles, split above the middle.
        split = int(h * 0.45)
        pad = int(h * PER_TILE_OVERLAP)
        tiles.append((0, 0, w, min(h, split + pad)))
        tiles.append((0, max(0, split - pad), w, h))

    return tiles
652
+
653
def _per_run_tile(self, image_bgr, tile_region, conf_thresh):
    """Run the person model on one tile and map boxes to full-image coords.

    Args:
        image_bgr: Full image array, indexed as ``[row, col, ...]``.
        tile_region: ``(x1, y1, x2, y2)`` crop rectangle in image pixels.
        conf_thresh: Confidence threshold forwarded to ``_per_run_pass``.

    Returns:
        ``(boxes, confs)`` where ``boxes`` is an ``[N, 4]`` array of
        ``x1, y1, x2, y2`` in full-image coordinates and ``confs`` the
        matching scores. Both are empty when the tile is degenerate or the
        pass returns no detections.
    """
    x1t, y1t, x2t, y2t = tile_region
    # A negative origin would be interpreted by NumPy slicing as an offset
    # from the end, and the boxes would then be shifted by the wrong amount.
    x1t = max(0, x1t)
    y1t = max(0, y1t)

    crop = image_bgr[y1t:y2t, x1t:x2t]
    if crop.size == 0:
        # Degenerate tile: nothing to feed to the model.
        return np.empty((0, 4)), np.empty(0)

    boxes, confs = self._per_run_pass(crop, conf_thresh)
    if len(boxes) == 0:
        return np.empty((0, 4)), np.empty(0)

    # Shift tile-local coordinates back to the original image frame.
    boxes[:, [0, 2]] += x1t
    boxes[:, [1, 3]] += y1t
    return boxes, confs
666
+
667
@staticmethod
def _nms_max_conf(boxes, scores, iou_thr):
    """Greedy NMS that keeps the highest-confidence box of each cluster.

    Boxes are visited in descending score order; every kept box suppresses
    all lower-scored boxes whose IoU with it is at least ``iou_thr``. The
    kept box retains its own score unchanged — unlike WBF, scores are never
    averaged, so strong detections are not diluted.

    Args:
        boxes: ``[N, 4]`` array-like of ``x1, y1, x2, y2``.
        scores: ``[N]`` array-like of confidences.
        iou_thr: IoU at or above which a lower-scored box is suppressed.

    Returns:
        ``(kept_boxes, kept_scores)`` sorted by descending score; shapes
        ``(0, 4)`` and ``(0,)`` when the input is empty.
    """
    boxes = np.asarray(boxes)
    scores = np.asarray(scores)
    if len(boxes) == 0:
        return np.empty((0, 4)), np.empty(0)

    # Stable sort keeps the input order among equal scores (deterministic).
    order = np.argsort(-scores, kind="stable")
    boxes = boxes[order]
    scores = scores[order]

    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    alive = np.ones(len(boxes), dtype=bool)
    keep = []

    for i in range(len(boxes)):
        if not alive[i]:
            continue
        keep.append(i)

        # IoU of the kept box against every lower-scored box, in one shot.
        xx1 = np.maximum(boxes[i, 0], boxes[i + 1:, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[i + 1:, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[i + 1:, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[i + 1:, 3])
        inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
        iou = inter / (areas[i] + areas[i + 1:] - inter + 1e-9)

        # Re-testing already-suppressed boxes is harmless: they stay dead.
        alive[i + 1:] &= ~(iou >= iou_thr)

    return boxes[keep], scores[keep]
707
+
708
  def _infer_person(self, image_bgr):
709
+ """Person detection with SAHI-inspired tiled inference.
710
+
711
+ Pipeline:
712
+ 1. Full-image pass (catches large/medium people, low effective resolution)
713
+ 2. 2 tiled passes (higher effective resolution for small/distant people)
714
+ 3. Flip TTA pass if time budget allows
715
+ 4. Max-confidence NMS merge (preserves sharp scores for FP scoring)
716
+ 5. Sanity filters
717
+ """
718
  oh, ow = image_bgr.shape[:2]
 
719
  t_start = time.monotonic()
 
 
720
 
721
+ # Collect all boxes in original pixel coords
722
+ all_boxes = [] # list of [N, 4] arrays
723
+ all_confs = [] # list of [N] arrays
724
+
725
+ # Pass 1: full image
726
+ boxes_full, confs_full = self._per_run_pass(image_bgr, PER_CONF_LOW)
727
+ if len(boxes_full) > 0:
728
+ all_boxes.append(boxes_full)
729
+ all_confs.append(confs_full)
730
+
731
+ elapsed_pass1 = time.monotonic() - t_start
732
+
733
+ # Pass 2-3: tiled passes
734
+ tiles = self._generate_tiles(oh, ow)
735
+ if len(tiles) > 1 and elapsed_pass1 < PER_RTF_BUDGET / 4:
736
+ for tile_region in tiles[1:]:
737
+ if time.monotonic() - t_start > PER_RTF_BUDGET * 0.6:
738
+ break
739
+ boxes_t, confs_t = self._per_run_tile(
740
+ image_bgr, tile_region, PER_TILE_CONF)
741
+ if len(boxes_t) > 0:
742
+ all_boxes.append(boxes_t)
743
+ all_confs.append(confs_t)
744
+
745
+ # Pass 4: flip TTA if time allows
746
+ if time.monotonic() - t_start < PER_RTF_BUDGET / 4:
747
  flipped = cv2.flip(image_bgr, 1)
748
  boxes_flip, confs_flip = self._per_run_pass(flipped, PER_CONF_LOW)
749
+ if len(boxes_flip) > 0:
750
+ boxes_flip[:, 0], boxes_flip[:, 2] = (
751
+ ow - boxes_flip[:, 2], ow - boxes_flip[:, 0])
752
+ all_boxes.append(boxes_flip)
753
+ all_confs.append(confs_flip)
 
 
754
 
755
+ if not all_boxes:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756
  return []
757
 
758
+ # Merge all detections with max-confidence NMS
759
+ merged_b = np.concatenate(all_boxes)
760
+ merged_s = np.concatenate(all_confs)
761
+ merged_b, merged_s = self._nms_max_conf(merged_b, merged_s, PER_NMS_IOU)
762
 
763
+ if len(merged_b) == 0:
764
+ return []
765
+
766
+ # Sanity filters
767
  img_area = float(oh * ow)
768
+ out = []
769
+ for i in range(len(merged_b)):
770
+ bw = merged_b[i, 2] - merged_b[i, 0]
771
+ bh = merged_b[i, 3] - merged_b[i, 1]
772
  if bw < PER_MIN_WH or bh < PER_MIN_WH:
773
  continue
774
  area = bw * bh
 
778
  continue
779
  if area / img_area > PER_MAX_AREA_RATIO:
780
  continue
781
+ b = merged_b[i]
 
 
 
 
 
 
 
 
 
782
  out.append(BoundingBox(
783
  x1=max(0, min(ow, int(b[0]))),
784
  y1=max(0, min(oh, int(b[1]))),
785
  x2=max(0, min(ow, int(b[2]))),
786
  y2=max(0, min(oh, int(b[3]))),
787
  cls_id=0,
788
+ conf=max(0.0, min(1.0, float(merged_s[i]))),
789
  ))
790
  return out
791