scorevision: push artifact
Browse files
miner.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
Score Vision SN44 — Unified miner v3.
|
| 3 |
Dual-model: vehicle (YOLO11m INT8 1280) + person (YOLO26s FP16 960 end2end).
|
| 4 |
Vehicle weights loaded from secondary HF repo (meaculpitt/ScoreVision-Vehicle).
|
| 5 |
Person weights loaded from primary HF repo (template downloads automatically).
|
|
@@ -11,6 +11,7 @@ Vehicle model (vehicle_weights.onnx):
|
|
| 11 |
|
| 12 |
Person model (person_weights.onnx):
|
| 13 |
YOLO26s FP16 960px end2end [1,300,6]. Single class: 0=person.
|
|
|
|
| 14 |
|
| 15 |
Both models run on every image. All detections merged.
|
| 16 |
Vehicle cls_id=4 (bus) filtered by validator (out of range for both elements).
|
|
@@ -187,6 +188,12 @@ PER_MIN_AREA = 14 * 14
|
|
| 187 |
PER_MAX_ASPECT = 6.0
|
| 188 |
PER_MAX_AREA_RATIO = 0.80
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
# ── Shared ──────────────────────────────────────────────────────────────────
|
| 191 |
WBF_SKIP_THR = 0.0001
|
| 192 |
|
|
@@ -612,82 +619,156 @@ class Miner:
|
|
| 612 |
raw = self.per_session.run(None, {self.per_input_name: inp})[0]
|
| 613 |
return self._per_decode(raw, ratio, pl, pt, oh, ow, conf_thresh)
|
| 614 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
def _infer_person(self, image_bgr):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
oh, ow = image_bgr.shape[:2]
|
| 617 |
-
|
| 618 |
t_start = time.monotonic()
|
| 619 |
-
boxes_orig, confs_orig = self._per_run_pass(image_bgr, PER_CONF_LOW)
|
| 620 |
-
elapsed = time.monotonic() - t_start
|
| 621 |
|
| 622 |
-
|
| 623 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
flipped = cv2.flip(image_bgr, 1)
|
| 625 |
boxes_flip, confs_flip = self._per_run_pass(flipped, PER_CONF_LOW)
|
| 626 |
-
if len(boxes_flip):
|
| 627 |
-
boxes_flip[:, 0], boxes_flip[:, 2] =
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
kept_b, kept_s = [], []
|
| 633 |
|
| 634 |
-
if not
|
| 635 |
-
# No flip available β keep all original boxes
|
| 636 |
-
for i in range(len(boxes_orig)):
|
| 637 |
-
kept_b.append(boxes_orig[i])
|
| 638 |
-
kept_s.append(float(confs_orig[i]))
|
| 639 |
-
else:
|
| 640 |
-
# Consensus TTA merge
|
| 641 |
-
used_flip = set()
|
| 642 |
-
|
| 643 |
-
for i in range(len(boxes_orig)):
|
| 644 |
-
# Compute IoU with all flip boxes
|
| 645 |
-
xx1 = np.maximum(boxes_orig[i, 0], boxes_flip[:, 0])
|
| 646 |
-
yy1 = np.maximum(boxes_orig[i, 1], boxes_flip[:, 1])
|
| 647 |
-
xx2 = np.minimum(boxes_orig[i, 2], boxes_flip[:, 2])
|
| 648 |
-
yy2 = np.minimum(boxes_orig[i, 3], boxes_flip[:, 3])
|
| 649 |
-
inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
|
| 650 |
-
a1 = (boxes_orig[i, 2] - boxes_orig[i, 0]) * (boxes_orig[i, 3] - boxes_orig[i, 1])
|
| 651 |
-
a2 = (boxes_flip[:, 2] - boxes_flip[:, 0]) * (boxes_flip[:, 3] - boxes_flip[:, 1])
|
| 652 |
-
iou = inter / (a1 + a2 - inter + 1e-9)
|
| 653 |
-
best_j = int(np.argmax(iou))
|
| 654 |
-
best_iou = float(iou[best_j])
|
| 655 |
-
|
| 656 |
-
if confs_orig[i] >= PER_CONF_HIGH:
|
| 657 |
-
# High-conf original: auto-accept, boost with flip match
|
| 658 |
-
score = float(confs_orig[i])
|
| 659 |
-
if best_iou >= PER_CONSENSUS_IOU:
|
| 660 |
-
score = max(score, float(confs_flip[best_j]))
|
| 661 |
-
used_flip.add(best_j)
|
| 662 |
-
kept_b.append(boxes_orig[i])
|
| 663 |
-
kept_s.append(score)
|
| 664 |
-
else:
|
| 665 |
-
# Low-conf original: need confirmation from flip view
|
| 666 |
-
if best_iou >= PER_CONSENSUS_IOU:
|
| 667 |
-
score = max(float(confs_orig[i]), float(confs_flip[best_j]))
|
| 668 |
-
used_flip.add(best_j)
|
| 669 |
-
kept_b.append(boxes_orig[i])
|
| 670 |
-
kept_s.append(score)
|
| 671 |
-
# else: unconfirmed low-conf β drop
|
| 672 |
-
|
| 673 |
-
# Add unmatched high-conf flip boxes
|
| 674 |
-
for j in range(len(boxes_flip)):
|
| 675 |
-
if j not in used_flip and confs_flip[j] >= PER_CONF_HIGH:
|
| 676 |
-
kept_b.append(boxes_flip[j])
|
| 677 |
-
kept_s.append(float(confs_flip[j]))
|
| 678 |
-
|
| 679 |
-
if not kept_b:
|
| 680 |
return []
|
| 681 |
|
| 682 |
-
|
| 683 |
-
|
|
|
|
|
|
|
| 684 |
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
| 686 |
img_area = float(oh * ow)
|
| 687 |
-
|
| 688 |
-
for i in range(len(
|
| 689 |
-
bw =
|
| 690 |
-
bh =
|
| 691 |
if bw < PER_MIN_WH or bh < PER_MIN_WH:
|
| 692 |
continue
|
| 693 |
area = bw * bh
|
|
@@ -697,23 +778,14 @@ class Miner:
|
|
| 697 |
continue
|
| 698 |
if area / img_area > PER_MAX_AREA_RATIO:
|
| 699 |
continue
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
if not sane:
|
| 703 |
-
return []
|
| 704 |
-
kept_b = kept_b[sane]
|
| 705 |
-
kept_s = kept_s[sane]
|
| 706 |
-
|
| 707 |
-
out = []
|
| 708 |
-
for i in range(len(kept_b)):
|
| 709 |
-
b = kept_b[i]
|
| 710 |
out.append(BoundingBox(
|
| 711 |
x1=max(0, min(ow, int(b[0]))),
|
| 712 |
y1=max(0, min(oh, int(b[1]))),
|
| 713 |
x2=max(0, min(ow, int(b[2]))),
|
| 714 |
y2=max(0, min(oh, int(b[3]))),
|
| 715 |
cls_id=0,
|
| 716 |
-
conf=max(0.0, min(1.0,
|
| 717 |
))
|
| 718 |
return out
|
| 719 |
|
|
|
|
| 1 |
"""
|
| 2 |
+
Score Vision SN44 — Unified miner v3.7 (2026-04-02). SAHI-style tiled person inference.
|
| 3 |
Dual-model: vehicle (YOLO11m INT8 1280) + person (YOLO26s FP16 960 end2end).
|
| 4 |
Vehicle weights loaded from secondary HF repo (meaculpitt/ScoreVision-Vehicle).
|
| 5 |
Person weights loaded from primary HF repo (template downloads automatically).
|
|
|
|
| 11 |
|
| 12 |
Person model (person_weights.onnx):
|
| 13 |
YOLO26s FP16 960px end2end [1,300,6]. Single class: 0=person.
|
| 14 |
+
SAHI-style tiling: full + 2 adaptive tiles + flip TTA, max-conf NMS merge.
|
| 15 |
|
| 16 |
Both models run on every image. All detections merged.
|
| 17 |
Vehicle cls_id=4 (bus) filtered by validator (out of range for both elements).
|
|
|
|
| 188 |
PER_MAX_ASPECT = 6.0
|
| 189 |
PER_MAX_AREA_RATIO = 0.80
|
| 190 |
|
| 191 |
+
# ── Person tiling config (SAHI-inspired) ────────────────────────────────────
|
| 192 |
+
PER_TILE_OVERLAP = 0.20 # 20% overlap between tiles
|
| 193 |
+
PER_TILE_MIN_DIM_RATIO = 1.15 # tile when image dim > model_dim * this (~1104px for 960 model)
|
| 194 |
+
PER_TILE_CONF = 0.40 # lower threshold for tile passes (NMS handles FP)
|
| 195 |
+
PER_NMS_IOU = 0.50 # NMS IoU for merging across passes (max-conf wins)
|
| 196 |
+
|
| 197 |
# ── Shared ──────────────────────────────────────────────────────────────────
|
| 198 |
WBF_SKIP_THR = 0.0001
|
| 199 |
|
|
|
|
| 619 |
raw = self.per_session.run(None, {self.per_input_name: inp})[0]
|
| 620 |
return self._per_decode(raw, ratio, pl, pt, oh, ow, conf_thresh)
|
| 621 |
|
| 622 |
+
def _generate_tiles(self, h, w):
    """Build the list of crop rectangles for tiled person inference.

    The full frame is always the first entry. Two extra tiles are added
    only when the longer image side exceeds the longer model input side
    (``self.per_h`` / ``self.per_w``) scaled by ``PER_TILE_MIN_DIM_RATIO``.

    * Landscape or square (``w >= h``): left and right tiles, split at
      the horizontal midpoint.
    * Portrait (``w < h``): top and bottom tiles, split at 45% of the
      height rather than 50% — the original author's rationale is that
      people tend to stand in the lower part of the frame.

    Each tile extends ``PER_TILE_OVERLAP`` of the full dimension past the
    split line, so the band shared by both tiles is twice that fraction.

    Args:
        h: Image height in pixels.
        w: Image width in pixels.

    Returns:
        List of ``(x1, y1, x2, y2)`` tuples, full image first.
    """
    full_frame = (0, 0, w, h)

    # Small images gain nothing from tiling: the full pass already sees
    # them at (close to) native resolution.
    model_side = max(self.per_h, self.per_w)
    if max(h, w) <= model_side * PER_TILE_MIN_DIM_RATIO:
        return [full_frame]

    if w < h:
        # Portrait: top + bottom tiles, cut biased toward the top.
        cut = int(h * 0.45)
        pad = int(h * PER_TILE_OVERLAP)
        first = (0, 0, w, cut + pad)
        second = (0, cut - pad, w, h)
    else:
        # Landscape / square: left + right tiles around the midpoint.
        cut = w // 2
        pad = int(w * PER_TILE_OVERLAP)
        first = (0, 0, cut + pad, h)
        second = (cut - pad, 0, w, h)

    return [full_frame, first, second]
|
| 652 |
+
|
| 653 |
+
def _per_run_tile(self, image_bgr, tile_region, conf_thresh):
    """Run the person model on one tile and return boxes in full-image coords.

    Args:
        image_bgr: Full image array; it is sliced as ``[y1:y2, x1:x2]``.
        tile_region: ``(x1, y1, x2, y2)`` crop rectangle in full-image
            pixel coordinates.
        conf_thresh: Confidence threshold forwarded to ``_per_run_pass``.

    Returns:
        ``(boxes, confs)``. ``boxes`` has one row per detection with
        columns ``x1, y1, x2, y2`` shifted by the tile origin; ``confs``
        is the matching score array. When the crop is empty or the model
        returns nothing, an empty ``(0, 4)`` / ``(0,)`` pair is returned.
    """
    x1t, y1t, x2t, y2t = tile_region
    crop = image_bgr[y1t:y2t, x1t:x2t]

    # An out-of-bounds or degenerate region slices to an empty array;
    # treat it as "no detections" instead of feeding it to the model.
    if crop.size == 0:
        return np.empty((0, 4)), np.empty(0)

    boxes, confs = self._per_run_pass(crop, conf_thresh)
    if len(boxes) == 0:
        return np.empty((0, 4)), np.empty(0)

    # Shift tile-local coordinates back to the original image frame
    # (in place, on the array returned by the pass).
    boxes[:, [0, 2]] += x1t
    boxes[:, [1, 3]] += y1t
    return boxes, confs
|
| 666 |
+
|
| 667 |
+
@staticmethod
def _nms_max_conf(boxes, scores, iou_thr):
    """Greedy NMS that keeps the highest-confidence box of each overlap group.

    Unlike WBF, which averages scores (diluting strong detections), the
    surviving box keeps its own coordinates and score unchanged — the
    original author notes this matters for false-positive scoring.

    Args:
        boxes: ``(N, 4)`` array-like of ``x1, y1, x2, y2`` boxes.
        scores: ``(N,)`` array-like of confidences.
        iou_thr: A box is suppressed when its IoU with an already-kept,
            higher-scoring box is ``>= iou_thr``.

    Returns:
        ``(kept_boxes, kept_scores)`` sorted by descending score. Empty
        input yields an empty ``(0, 4)`` / ``(0,)`` pair.
    """
    boxes = np.asarray(boxes)
    scores = np.asarray(scores)
    if len(boxes) == 0:
        return np.empty((0, 4)), np.empty(0)

    # Process boxes from most to least confident.
    order = np.argsort(-scores)
    boxes = boxes[order]
    scores = scores[order]

    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    alive = np.ones(len(boxes), dtype=bool)
    keep = []

    for i in range(len(boxes)):
        if not alive[i]:
            continue
        keep.append(i)

        # IoU of box i against every lower-ranked box in one shot.
        rest = slice(i + 1, None)
        xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
        inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
        iou = inter / (areas[i] + areas[rest] - inter + 1e-9)

        # Written as ~(>=) rather than (<) so a NaN IoU never suppresses.
        alive[rest] &= ~(iou >= iou_thr)

    return boxes[keep], scores[keep]
|
| 707 |
+
|
| 708 |
def _infer_person(self, image_bgr):
|
| 709 |
+
"""Person detection with SAHI-inspired tiled inference.
|
| 710 |
+
|
| 711 |
+
Pipeline:
|
| 712 |
+
1. Full-image pass (catches large/medium people, low effective resolution)
|
| 713 |
+
2. 2 tiled passes (higher effective resolution for small/distant people)
|
| 714 |
+
3. Flip TTA pass if time budget allows
|
| 715 |
+
4. Max-confidence NMS merge (preserves sharp scores for FP scoring)
|
| 716 |
+
5. Sanity filters
|
| 717 |
+
"""
|
| 718 |
oh, ow = image_bgr.shape[:2]
|
|
|
|
| 719 |
t_start = time.monotonic()
|
|
|
|
|
|
|
| 720 |
|
| 721 |
+
# Collect all boxes in original pixel coords
|
| 722 |
+
all_boxes = [] # list of [N, 4] arrays
|
| 723 |
+
all_confs = [] # list of [N] arrays
|
| 724 |
+
|
| 725 |
+
# Pass 1: full image
|
| 726 |
+
boxes_full, confs_full = self._per_run_pass(image_bgr, PER_CONF_LOW)
|
| 727 |
+
if len(boxes_full) > 0:
|
| 728 |
+
all_boxes.append(boxes_full)
|
| 729 |
+
all_confs.append(confs_full)
|
| 730 |
+
|
| 731 |
+
elapsed_pass1 = time.monotonic() - t_start
|
| 732 |
+
|
| 733 |
+
# Pass 2-3: tiled passes
|
| 734 |
+
tiles = self._generate_tiles(oh, ow)
|
| 735 |
+
if len(tiles) > 1 and elapsed_pass1 < PER_RTF_BUDGET / 4:
|
| 736 |
+
for tile_region in tiles[1:]:
|
| 737 |
+
if time.monotonic() - t_start > PER_RTF_BUDGET * 0.6:
|
| 738 |
+
break
|
| 739 |
+
boxes_t, confs_t = self._per_run_tile(
|
| 740 |
+
image_bgr, tile_region, PER_TILE_CONF)
|
| 741 |
+
if len(boxes_t) > 0:
|
| 742 |
+
all_boxes.append(boxes_t)
|
| 743 |
+
all_confs.append(confs_t)
|
| 744 |
+
|
| 745 |
+
# Pass 4: flip TTA if time allows
|
| 746 |
+
if time.monotonic() - t_start < PER_RTF_BUDGET / 4:
|
| 747 |
flipped = cv2.flip(image_bgr, 1)
|
| 748 |
boxes_flip, confs_flip = self._per_run_pass(flipped, PER_CONF_LOW)
|
| 749 |
+
if len(boxes_flip) > 0:
|
| 750 |
+
boxes_flip[:, 0], boxes_flip[:, 2] = (
|
| 751 |
+
ow - boxes_flip[:, 2], ow - boxes_flip[:, 0])
|
| 752 |
+
all_boxes.append(boxes_flip)
|
| 753 |
+
all_confs.append(confs_flip)
|
|
|
|
|
|
|
| 754 |
|
| 755 |
+
if not all_boxes:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
return []
|
| 757 |
|
| 758 |
+
# Merge all detections with max-confidence NMS
|
| 759 |
+
merged_b = np.concatenate(all_boxes)
|
| 760 |
+
merged_s = np.concatenate(all_confs)
|
| 761 |
+
merged_b, merged_s = self._nms_max_conf(merged_b, merged_s, PER_NMS_IOU)
|
| 762 |
|
| 763 |
+
if len(merged_b) == 0:
|
| 764 |
+
return []
|
| 765 |
+
|
| 766 |
+
# Sanity filters
|
| 767 |
img_area = float(oh * ow)
|
| 768 |
+
out = []
|
| 769 |
+
for i in range(len(merged_b)):
|
| 770 |
+
bw = merged_b[i, 2] - merged_b[i, 0]
|
| 771 |
+
bh = merged_b[i, 3] - merged_b[i, 1]
|
| 772 |
if bw < PER_MIN_WH or bh < PER_MIN_WH:
|
| 773 |
continue
|
| 774 |
area = bw * bh
|
|
|
|
| 778 |
continue
|
| 779 |
if area / img_area > PER_MAX_AREA_RATIO:
|
| 780 |
continue
|
| 781 |
+
b = merged_b[i]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
out.append(BoundingBox(
|
| 783 |
x1=max(0, min(ow, int(b[0]))),
|
| 784 |
y1=max(0, min(oh, int(b[1]))),
|
| 785 |
x2=max(0, min(ow, int(b[2]))),
|
| 786 |
y2=max(0, min(oh, int(b[3]))),
|
| 787 |
cls_id=0,
|
| 788 |
+
conf=max(0.0, min(1.0, float(merged_s[i]))),
|
| 789 |
))
|
| 790 |
return out
|
| 791 |
|