meaculpitt committed on
Commit
e2fb1ca
·
verified ·
1 Parent(s): 122de2b

scorevision: push artifact

Browse files
Files changed (1) hide show
  1. miner.py +38 -17
miner.py CHANGED
@@ -161,32 +161,37 @@ class Miner:
161
  self.input_h = 960
162
  self.input_w = 960
163
 
164
- # Pre-NMS confidence threshold. Raised from 0.15 → 0.4 to align with
165
- # the SN44 beverage scoring pillar weights (map50 0.6 + false_positive
166
- # 0.4); the FP pillar penalises over-prediction so a higher conf
167
- # threshold trades a little recall for a meaningful precision gain.
168
- # Matches the conf_thres tuning of the current top miner (alfred8995).
169
- self.conf_threshold = 0.4
 
170
  # Gaussian Soft-NMS sigma. 0.5 is the textbook default — gentler
171
  # than numberplate's 0.3 because beverage scenes are less crowded.
172
  self.soft_nms_sigma = 0.5
173
  # Final score floor after Soft-NMS decay.
174
  self.score_threshold = 0.01
175
 
176
- # Sane-box geometry filters. Spurious detections are a major source of
177
- # false_positive pillar damage; these constraints reject obviously
178
- # non-beverage geometry (1×1 pixel "boxes", needle-thin slivers,
179
- # extreme aspect ratios). Values mirror the top miner's tuning.
180
  self.min_box_area = 100 # 10x10 px²
181
  self.min_side = 8 # min(w, h) in pixels
182
  self.max_aspect_ratio = 8.0 # max(w/h, h/w)
183
-
184
- # Horizontal-flip TTA. Runs inference twice (original + h-flipped),
185
- # un-flips the x-coords on the flipped view, merges via cluster_dedup
186
- # to suppress duplicates. Doubles inference cost but the validator
187
- # latency cap is 10s and our single-pass inference is ~10ms, so we
188
- # have ~1000× headroom.
189
- self.use_tta = True
 
 
 
 
 
190
 
191
  # GPU warmup — force ORT/CUDA/cuDNN kernel compilation before the
192
  # first real validator frame. Mirrors the numberplate miner pattern.
@@ -424,6 +429,11 @@ class Miner:
424
  # would otherwise survive Soft-NMS's gentle decay above the score floor.
425
  dets = self._cluster_dedup(dets, iou_thresh=0.5)
426
  dets = self._soft_nms(dets)
 
 
 
 
 
427
 
428
  out_boxes: list[BoundingBox] = []
429
  for x1, y1, x2, y2, conf, cls_id in dets:
@@ -510,6 +520,17 @@ class Miner:
510
  offset: int,
511
  n_keypoints: int,
512
  ) -> list[TVFrameResult]:
 
 
 
 
 
 
 
 
 
 
 
513
  results: list[TVFrameResult] = []
514
  infer = self._infer_with_tta if self.use_tta else self._infer_single
515
  for idx, image in enumerate(batch_images):
 
161
  self.input_h = 960
162
  self.input_w = 960
163
 
164
+ # Pre-NMS confidence threshold. v3 sets 0.55 to match the only
165
+ # working SN44-beverage miner (alfred8995). Their empirical tuning
166
+ # heavily prioritises the false_positive pillar (40% of composite);
167
+ # validator-side data shows their 0.55 threshold consistently
168
+ # delivers nonzero composites while looser thresholds (incl. our
169
+ # earlier 0.4 attempt) score 0.
170
+ self.conf_threshold = 0.55
171
  # Gaussian Soft-NMS sigma. 0.5 is the textbook default — gentler
172
  # than numberplate's 0.3 because beverage scenes are less crowded.
173
  self.soft_nms_sigma = 0.5
174
  # Final score floor after Soft-NMS decay.
175
  self.score_threshold = 0.01
176
 
177
+ # Sane-box geometry filters. v3 keeps the alfred-aligned values
178
+ # (100 area / 8 side / 8 AR). Loosening them was a hypothesis
179
+ # that contradicted alfred's working empirical tuning.
 
180
  self.min_box_area = 100 # 10x10 px²
181
  self.min_side = 8 # min(w, h) in pixels
182
  self.max_aspect_ratio = 8.0 # max(w/h, h/w)
183
+ # Per-image detection cap. Mirrors alfred's max_det=150 — caps
184
+ # over-prediction in dense or noisy scenes that would otherwise
185
+ # tank the false_positive pillar.
186
+ self.max_det = 150
187
+
188
+ # Horizontal-flip TTA. DISABLED in v3 because the actual latency cap
189
+ # is RTF≤1.0 with service_rate_fps=1, which means p95 ≤ 5000 ms per
190
+ # /predict call. Empirical batch tests showed our chute fails the
191
+ # gate at 20+ frames per call. Halving inference cost (TTA off) gives
192
+ # ~2× headroom. Scoring impact: small recall loss (TTA usually adds
193
+ # +0.5–2% mAP), worth it to clear the gate.
194
+ self.use_tta = False
195
 
196
  # GPU warmup — force ORT/CUDA/cuDNN kernel compilation before the
197
  # first real validator frame. Mirrors the numberplate miner pattern.
 
429
  # would otherwise survive Soft-NMS's gentle decay above the score floor.
430
  dets = self._cluster_dedup(dets, iou_thresh=0.5)
431
  dets = self._soft_nms(dets)
432
+ # Cap per-image detection count (mirrors alfred). Soft-NMS already
433
+ # returns dets sorted by descending decayed score, so [:max_det]
434
+ # keeps the top-confidence ones.
435
+ if len(dets) > self.max_det:
436
+ dets = dets[: self.max_det]
437
 
438
  out_boxes: list[BoundingBox] = []
439
  for x1, y1, x2, y2, conf, cls_id in dets:
 
520
  offset: int,
521
  n_keypoints: int,
522
  ) -> list[TVFrameResult]:
523
+ # v3 diagnostic: log batch_size to stderr so chute logs reveal what
524
+ # the validator actually sends per /predict call. Used to confirm
525
+ # whether the latency gate is the failure mode (large batches ⇒
526
+ # high p95). Cheap; one print per batch.
527
+ import sys as _sys
528
+ _sys.stderr.write(
529
+ f"[trace] predict_batch n={len(batch_images)} offset={offset} "
530
+ f"n_kp={n_keypoints} use_tta={self.use_tta}\n"
531
+ )
532
+ _sys.stderr.flush()
533
+
534
  results: list[TVFrameResult] = []
535
  infer = self._infer_with_tta if self.use_tta else self._infer_single
536
  for idx, image in enumerate(batch_images):