AdarshDRC committed on
Commit
dfc44c0
·
1 Parent(s): 725ae84

Implementing pagination

Browse files
Files changed (2) hide show
  1. src/core/config.py +3 -3
  2. src/services/clustering.py +12 -3
src/core/config.py CHANGED
@@ -156,9 +156,9 @@ USE_ASYNC_UPLOADS = int(os.getenv("USE_ASYNC_UPLOADS", "1"))
156
  USE_CLUSTER_AWARE_SEARCH = int(os.getenv("USE_CLUSTER_AWARE_SEARCH", "1"))
157
 
158
  # HDBSCAN parameters — tuned for typical 1k–10k image libraries
159
- CLUSTER_MIN_SAMPLES = int(os.getenv("CLUSTER_MIN_SAMPLES", "2")) # Lowered from 3 to include pairs
160
- CLUSTER_MIN_CLUSTER_SIZE = int(os.getenv("CLUSTER_MIN_CLUSTER_SIZE", "2")) # Lowered from 3 to 2
161
- CLUSTER_EPSILON = float(os.getenv("CLUSTER_EPSILON", "0.35"))
162
 
163
  # Auto re-cluster after every N new face uploads (0 = disabled, manual only)
164
  CLUSTER_AUTO_TRIGGER_EVERY = int(os.getenv("CLUSTER_AUTO_TRIGGER_EVERY", "0"))
 
USE_CLUSTER_AWARE_SEARCH = int(os.environ.get("USE_CLUSTER_AWARE_SEARCH", "1"))

# HDBSCAN parameters — tuned for typical 1k–10k image libraries.
# Stricter settings (min 5 faces per cluster, tighter epsilon) trade recall
# for fewer duplicate/fragmented identity clusters.
CLUSTER_MIN_SAMPLES = int(os.environ.get("CLUSTER_MIN_SAMPLES", "5"))  # stricter core-point requirement
CLUSTER_MIN_CLUSTER_SIZE = int(os.environ.get("CLUSTER_MIN_CLUSTER_SIZE", "5"))  # a cluster needs 5+ faces
CLUSTER_EPSILON = float(os.environ.get("CLUSTER_EPSILON", "0.20"))  # tightened from 0.35 to cut duplicate clusters

# Auto re-cluster after every N new face uploads (0 = disabled, manual only)
CLUSTER_AUTO_TRIGGER_EVERY = int(os.environ.get("CLUSTER_AUTO_TRIGGER_EVERY", "0"))
src/services/clustering.py CHANGED
@@ -161,13 +161,22 @@ def _run_hdbscan(vectors: np.ndarray) -> np.ndarray:
161
 
162
  def _pick_representative(cluster_vecs: np.ndarray, cluster_meta: list[dict]) -> dict:
163
  """
164
- Picks the face closest to the cluster centroid as the representative.
165
- Returns the metadata dict for that face.
166
  """
167
  centroid = cluster_vecs.mean(axis=0)
168
  centroid /= np.linalg.norm(centroid) + 1e-8
169
  sims = cluster_vecs @ centroid
170
- best_idx = int(np.argmax(sims))
 
 
 
 
 
 
 
 
 
171
  return cluster_meta[best_idx]
172
 
173
 
 
161
 
162
def _pick_representative(cluster_vecs: np.ndarray, cluster_meta: list[dict]) -> dict:
    """
    Picks the non-blurry face closest to the cluster centroid as the representative.

    Faces are considered in descending order of cosine similarity to the
    normalized cluster centroid; the first face whose blur_score meets
    CLUSTERING_BLUR_THRESHOLD is chosen. If every face is below the
    threshold, the sharpest face (highest blur_score) is returned instead.

    Args:
        cluster_vecs: (n, d) array of face embeddings (rows assumed
            L2-normalized — TODO confirm against caller).
        cluster_meta: metadata dicts, one per row of cluster_vecs; may
            carry a "blur_score" key (higher = sharper).

    Returns:
        The metadata dict of the chosen representative face.
    """
    # Default when a face carries no blur_score. The previous version used
    # 100.0 (optimistically sharp) in the threshold pass but 0 (blurriest)
    # in the fallback, so an unscored face flipped meaning between branches;
    # use one consistent optimistic default everywhere.
    default_blur = 100.0

    centroid = cluster_vecs.mean(axis=0)
    centroid /= np.linalg.norm(centroid) + 1e-8  # epsilon guards a zero-norm centroid
    sims = cluster_vecs @ centroid

    # Highest similarity first; prefer the first sufficiently sharp face.
    for idx in np.argsort(sims)[::-1]:
        if cluster_meta[int(idx)].get("blur_score", default_blur) >= CLUSTERING_BLUR_THRESHOLD:
            return cluster_meta[int(idx)]

    # Fallback: every face is blurry — pick the sharpest one regardless.
    best_idx = max(
        range(len(cluster_meta)),
        key=lambda i: cluster_meta[i].get("blur_score", default_blur),
    )
    return cluster_meta[best_idx]
181
 
182