Spaces:
Running
Running
Implementing pagination
Browse files- src/core/config.py +3 -3
- src/services/clustering.py +12 -3
src/core/config.py
CHANGED
|
@@ -156,9 +156,9 @@ USE_ASYNC_UPLOADS = int(os.getenv("USE_ASYNC_UPLOADS", "1"))
|
|
| 156 |
USE_CLUSTER_AWARE_SEARCH = int(os.getenv("USE_CLUSTER_AWARE_SEARCH", "1"))
|
| 157 |
|
| 158 |
# HDBSCAN parameters — tuned for typical 1k–10k image libraries
|
| 159 |
-
CLUSTER_MIN_SAMPLES = int(os.getenv("CLUSTER_MIN_SAMPLES", "
|
| 160 |
-
CLUSTER_MIN_CLUSTER_SIZE = int(os.getenv("CLUSTER_MIN_CLUSTER_SIZE", "
|
| 161 |
-
CLUSTER_EPSILON = float(os.getenv("CLUSTER_EPSILON", "0.
|
| 162 |
|
| 163 |
# Auto re-cluster after every N new face uploads (0 = disabled, manual only)
|
| 164 |
CLUSTER_AUTO_TRIGGER_EVERY = int(os.getenv("CLUSTER_AUTO_TRIGGER_EVERY", "0"))
|
|
|
|
# Feature flag: route searches through the cluster index (1 = on, 0 = off).
USE_CLUSTER_AWARE_SEARCH = int(os.getenv("USE_CLUSTER_AWARE_SEARCH", "1"))

# HDBSCAN parameters — tuned for typical 1k–10k image libraries.
# Stricter core-point requirement (raised to 5) to reduce duplicate identities.
CLUSTER_MIN_SAMPLES = int(os.getenv("CLUSTER_MIN_SAMPLES", "5"))
# A cluster must contain at least 5 faces (raised from 2).
CLUSTER_MIN_CLUSTER_SIZE = int(os.getenv("CLUSTER_MIN_CLUSTER_SIZE", "5"))
# Tightened from 0.35 to 0.20 to reduce duplicate clusters.
# NOTE(review): presumably passed as HDBSCAN's cluster_selection_epsilon —
# confirm against the clustering service.
CLUSTER_EPSILON = float(os.getenv("CLUSTER_EPSILON", "0.20"))

# Auto re-cluster after every N new face uploads (0 = disabled, manual only)
CLUSTER_AUTO_TRIGGER_EVERY = int(os.getenv("CLUSTER_AUTO_TRIGGER_EVERY", "0"))
src/services/clustering.py
CHANGED
|
@@ -161,13 +161,22 @@ def _run_hdbscan(vectors: np.ndarray) -> np.ndarray:
|
|
| 161 |
|
| 162 |
def _pick_representative(cluster_vecs: np.ndarray, cluster_meta: list[dict]) -> dict:
|
| 163 |
"""
|
| 164 |
-
Picks the face closest to the cluster centroid as the representative.
|
| 165 |
-
Returns the metadata dict for that face.
|
| 166 |
"""
|
| 167 |
centroid = cluster_vecs.mean(axis=0)
|
| 168 |
centroid /= np.linalg.norm(centroid) + 1e-8
|
| 169 |
sims = cluster_vecs @ centroid
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
return cluster_meta[best_idx]
|
| 172 |
|
| 173 |
|
|
|
|
def _pick_representative(cluster_vecs: np.ndarray, cluster_meta: list[dict]) -> dict:
    """
    Pick the representative face for a cluster.

    Candidates are ranked by cosine similarity to the normalized cluster
    centroid; the first candidate that is sharp enough
    (``blur_score >= CLUSTERING_BLUR_THRESHOLD``) wins.  Sharpness is only a
    filter here — it does not reorder candidates.  If *every* face in the
    cluster is blurry, fall back to the single sharpest face regardless of
    centroid similarity.

    Args:
        cluster_vecs: (n, d) array of face embeddings, one row per face.
            Assumed non-empty; rows are presumably L2-normalized so the dot
            product is a cosine similarity — TODO confirm at the call site.
        cluster_meta: metadata dicts parallel to ``cluster_vecs``; each may
            carry a ``blur_score`` float (higher = sharper).

    Returns:
        The metadata dict of the chosen representative face.
    """
    centroid = cluster_vecs.mean(axis=0)
    centroid /= np.linalg.norm(centroid) + 1e-8  # epsilon guards a zero-norm centroid
    sims = cluster_vecs @ centroid

    # Walk candidates from most to least centroid-similar; take the first one
    # that clears the sharpness threshold.
    # NOTE(review): CLUSTERING_BLUR_THRESHOLD must be defined elsewhere in
    # this module — it is not among the config constants changed here; verify
    # it is in scope.  A missing blur_score defaults to 100.0 (treated as
    # sharp), which assumes the threshold is <= 100 — confirm.
    for idx in np.argsort(sims)[::-1]:
        candidate = cluster_meta[int(idx)]
        if candidate.get("blur_score", 100.0) >= CLUSTERING_BLUR_THRESHOLD:
            return candidate

    # Fallback: every face is blurry — pick the sharpest one.
    # NOTE(review): the default here is 0 (faces without a blur_score rank
    # last), unlike the 100.0 default above; kept as-is to preserve behavior,
    # but the asymmetry looks unintentional — confirm.
    best_idx = max(range(len(cluster_meta)), key=lambda i: cluster_meta[i].get("blur_score", 0))
    return cluster_meta[best_idx]