AdarshDRC committed on
Commit
dfc44c0
·
1 Parent(s): 725ae84

Implementing pagination

Browse files
Files changed (2) hide show
  1. src/core/config.py +3 -3
  2. src/services/clustering.py +12 -3
src/core/config.py CHANGED
@@ -156,9 +156,9 @@ USE_ASYNC_UPLOADS = int(os.getenv("USE_ASYNC_UPLOADS", "1"))
156
  USE_CLUSTER_AWARE_SEARCH = int(os.getenv("USE_CLUSTER_AWARE_SEARCH", "1"))
157
 
158
  # HDBSCAN parameters — tuned for typical 1k–10k image libraries
159
- CLUSTER_MIN_SAMPLES = int(os.getenv("CLUSTER_MIN_SAMPLES", "2")) # Lowered from 3 to include pairs
160
- CLUSTER_MIN_CLUSTER_SIZE = int(os.getenv("CLUSTER_MIN_CLUSTER_SIZE", "2")) # Lowered from 3 to 2
161
- CLUSTER_EPSILON = float(os.getenv("CLUSTER_EPSILON", "0.35"))
162
 
163
  # Auto re-cluster after every N new face uploads (0 = disabled, manual only)
164
  CLUSTER_AUTO_TRIGGER_EVERY = int(os.getenv("CLUSTER_AUTO_TRIGGER_EVERY", "0"))
 
USE_CLUSTER_AWARE_SEARCH = int(os.environ.get("USE_CLUSTER_AWARE_SEARCH", "1"))

# HDBSCAN parameters — tuned for typical 1k–10k image libraries.
# Stricter settings (min 5 faces per cluster, tighter epsilon) trade recall
# for fewer duplicate/fragmented identity clusters.
CLUSTER_MIN_SAMPLES = int(os.environ.get("CLUSTER_MIN_SAMPLES", "5"))  # stricter core-point requirement
CLUSTER_MIN_CLUSTER_SIZE = int(os.environ.get("CLUSTER_MIN_CLUSTER_SIZE", "5"))  # a cluster needs 5+ faces
CLUSTER_EPSILON = float(os.environ.get("CLUSTER_EPSILON", "0.20"))  # tightened from 0.35 to cut duplicate clusters

# Auto re-cluster after every N new face uploads (0 = disabled, manual only)
CLUSTER_AUTO_TRIGGER_EVERY = int(os.environ.get("CLUSTER_AUTO_TRIGGER_EVERY", "0"))
src/services/clustering.py CHANGED
@@ -161,13 +161,22 @@ def _run_hdbscan(vectors: np.ndarray) -> np.ndarray:
161
 
162
  def _pick_representative(cluster_vecs: np.ndarray, cluster_meta: list[dict]) -> dict:
163
  """
164
- Picks the face closest to the cluster centroid as the representative.
165
- Returns the metadata dict for that face.
166
  """
167
  centroid = cluster_vecs.mean(axis=0)
168
  centroid /= np.linalg.norm(centroid) + 1e-8
169
  sims = cluster_vecs @ centroid
170
- best_idx = int(np.argmax(sims))
 
 
 
 
 
 
 
 
 
171
  return cluster_meta[best_idx]
172
 
173
 
 
161
 
162
def _pick_representative(cluster_vecs: np.ndarray, cluster_meta: list[dict]) -> dict:
    """
    Picks the non-blurry face closest to the cluster centroid as the representative.

    Faces are considered in descending order of cosine similarity to the
    normalized cluster centroid; the first face whose blur_score meets
    CLUSTERING_BLUR_THRESHOLD is chosen. If every face is below the
    threshold, the sharpest face (highest blur_score) is returned instead.

    Args:
        cluster_vecs: (n, d) array of face embeddings (rows assumed
            L2-normalized — TODO confirm against caller).
        cluster_meta: metadata dicts, one per row of cluster_vecs; may
            carry a "blur_score" key (higher = sharper).

    Returns:
        The metadata dict of the chosen representative face.
    """
    # Default when a face carries no blur_score. The previous version used
    # 100.0 (optimistically sharp) in the threshold pass but 0 (blurriest)
    # in the fallback, so an unscored face flipped meaning between branches;
    # use one consistent optimistic default everywhere.
    default_blur = 100.0

    centroid = cluster_vecs.mean(axis=0)
    centroid /= np.linalg.norm(centroid) + 1e-8  # epsilon guards a zero-norm centroid
    sims = cluster_vecs @ centroid

    # Highest similarity first; prefer the first sufficiently sharp face.
    for idx in np.argsort(sims)[::-1]:
        if cluster_meta[int(idx)].get("blur_score", default_blur) >= CLUSTERING_BLUR_THRESHOLD:
            return cluster_meta[int(idx)]

    # Fallback: every face is blurry — pick the sharpest one regardless.
    best_idx = max(
        range(len(cluster_meta)),
        key=lambda i: cluster_meta[i].get("blur_score", default_blur),
    )
    return cluster_meta[best_idx]
181
 
182