matinsn2000 committed on
Commit
c2cd7f1
·
1 Parent(s): cbab173

Fixed the problem with k-means clustering

Browse files
Files changed (2) hide show
  1. cloudzy/routes/photo.py +10 -2
  2. cloudzy/search_engine.py +16 -8
cloudzy/routes/photo.py CHANGED
@@ -81,15 +81,23 @@ async def list_photos(
81
 
82
  @router.get("/albums", response_model=AlbumsResponse)
83
  async def get_albums(
84
- top_k: int = Query(5, ge=1, le=5),
85
  session: Session = Depends(get_session),
86
  ):
87
  """
88
  Create albums of semantically similar photos.
 
 
 
89
  """
90
 
91
  search_engine = SearchEngine()
92
- albums_ids = search_engine.create_albums(top_k=top_k)
 
 
 
 
 
93
  APP_DOMAIN = os.getenv("APP_DOMAIN") or "http://127.0.0.1:8000/"
94
  summarizer = TextSummarizer()
95
 
 
81
 
82
  @router.get("/albums", response_model=AlbumsResponse)
83
  async def get_albums(
84
+ top_k: int = Query(2, ge=1, le=5),
85
  session: Session = Depends(get_session),
86
  ):
87
  """
88
  Create albums of semantically similar photos.
89
+
90
+ Returns albums of grouped photos. If fewer images exist than requested albums,
91
+ returns what's available instead of an empty list.
92
  """
93
 
94
  search_engine = SearchEngine()
95
+ albums_ids = search_engine.create_albums_kmeans(top_k=top_k)
96
+
97
+ # Handle case where no albums were created (no images in database)
98
+ if not albums_ids:
99
+ return []
100
+
101
  APP_DOMAIN = os.getenv("APP_DOMAIN") or "http://127.0.0.1:8000/"
102
  summarizer = TextSummarizer()
103
 
cloudzy/search_engine.py CHANGED
@@ -24,9 +24,10 @@ class SearchEngine:
24
  """
25
  Group similar images into albums (clusters).
26
 
27
- Returns exactly top_k albums, each containing up to album_size similar photos.
28
  Photos are marked as visited to avoid duplicate albums.
29
  Only includes photos within the distance threshold.
 
30
 
31
  OPTIMIZATIONS:
32
  - Batch retrieves all photos in ONE database query (not per-photo)
@@ -34,12 +35,13 @@ class SearchEngine:
34
  - Single session for all DB operations
35
 
36
  Args:
37
- top_k: Number of albums to return
38
  distance_threshold: Maximum distance to consider photos as similar (default 1.0 for normalized embeddings)
39
  album_size: How many similar photos to search for per album (default 5)
40
 
41
  Returns:
42
- List of top_k albums, each album is a list of photo_ids (randomized order each call)
 
43
  """
44
  from cloudzy.database import SessionLocal
45
  from cloudzy.models import Photo
@@ -115,18 +117,24 @@ class SearchEngine:
115
  - All photos get assigned to a cluster (no "orphans")
116
  - Deterministic results for same seed
117
  - Much faster for large datasets
 
118
 
119
  Args:
120
  top_k: Number of clusters (albums) to create
121
  seed: Random seed for reproducibility
122
 
123
  Returns:
124
- List of top_k albums, each album is a list of photo_ids
 
 
125
  """
126
  self.load()
127
- if self.index.ntotal < top_k:
128
  return []
129
 
 
 
 
130
  # Get all photo IDs from FAISS index
131
  id_map = self.index.id_map
132
  all_ids = np.array([id_map.at(i) for i in range(id_map.size())], dtype=np.int64)
@@ -135,10 +143,10 @@ class SearchEngine:
135
  underlying_index = faiss.downcast_index(self.index.index)
136
  all_embeddings = underlying_index.reconstruct_n(0, self.index.ntotal).astype(np.float32)
137
 
138
- # ✅ Run k-means clustering
139
  kmeans = faiss.Kmeans(
140
  d=self.dim,
141
- k=top_k,
142
  niter=20,
143
  verbose=False,
144
  seed=seed
@@ -149,7 +157,7 @@ class SearchEngine:
149
  distances, cluster_assignments = kmeans.index.search(all_embeddings, 1)
150
 
151
  # Group photos by cluster
152
- albums = [[] for _ in range(top_k)]
153
  for photo_id, cluster_id in zip(all_ids, cluster_assignments.flatten()):
154
  albums[cluster_id].append(int(photo_id))
155
 
 
24
  """
25
  Group similar images into albums (clusters).
26
 
27
+ Returns up to top_k albums, each containing up to album_size similar photos.
28
  Photos are marked as visited to avoid duplicate albums.
29
  Only includes photos within the distance threshold.
30
+ Automatically adjusts if fewer images than requested albums.
31
 
32
  OPTIMIZATIONS:
33
  - Batch retrieves all photos in ONE database query (not per-photo)
 
35
  - Single session for all DB operations
36
 
37
  Args:
38
+ top_k: Number of albums to return (returns fewer if not enough images)
39
  distance_threshold: Maximum distance to consider photos as similar (default 1.0 for normalized embeddings)
40
  album_size: How many similar photos to search for per album (default 5)
41
 
42
  Returns:
43
+ List of up to top_k albums, each album is a list of photo_ids (randomized order each call)
44
+ Returns empty list if no images exist.
45
  """
46
  from cloudzy.database import SessionLocal
47
  from cloudzy.models import Photo
 
117
  - All photos get assigned to a cluster (no "orphans")
118
  - Deterministic results for same seed
119
  - Much faster for large datasets
120
+ - Automatically adjusts if fewer images than requested clusters
121
 
122
  Args:
123
  top_k: Number of clusters (albums) to create
124
  seed: Random seed for reproducibility
125
 
126
  Returns:
127
+ List of albums, each album is a list of photo_ids.
128
+ Returns up to top_k albums, or fewer if total images < top_k.
129
+ Returns empty list if no images exist.
130
  """
131
  self.load()
132
+ if self.index.ntotal == 0:
133
  return []
134
 
135
+ # Adjust k to not exceed total number of images
136
+ actual_k = min(top_k, self.index.ntotal)
137
+
138
  # Get all photo IDs from FAISS index
139
  id_map = self.index.id_map
140
  all_ids = np.array([id_map.at(i) for i in range(id_map.size())], dtype=np.int64)
 
143
  underlying_index = faiss.downcast_index(self.index.index)
144
  all_embeddings = underlying_index.reconstruct_n(0, self.index.ntotal).astype(np.float32)
145
 
146
+ # ✅ Run k-means clustering with adjusted k
147
  kmeans = faiss.Kmeans(
148
  d=self.dim,
149
+ k=actual_k,
150
  niter=20,
151
  verbose=False,
152
  seed=seed
 
157
  distances, cluster_assignments = kmeans.index.search(all_embeddings, 1)
158
 
159
  # Group photos by cluster
160
+ albums = [[] for _ in range(actual_k)]
161
  for photo_id, cluster_id in zip(all_ids, cluster_assignments.flatten()):
162
  albums[cluster_id].append(int(photo_id))
163