Spaces:

userx2000
/

cloudzy_ai_challenge

Sleeping

App Files Files Community

matinsn2000 committed 22 days ago

Commit

c2cd7f1

1 Parent(s): cbab173

Fixed the problem with k-means clustering

Browse files

Files changed (2) hide show

cloudzy/routes/photo.py +10 -2
cloudzy/search_engine.py +16 -8

cloudzy/routes/photo.py CHANGED Viewed

@@ -81,15 +81,23 @@ async def list_photos(
 @router.get("/albums", response_model=AlbumsResponse)
 async def get_albums(
-    top_k: int = Query(5, ge=1, le=5),
     session: Session = Depends(get_session),
 ):
     """
     Create albums of semantically similar photos.
     """
     search_engine = SearchEngine()
-    albums_ids = search_engine.create_albums(top_k=top_k)
     APP_DOMAIN = os.getenv("APP_DOMAIN") or "http://127.0.0.1:8000/"
     summarizer = TextSummarizer()

 @router.get("/albums", response_model=AlbumsResponse)
 async def get_albums(
+    top_k: int = Query(2, ge=1, le=5),
     session: Session = Depends(get_session),
 ):
     """
     Create albums of semantically similar photos.
+    Returns albums of grouped photos. If fewer images exist than requested albums,
+    returns what's available instead of an empty list.
     """
     search_engine = SearchEngine()
+    albums_ids = search_engine.create_albums_kmeans(top_k=top_k)
+    # Handle case where no albums were created (no images in database)
+    if not albums_ids:
+        return []
     APP_DOMAIN = os.getenv("APP_DOMAIN") or "http://127.0.0.1:8000/"
     summarizer = TextSummarizer()

cloudzy/search_engine.py CHANGED Viewed

@@ -24,9 +24,10 @@ class SearchEngine:
         """
         Group similar images into albums (clusters).
-        Returns exactly top_k albums, each containing up to album_size similar photos.
         Photos are marked as visited to avoid duplicate albums.
         Only includes photos within the distance threshold.
         OPTIMIZATIONS:
         - Batch retrieves all photos in ONE database query (not per-photo)
@@ -34,12 +35,13 @@ class SearchEngine:
         - Single session for all DB operations
         Args:
-            top_k: Number of albums to return
             distance_threshold: Maximum distance to consider photos as similar (default 1.0 for normalized embeddings)
             album_size: How many similar photos to search for per album (default 5)
         Returns:
-            List of top_k albums, each album is a list of photo_ids (randomized order each call)
         """
         from cloudzy.database import SessionLocal
         from cloudzy.models import Photo
@@ -115,18 +117,24 @@ class SearchEngine:
         - All photos get assigned to a cluster (no "orphans")
         - Deterministic results for same seed
         - Much faster for large datasets
         Args:
             top_k: Number of clusters (albums) to create
             seed: Random seed for reproducibility
         Returns:
-            List of top_k albums, each album is a list of photo_ids
         """
         self.load()
-        if self.index.ntotal < top_k:
             return []
         # Get all photo IDs from FAISS index
         id_map = self.index.id_map
         all_ids = np.array([id_map.at(i) for i in range(id_map.size())], dtype=np.int64)
@@ -135,10 +143,10 @@ class SearchEngine:
         underlying_index = faiss.downcast_index(self.index.index)
         all_embeddings = underlying_index.reconstruct_n(0, self.index.ntotal).astype(np.float32)
-        # ✅ Run k-means clustering
         kmeans = faiss.Kmeans(
             d=self.dim,
-            k=top_k,
             niter=20,
             verbose=False,
             seed=seed
@@ -149,7 +157,7 @@ class SearchEngine:
         distances, cluster_assignments = kmeans.index.search(all_embeddings, 1)
         # Group photos by cluster
-        albums = [[] for _ in range(top_k)]
         for photo_id, cluster_id in zip(all_ids, cluster_assignments.flatten()):
             albums[cluster_id].append(int(photo_id))

         """
         Group similar images into albums (clusters).
+        Returns up to top_k albums, each containing up to album_size similar photos.
         Photos are marked as visited to avoid duplicate albums.
         Only includes photos within the distance threshold.
+        Automatically adjusts if fewer images than requested albums.
         OPTIMIZATIONS:
         - Batch retrieves all photos in ONE database query (not per-photo)
         - Single session for all DB operations
         Args:
+            top_k: Number of albums to return (returns fewer if not enough images)
             distance_threshold: Maximum distance to consider photos as similar (default 1.0 for normalized embeddings)
             album_size: How many similar photos to search for per album (default 5)
         Returns:
+            List of up to top_k albums, each album is a list of photo_ids (randomized order each call)
+            Returns empty list if no images exist.
         """
         from cloudzy.database import SessionLocal
         from cloudzy.models import Photo
         - All photos get assigned to a cluster (no "orphans")
         - Deterministic results for same seed
         - Much faster for large datasets
+        - Automatically adjusts if fewer images than requested clusters
         Args:
             top_k: Number of clusters (albums) to create
             seed: Random seed for reproducibility
         Returns:
+            List of albums, each album is a list of photo_ids.
+            Returns up to top_k albums, or fewer if total images < top_k.
+            Returns empty list if no images exist.
         """
         self.load()
+        if self.index.ntotal == 0:
             return []
+        # Adjust k to not exceed total number of images
+        actual_k = min(top_k, self.index.ntotal)
         # Get all photo IDs from FAISS index
         id_map = self.index.id_map
         all_ids = np.array([id_map.at(i) for i in range(id_map.size())], dtype=np.int64)
         underlying_index = faiss.downcast_index(self.index.index)
         all_embeddings = underlying_index.reconstruct_n(0, self.index.ntotal).astype(np.float32)
+        # ✅ Run k-means clustering with adjusted k
         kmeans = faiss.Kmeans(
             d=self.dim,
+            k=actual_k,
             niter=20,
             verbose=False,
             seed=seed
         distances, cluster_assignments = kmeans.index.search(all_embeddings, 1)
         # Group photos by cluster
+        albums = [[] for _ in range(actual_k)]
         for photo_id, cluster_id in zip(all_ids, cluster_assignments.flatten()):
             albums[cluster_id].append(int(photo_id))