Spaces:
Sleeping
Sleeping
Commit
·
c2cd7f1
1
Parent(s):
cbab173
Fixed the problem with k-means clustering
Browse files
- cloudzy/routes/photo.py +10 -2
- cloudzy/search_engine.py +16 -8
cloudzy/routes/photo.py
CHANGED
|
@@ -81,15 +81,23 @@ async def list_photos(
|
|
| 81 |
|
| 82 |
@router.get("/albums", response_model=AlbumsResponse)
|
| 83 |
async def get_albums(
|
| 84 |
-
top_k: int = Query(
|
| 85 |
session: Session = Depends(get_session),
|
| 86 |
):
|
| 87 |
"""
|
| 88 |
Create albums of semantically similar photos.
|
|
|
|
|
|
|
|
|
|
| 89 |
"""
|
| 90 |
|
| 91 |
search_engine = SearchEngine()
|
| 92 |
-
albums_ids = search_engine.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
APP_DOMAIN = os.getenv("APP_DOMAIN") or "http://127.0.0.1:8000/"
|
| 94 |
summarizer = TextSummarizer()
|
| 95 |
|
|
|
|
| 81 |
|
| 82 |
@router.get("/albums", response_model=AlbumsResponse)
|
| 83 |
async def get_albums(
|
| 84 |
+
top_k: int = Query(2, ge=1, le=5),
|
| 85 |
session: Session = Depends(get_session),
|
| 86 |
):
|
| 87 |
"""
|
| 88 |
Create albums of semantically similar photos.
|
| 89 |
+
|
| 90 |
+
Returns albums of grouped photos. If fewer images exist than requested albums,
|
| 91 |
+
returns what's available instead of an empty list.
|
| 92 |
"""
|
| 93 |
|
| 94 |
search_engine = SearchEngine()
|
| 95 |
+
albums_ids = search_engine.create_albums_kmeans(top_k=top_k)
|
| 96 |
+
|
| 97 |
+
# Handle case where no albums were created (no images in database)
|
| 98 |
+
if not albums_ids:
|
| 99 |
+
return []
|
| 100 |
+
|
| 101 |
APP_DOMAIN = os.getenv("APP_DOMAIN") or "http://127.0.0.1:8000/"
|
| 102 |
summarizer = TextSummarizer()
|
| 103 |
|
cloudzy/search_engine.py
CHANGED
|
@@ -24,9 +24,10 @@ class SearchEngine:
|
|
| 24 |
"""
|
| 25 |
Group similar images into albums (clusters).
|
| 26 |
|
| 27 |
-
Returns
|
| 28 |
Photos are marked as visited to avoid duplicate albums.
|
| 29 |
Only includes photos within the distance threshold.
|
|
|
|
| 30 |
|
| 31 |
OPTIMIZATIONS:
|
| 32 |
- Batch retrieves all photos in ONE database query (not per-photo)
|
|
@@ -34,12 +35,13 @@ class SearchEngine:
|
|
| 34 |
- Single session for all DB operations
|
| 35 |
|
| 36 |
Args:
|
| 37 |
-
top_k: Number of albums to return
|
| 38 |
distance_threshold: Maximum distance to consider photos as similar (default 1.0 for normalized embeddings)
|
| 39 |
album_size: How many similar photos to search for per album (default 5)
|
| 40 |
|
| 41 |
Returns:
|
| 42 |
-
List of top_k albums, each album is a list of photo_ids (randomized order each call)
|
|
|
|
| 43 |
"""
|
| 44 |
from cloudzy.database import SessionLocal
|
| 45 |
from cloudzy.models import Photo
|
|
@@ -115,18 +117,24 @@ class SearchEngine:
|
|
| 115 |
- All photos get assigned to a cluster (no "orphans")
|
| 116 |
- Deterministic results for same seed
|
| 117 |
- Much faster for large datasets
|
|
|
|
| 118 |
|
| 119 |
Args:
|
| 120 |
top_k: Number of clusters (albums) to create
|
| 121 |
seed: Random seed for reproducibility
|
| 122 |
|
| 123 |
Returns:
|
| 124 |
-
List of
|
|
|
|
|
|
|
| 125 |
"""
|
| 126 |
self.load()
|
| 127 |
-
if self.index.ntotal
|
| 128 |
return []
|
| 129 |
|
|
|
|
|
|
|
|
|
|
| 130 |
# Get all photo IDs from FAISS index
|
| 131 |
id_map = self.index.id_map
|
| 132 |
all_ids = np.array([id_map.at(i) for i in range(id_map.size())], dtype=np.int64)
|
|
@@ -135,10 +143,10 @@ class SearchEngine:
|
|
| 135 |
underlying_index = faiss.downcast_index(self.index.index)
|
| 136 |
all_embeddings = underlying_index.reconstruct_n(0, self.index.ntotal).astype(np.float32)
|
| 137 |
|
| 138 |
-
# ✅ Run k-means clustering
|
| 139 |
kmeans = faiss.Kmeans(
|
| 140 |
d=self.dim,
|
| 141 |
-
k=
|
| 142 |
niter=20,
|
| 143 |
verbose=False,
|
| 144 |
seed=seed
|
|
@@ -149,7 +157,7 @@ class SearchEngine:
|
|
| 149 |
distances, cluster_assignments = kmeans.index.search(all_embeddings, 1)
|
| 150 |
|
| 151 |
# Group photos by cluster
|
| 152 |
-
albums = [[] for _ in range(
|
| 153 |
for photo_id, cluster_id in zip(all_ids, cluster_assignments.flatten()):
|
| 154 |
albums[cluster_id].append(int(photo_id))
|
| 155 |
|
|
|
|
| 24 |
"""
|
| 25 |
Group similar images into albums (clusters).
|
| 26 |
|
| 27 |
+
Returns up to top_k albums, each containing up to album_size similar photos.
|
| 28 |
Photos are marked as visited to avoid duplicate albums.
|
| 29 |
Only includes photos within the distance threshold.
|
| 30 |
+
Automatically adjusts if fewer images than requested albums.
|
| 31 |
|
| 32 |
OPTIMIZATIONS:
|
| 33 |
- Batch retrieves all photos in ONE database query (not per-photo)
|
|
|
|
| 35 |
- Single session for all DB operations
|
| 36 |
|
| 37 |
Args:
|
| 38 |
+
top_k: Number of albums to return (returns fewer if not enough images)
|
| 39 |
distance_threshold: Maximum distance to consider photos as similar (default 1.0 for normalized embeddings)
|
| 40 |
album_size: How many similar photos to search for per album (default 5)
|
| 41 |
|
| 42 |
Returns:
|
| 43 |
+
List of up to top_k albums, each album is a list of photo_ids (randomized order each call)
|
| 44 |
+
Returns empty list if no images exist.
|
| 45 |
"""
|
| 46 |
from cloudzy.database import SessionLocal
|
| 47 |
from cloudzy.models import Photo
|
|
|
|
| 117 |
- All photos get assigned to a cluster (no "orphans")
|
| 118 |
- Deterministic results for same seed
|
| 119 |
- Much faster for large datasets
|
| 120 |
+
- Automatically adjusts if fewer images than requested clusters
|
| 121 |
|
| 122 |
Args:
|
| 123 |
top_k: Number of clusters (albums) to create
|
| 124 |
seed: Random seed for reproducibility
|
| 125 |
|
| 126 |
Returns:
|
| 127 |
+
List of albums, each album is a list of photo_ids.
|
| 128 |
+
Returns up to top_k albums, or fewer if total images < top_k.
|
| 129 |
+
Returns empty list if no images exist.
|
| 130 |
"""
|
| 131 |
self.load()
|
| 132 |
+
if self.index.ntotal == 0:
|
| 133 |
return []
|
| 134 |
|
| 135 |
+
# Adjust k to not exceed total number of images
|
| 136 |
+
actual_k = min(top_k, self.index.ntotal)
|
| 137 |
+
|
| 138 |
# Get all photo IDs from FAISS index
|
| 139 |
id_map = self.index.id_map
|
| 140 |
all_ids = np.array([id_map.at(i) for i in range(id_map.size())], dtype=np.int64)
|
|
|
|
| 143 |
underlying_index = faiss.downcast_index(self.index.index)
|
| 144 |
all_embeddings = underlying_index.reconstruct_n(0, self.index.ntotal).astype(np.float32)
|
| 145 |
|
| 146 |
+
# ✅ Run k-means clustering with adjusted k
|
| 147 |
kmeans = faiss.Kmeans(
|
| 148 |
d=self.dim,
|
| 149 |
+
k=actual_k,
|
| 150 |
niter=20,
|
| 151 |
verbose=False,
|
| 152 |
seed=seed
|
|
|
|
| 157 |
distances, cluster_assignments = kmeans.index.search(all_embeddings, 1)
|
| 158 |
|
| 159 |
# Group photos by cluster
|
| 160 |
+
albums = [[] for _ in range(actual_k)]
|
| 161 |
for photo_id, cluster_id in zip(all_ids, cluster_assignments.flatten()):
|
| 162 |
albums[cluster_id].append(int(photo_id))
|
| 163 |
|