Spaces:
Sleeping
Sleeping
File size: 7,546 Bytes
edba165 8780754 edba165 d57d2e9 edba165 edd2633 edba165 c0a6c3c edba165 edd2633 c0a6c3c edd2633 cc02f09 edd2633 edba165 ebc2986 edba165 8780754 edba165 edd2633 edba165 ebc2986 edba165 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 |
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from huggingface_hub import snapshot_download
from datasets import load_dataset
from gensim.models import FastText
from s2sphere import CellId, Cell, LatLng
from collections import defaultdict
import folium
from folium import Map
import gradio as gr
from gradio_folium import Folium
from sklearn.cluster import KMeans
def extract_restaurant_embeddings(model, processed_df):
    """
    Collect one embedding vector per distinct restaurant cell.

    Each unique value of ``processed_df['res_cell_id']`` is stringified and
    looked up in ``model.wv``. Ids whose lookup raises ``KeyError`` are
    reported on stdout and left out of the result.

    Returns a dict keyed by the original (un-stringified) cell id.
    """
    embeddings_by_cell = {}
    for restaurant_id in processed_df['res_cell_id'].unique():
        try:
            # The vocabulary key is the bare cell id, with no prefix.
            vector = model.wv[str(restaurant_id)]
        except KeyError:
            print(f"Warning: Restaurant {restaurant_id} not found in vocabulary")
        else:
            embeddings_by_cell[restaurant_id] = vector
    return embeddings_by_cell
def cluster_embeddings(restaurant_embeddings, algo):
    """
    Assign every restaurant to a cluster.

    The embedding vectors are stacked (in dict order) into one NumPy array and
    handed to ``algo.fit_predict``; the labels it returns are paired back with
    the restaurant ids in the same order.

    Returns a dict mapping restaurant id -> cluster label.
    """
    ids = []
    vectors = []
    for res_id, vector in restaurant_embeddings.items():
        ids.append(res_id)
        vectors.append(vector)

    assignments = algo.fit_predict(np.array(vectors))
    return {res_id: label for res_id, label in zip(ids, assignments)}
def s2_cell_to_geojson(cell_id_token_or_int):
    """
    Build a GeoJSON ``Feature`` outlining a single S2 cell.

    Args:
        cell_id_token_or_int: Either an S2 token string (parsed with
            ``CellId.from_token``) or an integer cell id. Integer-like
            scalars such as NumPy integers are accepted.

    Returns:
        dict: A GeoJSON Feature whose geometry is a closed Polygon ring of
        ``[lng, lat]`` pairs in degrees (the four cell vertices plus the
        first one repeated), with ``cell_id`` (``str`` of the ``CellId``)
        and ``level`` properties.
    """
    if isinstance(cell_id_token_or_int, str):
        cell_id = CellId.from_token(cell_id_token_or_int)
    else:
        # Ids that come out of pandas/NumPy are fixed-width scalars rather
        # than Python ints; normalise them so the 64-bit id arithmetic is done
        # with arbitrary-precision integers and cannot overflow or go float.
        cell_id = CellId(int(cell_id_token_or_int))
    cell = Cell(cell_id)

    # Corner coordinates; GeoJSON orders each position as [lng, lat].
    ring = []
    for corner in range(4):
        latlng = LatLng.from_point(cell.get_vertex(corner))
        ring.append([latlng.lng().degrees, latlng.lat().degrees])
    ring.append(ring[0])  # repeat the first corner to close the polygon

    return {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            "coordinates": [ring],
        },
        "properties": {
            "cell_id": str(cell_id),
            "level": cell_id.level(),
        },
    }
def map_cluster_to_restaurants(restaurant_clusters):
    """
    Invert a restaurant -> cluster mapping.

    Returns a ``defaultdict(list)`` mapping each cluster id to the restaurant
    ids assigned to it, in the order they appear in ``restaurant_clusters``.
    """
    members_by_cluster = defaultdict(list)
    for restaurant, cluster in restaurant_clusters.items():
        bucket = members_by_cluster[cluster]
        bucket.append(restaurant)
    return members_by_cluster
def get_cluster_jsons(cluster_to_restaurants):
    """
    Turn each cluster's cell ids into a GeoJSON FeatureCollection.

    One FeatureCollection is produced per entry of ``cluster_to_restaurants``,
    in the mapping's iteration order. Conversion is best-effort: a cell that
    fails to convert is reported on stdout and skipped, so a collection may
    hold fewer features than the cluster has cells.

    Returns a list of FeatureCollection dicts.
    """
    collections = []
    for res_ids in cluster_to_restaurants.values():
        features = []
        for cell_id in res_ids:
            try:
                features.append(s2_cell_to_geojson(cell_id))
            except Exception as e:
                print(f"Error converting {cell_id}: {e}")
        collections.append({"type": "FeatureCollection", "features": features})
    return collections
def visualise_on_map(jsons):
    """
    Render a list of GeoJSON objects as coloured layers on a folium map.

    The map is created at a fixed centre and zoom. Each entry of ``jsons`` is
    added as its own layer named ``Cluster <index>``, where the index is the
    entry's position in the list. The layer colour is derived from that index
    (index * 123456 mod 0xFFFFFF, as a 6-digit hex code), so index 0 is drawn
    in black. A layer that fails to build is reported on stdout and skipped.

    Returns the folium ``Map`` with a layer control attached.
    """
    cluster_map = Map(location=[12.935656, 77.543204], zoom_start=12)

    for i, geojson in enumerate(jsons):
        label = f"Cluster {i}"
        shade = f"#{i*123456%0xFFFFFF:06x}"
        try:
            layer = folium.GeoJson(
                geojson,
                name=label,
                tooltip=label,
                # Bind the colour as a default argument so each layer keeps
                # its own value rather than the loop's final one.
                style_function=lambda feature, color=shade: {
                    "fillColor": color,
                    "color": color,
                    "weight": 1,
                    "fillOpacity": 0.4,
                },
            )
            layer.add_to(cluster_map)
        except Exception as e:
            print(f"Failed to add cluster {i}: {e}")

    # Layer control lets the user toggle individual clusters.
    folium.LayerControl().add_to(cluster_map)
    return cluster_map
# Hub repository id for the model artefacts.
# NOTE(review): REPO_ID is never read in the visible code, and the imported
# snapshot_download is never called — presumably the model was meant to be
# fetched from this repo before loading; confirm.
REPO_ID = "ankush-003/fastCell"
# Load the dataset and materialise its 'train' split as a pandas DataFrame.
dataset = load_dataset("ankush-003/Cells_Data")
df = dataset['train'].to_pandas()
# Load the trained FastText model from a relative path.
# NOTE(review): the file name is spelled with three d's ("embedddings");
# confirm it matches the artefact that is actually on disk.
model = FastText.load(
"cell_embedddings_model"
)
# Embeddings are extracted once at import time and reused by every
# clustering run.
restaurant_embeddings = extract_restaurant_embeddings(model, df)
# Cache of the per-cluster GeoJSON FeatureCollections from the most recent
# run_clustering call; stays None until the first run.
clusters_jsons = None
def run_clustering(num_clusters, clusters_to_display):
    """
    Run K-means over the restaurant embeddings and build the report and map.

    Args:
        num_clusters: Requested number of clusters (K). Coerced to ``int`` and
            capped at the number of embedded restaurants.
        clusters_to_display: How many of the resulting clusters to draw.
            Coerced to ``int`` and capped at the number of clusters built.

    Returns:
        tuple: ``(analysis, m)`` — a Markdown summary string and the folium
        map holding the displayed clusters.

    Side effects:
        Overwrites the ``cluster`` column of the module-level ``df`` and
        replaces the module-level ``clusters_jsons`` cache, which
        ``update_display`` reads.
    """
    global clusters_jsons

    # Slider values may arrive as floats; KMeans needs an integer K and list
    # slicing needs an integer bound.
    num_clusters = int(num_clusters)
    clusters_to_display = int(clusters_to_display)
    # KMeans cannot form more clusters than it has samples, so cap K instead
    # of letting the fit raise.
    num_clusters = min(num_clusters, len(restaurant_embeddings))

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    restaurant_clusters = cluster_embeddings(restaurant_embeddings, kmeans)
    df['cluster'] = df['res_cell_id'].map(restaurant_clusters)

    # Rows of df per cluster label.
    # NOTE(review): these sizes and the "Total restaurants" figure count rows
    # of df; they equal restaurant counts only if res_cell_id is unique per
    # row — confirm.
    cluster_sizes = df['cluster'].value_counts().sort_index()
    avg_size = cluster_sizes.mean()
    min_size = cluster_sizes.min()
    max_size = cluster_sizes.max()

    analysis = (
        "\n"
        f"## Clustering Analysis (K={num_clusters})\n"
        f"- Total restaurants: {len(df)}\n"
        f"- Number of clusters: {num_clusters}\n"
        f"- Average restaurants per cluster: {avg_size:.1f}\n"
        f"- Smallest cluster size: {min_size}\n"
        f"- Largest cluster size: {max_size}\n"
        f"- Empty clusters: {num_clusters - len(cluster_sizes)}\n"
    )

    c_to_r = map_cluster_to_restaurants(restaurant_clusters)
    clusters_jsons = get_cluster_jsons(c_to_r)

    # Never ask for more clusters than were actually produced.
    shown = min(clusters_to_display, len(clusters_jsons))
    m = visualise_on_map(clusters_jsons[:shown])
    return analysis, m
def update_display(clusters_to_display):
    """
    Redraw the map with a different number of clusters, without re-clustering.

    Args:
        clusters_to_display: How many cached clusters to draw. Coerced to
            ``int`` and capped at the number of cached clusters.

    Returns:
        A folium ``Map``: an empty map at the default centre when no
        clustering has been run yet (``clusters_jsons`` is ``None``),
        otherwise a map of the first ``clusters_to_display`` cached clusters.
    """
    if clusters_jsons is None:
        return Map(location=[12.935656, 77.543204], zoom_start=12)

    # Slider values may arrive as floats, which cannot be used as a slice
    # bound; also never ask for more clusters than exist.
    shown = min(int(clusters_to_display), len(clusters_jsons))
    return visualise_on_map(clusters_jsons[:shown])
# Create the Gradio interface: two sliders, a run button, a Markdown report
# and a Folium map, followed by a short usage note.
# NOTE(review): indentation was lost in this copy of the file, so the nesting
# of the Blocks/Row/Column context managers below cannot be read off the text;
# restore it before running.
with gr.Blocks(title="Restaurant Clustering Tool") as app:
gr.Markdown("# Restaurant Cell Embeddings Clustering Analysis")
with gr.Row():
with gr.Column(scale=1):
# K passed to run_clustering as num_clusters.
num_clusters_input = gr.Slider(
minimum=2,
maximum=3460,
value=300,
step=1,
label="Total Number of Clusters (K)"
)
# How many of the computed clusters are drawn on the map.
display_clusters_input = gr.Slider(
minimum=1,
maximum=3460,
value=10,
step=1,
label="Number of Clusters to Display"
)
with gr.Row():
cluster_btn = gr.Button("Run Clustering")
with gr.Row():
# Receives the Markdown summary returned by run_clustering.
output_text = gr.Markdown()
with gr.Row():
# Starts as an empty map at the same centre/zoom used by visualise_on_map.
output_plot = Folium(value=Map(location=[12.935656, 77.543204], zoom_start=12), height=1000)
# The button re-runs clustering and refreshes both the report and the map.
cluster_btn.click(
fn=run_clustering,
inputs=[num_clusters_input, display_clusters_input],
outputs=[output_text, output_plot]
)
# Moving the display slider only redraws the map from the cached clusters.
display_clusters_input.change(update_display, inputs=[display_clusters_input], outputs=[output_plot])
gr.Markdown("""
## About this app
This app demonstrates K-means clustering on restaurant cell embeddings. The algorithm groups similar restaurants together based on cell embeddings.
### How to use:
1. Adjust the number of clusters using the slider
2. Click "Run Clustering" to see the results
3. Analyze the visualization and metrics
""")
if __name__ == "__main__":
    # Start the Gradio server only when the file is executed as a script.
    app.launch()