Spaces:

ankush-003
/

cell_cluster

Sleeping

File size: 7,546 Bytes

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from huggingface_hub import snapshot_download
from datasets import load_dataset
from gensim.models import FastText
from s2sphere import CellId, Cell, LatLng
from collections import defaultdict
import folium
from folium import Map
import gradio as gr
from gradio_folium import Folium
from sklearn.cluster import KMeans

def extract_restaurant_embeddings(model, processed_df):
    """
    Collect one embedding vector per distinct restaurant cell ID.

    Each unique value of the ``res_cell_id`` column in ``processed_df`` is
    converted with ``str()`` and looked up in ``model.wv``. IDs whose lookup
    raises ``KeyError`` are reported on stdout and left out of the result.

    Returns a dict mapping the original (un-stringified) ID to its vector.
    """
    embeddings_by_id = {}
    for cell in processed_df['res_cell_id'].unique():
        # The vocabulary token is the bare cell ID, with no prefix.
        # NOTE(review): a FastText model may synthesise vectors for unseen
        # tokens from n-grams, in which case KeyError rarely fires — confirm.
        try:
            vector = model.wv[str(cell)]
        except KeyError:
            print(f"Warning: Restaurant {cell} not found in vocabulary")
        else:
            embeddings_by_id[cell] = vector
    return embeddings_by_id

def cluster_embeddings(restaurant_embeddings, algo):
    """
    Cluster the embedding vectors and return a label for each restaurant.

    The vectors are stacked, in dict order, into one NumPy array that is
    handed to ``algo.fit_predict``; the labels it returns are paired back
    with the IDs in that same order.

    Returns a dict mapping restaurant ID -> cluster label.
    """
    ordered_ids = []
    vectors = []
    for res_id, vector in restaurant_embeddings.items():
        ordered_ids.append(res_id)
        vectors.append(vector)

    assignments = algo.fit_predict(np.array(vectors))
    return {res_id: label for res_id, label in zip(ordered_ids, assignments)}

def s2_cell_to_geojson(cell_id_token_or_int):
    """
    Build a GeoJSON ``Feature`` outlining one S2 cell.

    Args:
        cell_id_token_or_int: Either an S2 cell token (``str``) or the
            cell's integer ID. Any integer-like object is accepted,
            including NumPy integer scalars such as those produced by
            iterating over a pandas integer column.

    Returns:
        dict: A GeoJSON Feature whose geometry is a Polygon made of the
        cell's four vertices (as ``[lng, lat]`` degrees) plus a repeat of
        the first vertex to close the ring. ``properties`` holds
        ``cell_id`` (``str()`` of the ``CellId`` object) and ``level``.

    Raises:
        TypeError: If the argument is neither a string nor integer-like
            (for example a float).
    """
    import operator  # local import: only this function needs it

    if isinstance(cell_id_token_or_int, str):
        cell_id = CellId.from_token(str(cell_id_token_or_int))
    else:
        # Hand s2sphere a built-in int. Its 64-bit ID arithmetic is written
        # for arbitrary-precision Python ints; fixed-width NumPy scalars can
        # overflow or be promoted to float there. operator.index() (unlike
        # int()) refuses floats, so a lossy ID is rejected instead of being
        # silently truncated.
        cell_id = CellId(operator.index(cell_id_token_or_int))

    cell = Cell(cell_id)

    # Get cell corner coordinates
    coords = []
    for i in range(4):
        latlng = LatLng.from_point(cell.get_vertex(i))
        coords.append([latlng.lng().degrees, latlng.lat().degrees])  # GeoJSON uses [lng, lat]
    coords.append(coords[0])  # Close the polygon

    # Build GeoJSON
    return {
        "type": "Feature",
        "geometry": {
            "type": "Polygon",
            "coordinates": [coords],
        },
        "properties": {
            "cell_id": str(cell_id),
            "level": cell_id.level(),
        },
    }

def map_cluster_to_restaurants(restaurant_clusters):
    """
    Invert a restaurant -> cluster mapping.

    Returns a ``defaultdict(list)`` keyed by cluster ID whose values are the
    restaurant IDs assigned to that cluster, in the order they appear in
    ``restaurant_clusters``.
    """
    members_by_cluster = defaultdict(list)
    for restaurant, label in restaurant_clusters.items():
        bucket = members_by_cluster[label]
        bucket.append(restaurant)
    return members_by_cluster

def get_cluster_jsons(cluster_to_restaurants):
    """
    Turn each cluster's cell IDs into a GeoJSON FeatureCollection.

    Produces one FeatureCollection per entry of ``cluster_to_restaurants``,
    in iteration order. Conversion is best-effort: a cell that fails to
    convert is reported on stdout and omitted, so a collection may end up
    with fewer features than the cluster has cells (possibly none).

    Returns a list of FeatureCollection dicts.
    """
    collections = []
    for member_cells in cluster_to_restaurants.values():
        polygons = []
        for cell_id in member_cells:
            try:
                polygons.append(s2_cell_to_geojson(cell_id))
            except Exception as e:
                print(f"Error converting {cell_id}: {e}")

        # Build GeoJSON FeatureCollection
        collections.append({"type": "FeatureCollection", "features": polygons})
    return collections

def visualise_on_map(jsons):
    """
    Draw each cluster's GeoJSON as its own coloured layer on a folium map.

    The i-th item of ``jsons`` becomes a layer named and tooltipped
    "Cluster i". A layer that fails to build or attach is reported on stdout
    and skipped. A layer control is added so layers can be toggled.

    Returns the folium ``Map``.
    """
    # The map always opens on this fixed centre and zoom level.
    cluster_map = Map(location=[12.935656, 77.543204], zoom_start=12)

    for index, cluster_geojson in enumerate(jsons):
        try:
            label = f"Cluster {index}"
            # Colour is derived from the index alone, so a given position
            # always gets the same colour.
            hex_colour = f"#{index*123456%0xFFFFFF:06x}"

            # Bind the colour as a default so each layer keeps its own value
            # instead of sharing the loop variable.
            def style_for(feature, color=hex_colour):
                return {
                    "fillColor": color,
                    "color": color,
                    "weight": 1,
                    "fillOpacity": 0.4,
                }

            layer = folium.GeoJson(
                cluster_geojson,
                name=label,
                tooltip=label,
                style_function=style_for,
            )
            layer.add_to(cluster_map)
        except Exception as e:
            print(f"Failed to add cluster {index}: {e}")

    # Optional: Add a layer control to toggle clusters
    folium.LayerControl().add_to(cluster_map)

    return cluster_map

# Hub repository identifier.
# NOTE(review): not referenced anywhere else in the visible file — presumably
# the model was meant to be fetched from it; confirm or remove.
REPO_ID = "ankush-003/fastCell"

# Fetch the dataset at import time and materialise its 'train' split as a
# pandas DataFrame.
dataset = load_dataset("ankush-003/Cells_Data")
df = dataset['train'].to_pandas()

# Load the saved FastText model from this path.
# NOTE(review): "embedddings" is spelled with three d's — confirm it matches
# the artifact's actual file name before "fixing" it.
model = FastText.load(
    "cell_embedddings_model"
)

# Computed once at start-up; every clustering run reuses these vectors.
restaurant_embeddings = extract_restaurant_embeddings(model, df)

# Per-cluster GeoJSON from the most recent clustering run; None until
# run_clustering has been called at least once.
clusters_jsons = None


def run_clustering(num_clusters, clusters_to_display):
    """
    Run K-means on the restaurant embeddings and render the result.

    Side effects: writes the assigned label into ``df['cluster']`` and
    replaces the module-level ``clusters_jsons`` cache with the GeoJSON of
    the new clustering.

    Args:
        num_clusters: K for K-means.
        clusters_to_display: Upper bound on how many clusters are drawn;
            capped at the number of clusters actually produced.

    Returns:
        A ``(markdown_summary, folium_map)`` tuple.
    """
    global clusters_jsons

    # Fixed random_state keeps repeated runs with the same K reproducible.
    estimator = KMeans(n_clusters=num_clusters, random_state=42)
    restaurant_clusters = cluster_embeddings(restaurant_embeddings, estimator)
    df['cluster'] = df['res_cell_id'].map(restaurant_clusters)

    # Per-cluster row counts, ordered by cluster label.
    cluster_sizes = df['cluster'].value_counts().sort_index()
    avg_size, min_size, max_size = (
        cluster_sizes.mean(),
        cluster_sizes.min(),
        cluster_sizes.max(),
    )

    analysis = f"""
    ## Clustering Analysis (K={num_clusters})
    
    - Total restaurants: {len(df)}
    - Number of clusters: {num_clusters}
    - Average restaurants per cluster: {avg_size:.1f}
    - Smallest cluster size: {min_size}
    - Largest cluster size: {max_size}
    - Empty clusters: {num_clusters - len(cluster_sizes)}
    """

    # Refresh the cache that update_display reads from.
    clusters_jsons = get_cluster_jsons(map_cluster_to_restaurants(restaurant_clusters))

    # Never ask for more clusters than were produced.
    shown = min(clusters_to_display, len(clusters_jsons))
    return analysis, visualise_on_map(clusters_jsons[:shown])

def update_display(clusters_to_display):
    """
    Redraw the map from the cached clustering without re-running it.

    Returns an empty map at the default centre when no clustering has been
    run yet; otherwise a map showing at most ``clusters_to_display`` of the
    cached clusters.
    """
    cached = clusters_jsons
    if cached is None:
        return Map(location=[12.935656, 77.543204], zoom_start=12)

    # Ensure we don't try to show more clusters than exist
    limit = min(clusters_to_display, len(cached))

    # Create map visualization with selected number of clusters
    return visualise_on_map(cached[:limit])

# Create Gradio interface
# Layout, top to bottom: title, the two sliders, the run button, the Markdown
# summary, the map, and a static "About" section.
with gr.Blocks(title="Restaurant Clustering Tool") as app:
    gr.Markdown("# Restaurant Cell Embeddings Clustering Analysis")
    
    with gr.Row():
        with gr.Column(scale=1):
            # K handed to run_clustering as num_clusters.
            # NOTE(review): the 3460 upper bound is hard-coded — presumably
            # the number of distinct cells in the dataset; confirm.
            num_clusters_input = gr.Slider(
                minimum=2, 
                maximum=3460, 
                value=300, 
                step=1, 
                label="Total Number of Clusters (K)"
            )
            
            # How many of the computed clusters are drawn on the map.
            display_clusters_input = gr.Slider(
                minimum=1, 
                maximum=3460, 
                value=10, 
                step=1, 
                label="Number of Clusters to Display"
            )
    
    with gr.Row():
        cluster_btn = gr.Button("Run Clustering")
    
    with gr.Row():
        # Receives the Markdown summary returned by run_clustering.
        output_text = gr.Markdown()
    
    with gr.Row():
        # Starts as an empty map; replaced by the map each callback returns.
        output_plot = Folium(value=Map(location=[12.935656, 77.543204], zoom_start=12), height=1000)
    
    # Button click: cluster with the current slider values, then update both
    # the summary text and the map.
    cluster_btn.click(
        fn=run_clustering,
        inputs=[num_clusters_input, display_clusters_input],
        outputs=[output_text, output_plot]
    )

    # Moving the display slider only redraws the map via update_display; it
    # does not re-run the clustering.
    display_clusters_input.change(update_display, inputs=[display_clusters_input], outputs=[output_plot])
    
    gr.Markdown("""
    ## About this app
    
    This app demonstrates K-means clustering on restaurant cell embeddings. The algorithm groups similar restaurants together based on cell embeddings.
    
    ### How to use:
    1. Adjust the number of clusters using the slider
    2. Click "Run Clustering" to see the results
    3. Analyze the visualization and metrics
    """)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()