Spaces:

MaxNoichl
/

clustering_explorer

Sleeping

App Files Files Community

Maximilian Noichl committed on Feb 18, 2024

Commit

c40a6e4

verified ·

1 Parent(s): 1e27f03

Create app.py

Browse files

Files changed (1) hide show

app.py +325 -0

app.py ADDED Viewed

	@@ -0,0 +1,325 @@

+# -*- coding: utf-8 -*-
+"""01_clustering_methods.ipynb
+Automatically generated by Colaboratory.
+Original file is located at
+    https://colab.research.google.com/drive/1mqAGInsaItbKYVUlP9muYz3fpdGBWFz5
+"""
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import sklearn.cluster as cluster
+import colormaps as cmaps
+# Global plot styling. NOTE(review): presumably importing `opinionated` is what
+# makes the "opinionated_rc" style name resolvable below — confirm before reordering.
+import opinionated
+plt.style.use("opinionated_rc")
+# Request the 'Quicksand' Google font via opinionated's helper, then select it
+# as the font family for every figure drawn afterwards.
+from opinionated.core import download_googlefont
+download_googlefont('Quicksand', add_to_cache=True)
+plt.rc('font', family='Quicksand')
+!wget https://github.com/scikit-learn-contrib/hdbscan/raw/master/notebooks/clusterable_data.npy
+!wget https://github.com/mwaskom/seaborn-data/raw/master/penguins.csv
+hdbscan_example_data = np.load('clusterable_data.npy')
+penguins_dataset = pd.read_csv('penguins.csv')[['bill_length_mm','bill_depth_mm','flipper_length_mm']].dropna().values
+from sklearn.preprocessing import StandardScaler
+scaler = StandardScaler()
+penguins_dataset_standardized = scaler.fit_transform(penguins_dataset)
+import gradio as gr
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.datasets import make_blobs, make_moons, load_iris
+import seaborn as sns
+import pandas as pd
+import matplotlib.colors as mcolors
+from sklearn.cluster import KMeans
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.mixture import GaussianMixture
+import hdbscan
+import genieclust
+# Pre-defined synthetic datasets, generated with a fixed random_state.
+blobs_X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
+moons_X, _ = make_moons(n_samples=300, noise=0.05, random_state=0)
+# NOTE(review): iris_X is loaded here but is not referenced anywhere else in
+# the visible code; the "Penguins" entry below uses the standardized penguin
+# measurements, not iris.
+iris_X, _ = load_iris(return_X_y=True)
+# Maps the dataset names offered in the UI to their feature arrays.
+datasets = {
+    "Blobs": blobs_X,
+    "Moons": moons_X,
+    "Penguins": penguins_dataset_standardized,  # standardized penguin measurements (three features)
+    "hDBSCAN sample": hdbscan_example_data
+}
+# Function for plotting the unclustered dataset
+def plot_unclustered(dataset_name):
+    X = datasets[dataset_name]  # Fetch dataset from the dictionary
+    # Check if the dataset has more than 2 dimensions
+    if X.shape[1] > 2:
+        # Convert dataset to DataFrame for seaborn pairplot
+        df = pd.DataFrame(X)
+        fig = sns.pairplot(df, plot_kws={'color': 'grey','alpha':0.7}, diag_kws={'color': 'grey'}).fig
+    else:
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.scatter(X[:, 0], X[:, 1], color='gray', marker='.',alpha=.7)
+        ax.set_xlabel("Feature 1")
+        ax.set_ylabel("Feature 2")
+        ax.grid(True)
+        plt.tight_layout()
+        plt.close(fig)
+    return fig
+def plot_clustered(dataset_name, clustering_method, kmeans_n_clusters, agg_n_clusters, agg_linkage, gmm_n_clusters, covariance_type,
+                   genie_n_clusters, gini_threshold, M,hdbscan_min_cluster_size, hdbscan_min_samples):
+    X = datasets[dataset_name]
+    # Determine the clustering method and fit the model accordingly
+    if clustering_method == "K-Means":
+        model = KMeans(n_clusters=kmeans_n_clusters)
+        model.fit(X)
+        labels = model.labels_  # For K-Means, labels are in .labels_
+    elif clustering_method == "Agglomerative":
+        model = AgglomerativeClustering(n_clusters=agg_n_clusters, linkage=agg_linkage)
+        model.fit(X)
+        labels = model.labels_  # For Agglomerative Clustering, labels are in .labels_
+    elif clustering_method == "Gaussian Mixture":
+        model = GaussianMixture(n_components=gmm_n_clusters, covariance_type=covariance_type)
+        model.fit(X)
+        labels = model.predict(X)  # For Gaussian Mixture, use .predict() to get labels
+    elif clustering_method == "Genie":
+        model = genieclust.Genie(n_clusters=genie_n_clusters, gini_threshold=gini_threshold, M=M)
+        labels = model.fit_predict(X)  # GenieClust uses fit_predict directly for both fitting and label prediction
+    elif clustering_method == "h-DBSCAN":
+        clusterer = hdbscan.HDBSCAN(min_cluster_size=hdbscan_min_cluster_size, min_samples=hdbscan_min_samples).fit(X)
+        labels = clusterer.labels_
+    n_clusters= len(np.unique([x for x in labels if x >= 0]))
+    if n_clusters <= 10:
+        original_cmap = cmaps.greenorange_12
+        colors = original_cmap([x for x in range(n_clusters)])
+        # Create a new listed colormap with the extracted colors
+        new_cmap = mcolors.ListedColormap(colors)
+    else:
+        new_cmap = cmaps.cet_g_bw_minc
+    cluster_colors = [new_cmap(x) if x >= 0
+                  else (0.5, 0.5, 0.5)
+                  for x in labels]
+    # Check if the dataset has more than 2 dimensions
+    if X.shape[1] > 2:
+        # Convert dataset to DataFrame for seaborn pairplot
+        df = pd.DataFrame(X)
+      # df['cluster'] = labels
+      #  fig = sns.pairplot(df, color = cluster_colors, cmap=new_cmap).fig
+            # Create bins for each variable
+        n_bins = 10
+        bins = {column: np.linspace(df[column].min(), df[column].max(), n_bins+1) for column in df.columns}
+        # Create a figure and axes
+        n = len(df.columns)
+        fig, axes = plt.subplots(nrows=n, ncols=n, figsize=(n*2.3, n*2.3))
+        for i in range(n):
+            for j in range(n):
+                ax = axes[i, j]
+                ax.grid(True, which='both', linestyle='--', linewidth=0.5)
+                if i != j:
+                    ax.scatter(df[df.columns[j]], df[df.columns[i]], c=cluster_colors, alpha=0.8, marker='o',s = 10)
+                else:  # Diagonal - Stacked Bar Charts
+                    data = df[df.columns[i]]
+                    counts = np.zeros((n_bins, n_clusters))
+                    for cluster in range(n_clusters):
+                        cluster_data = data[labels == cluster]
+                        hist, _ = np.histogram(cluster_data, bins=bins[df.columns[i]])
+                        counts[:, cluster] = hist
+                    for cluster in range(n_clusters):
+                        ax.bar(range(n_bins), counts[:, cluster], width=1, align='center',
+                              bottom=np.sum(counts[:, :cluster], axis=1), color=cluster_colors[list(labels).index(cluster)] )
+                # Explicit axis lines at the bottom and left
+                ax.spines['top'].set_visible(False)
+                ax.spines['right'].set_visible(False)
+                ax.spines['bottom'].set_visible(True)
+                ax.spines['left'].set_visible(True)
+                # Hide axis marks for inner plots and adjust label size
+                if i < n - 1:
+                    ax.tick_params(labelbottom=False)  # Hide x-axis labels for all but bottom row
+                else:
+                    ax.tick_params(axis='x', labelsize=8)  # Smaller labels for x-axis
+                if j > 0:
+                    ax.tick_params(labelleft=False)  # Hide y-axis labels for all but first column
+                else:
+                    ax.tick_params(axis='y', labelsize=8)  # Smaller labels for y-axis
+                # Set labels for outer plots only
+                if i == n - 1:
+                    ax.set_xlabel(df.columns[j], rotation=0, fontsize=12)
+                if j == 0:
+                    ax.set_ylabel(df.columns[i], fontsize=12)
+    else:
+        fig, ax = plt.subplots(figsize=(8, 6))
+        ax.scatter(X[:, 0], X[:, 1], c=cluster_colors,  marker='.')
+        ax.grid(True)
+        plt.tight_layout()
+        plt.close(fig)
+    return fig
+# Markdown text rendered at the top of the page (see gr.Markdown(intro_md) below).
+intro_md = """
+   # Cluster-algorithm-explorer
+    _by [Max Noichl](https://homepage.univie.ac.at/maximilian.noichl/), for the clustering & data-visualization-workshop, Bremen, 2024_
+    Below you can test a number of clustering-algorithms on several easier and harder datasets.
+    """
+# Gradio interface: the first column of the row holds the dataset/method
+# controls, the second column holds the unclustered and clustered plots.
+with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
+  with gr.Column():
+    gr.Markdown(intro_md)
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("# Choose your dataset:")
+            dataset_dropdown = gr.Dropdown(label="Select a dataset", choices=list(datasets.keys()), value="Blobs")
+            gr.Markdown("# Choose your Clustering algorithm & Parameters:")
+            # The choice strings must match the names compared in plot_clustered
+            # and update_method_params.
+            clustering_method_dropdown = gr.Dropdown(label="Select a clustering method", choices=["K-Means", "Agglomerative", "Gaussian Mixture", "Genie", "h-DBSCAN"], value="K-Means")
+            # One parameter group per method; only the K-Means group starts visible,
+            # matching the dropdown's default value.
+            # K-Means parameters
+            with gr.Group(visible=True) as kmeans_params_group:
+                kmeans_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (K-Means)", value=4)
+            # Agglomerative Clustering parameters
+            with gr.Group(visible=False) as agglomerative_params_group:
+                agg_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (Agglomerative)", value=4)
+                agg_linkage_dropdown = gr.Dropdown(label="Linkage Type", choices=["ward", "complete", "average", "single"], value="ward")
+            # Gaussian Mixture Model parameters
+            with gr.Group(visible=False) as gmm_params_group:
+                gmm_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Components (GMM)", value=4)
+                covariance_type_dropdown = gr.Dropdown(label="Covariance Type", choices=["full", "tied", "diag", "spherical"], value="full")
+            # GenieClust parameters
+            with gr.Group(visible=False) as genie_params_group:
+                genie_n_clusters_slider = gr.Slider(minimum=2, maximum=10, step=1, label="Number of Clusters (Genie)", value=4)
+                # NOTE(review): maximum=1.05 lets the slider go above 1.0 — confirm Genie accepts gini_threshold > 1.
+                gini_threshold_slider = gr.Slider(minimum=0.0, maximum=1.05, step=0.05, label="Gini Threshold (Genie)", value=.3)
+                # NOTE(review): fractional M values are selectable here — confirm Genie accepts a non-integer M.
+                M_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, label="M Parameter (Genie)", value=1.0)
+            # HDBSCAN parameters
+            with gr.Group(visible=False) as hdbscan_params_group:
+                hdbscan_min_cluster_size = gr.Slider(minimum=2, maximum=200, step=1, label="Minimal Cluster Size (hDBSCAN)", value=10)
+                hdbscan_min_samples = gr.Slider(minimum=2, maximum=200, step=1, label="Min. Samples (hDBSCAN)", value=10)
+            # Show only the parameter group that belongs to the selected clustering method
+            def update_method_params(clustering_method):
+                """Return a visibility update for every parameter group.
+
+                Exactly the group whose method name equals ``clustering_method``
+                is made visible; all other groups are hidden.
+                """
+                return {
+                    kmeans_params_group: gr.Group(visible=clustering_method == "K-Means"),
+                    agglomerative_params_group: gr.Group(visible=clustering_method == "Agglomerative"),
+                    gmm_params_group: gr.Group(visible=clustering_method == "Gaussian Mixture"),
+                    genie_params_group: gr.Group(visible=clustering_method == "Genie"),
+                    hdbscan_params_group: gr.Group(visible=clustering_method == "h-DBSCAN"),
+                }
+            clustering_method_dropdown.change(update_method_params, inputs=[clustering_method_dropdown], outputs=[kmeans_params_group, agglomerative_params_group,
+                                                                                                                  gmm_params_group, genie_params_group,hdbscan_params_group])
+            button = gr.Button("Run Clustering!")
+        with gr.Column():
+            unclustered_plot_output = gr.Plot(label=None)
+            clustered_plot_output = gr.Plot(label=None)
+        # Redraw the unclustered plot whenever the dataset changes, and once on load.
+        dataset_dropdown.change(plot_unclustered, inputs=[dataset_dropdown], outputs=[unclustered_plot_output])
+        demo.load(plot_unclustered, inputs=[dataset_dropdown], outputs=[unclustered_plot_output])
+        # The inputs list is positional: its order must match plot_clustered's parameter order.
+        button.click(
+            plot_clustered,
+            inputs=[
+                dataset_dropdown,
+                clustering_method_dropdown,
+                kmeans_n_clusters_slider,
+                agg_n_clusters_slider,
+                agg_linkage_dropdown,
+                gmm_n_clusters_slider,
+                covariance_type_dropdown,
+                genie_n_clusters_slider,  # Add Genie parameters
+                gini_threshold_slider,
+                M_slider,
+                hdbscan_min_cluster_size,
+                hdbscan_min_samples
+            ],
+            outputs=[clustered_plot_output]
+        )
+# Start the app only when this file is run as a script.
+if __name__ == "__main__":
+    demo.launch(debug=True)