"""This dashboard is a live demonstration of the sklearn document at https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py """ import numpy as np import typing as tp import pandas as pd import gradio as gr from sklearn.datasets import make_blobs from sklearn.cluster import KMeans import matplotlib.pyplot as plt title = "Demonstration of k-means assumptions" random_state = 170 transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]] # Defines 4 Apps for each demo senario class App: name: tp.ClassVar[str] description: tp.ClassVar[str] def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]: raise NotImplementedError() def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray: raise NotImplementedError() class MixGaussianBlobs(App): name = "Mixture of Gaussian Blobs" description = ( "In a real setting there is no uniquely defined true number of clusters. " "An appropriate number of clusters has to be decided from data-based criteria" " and knowledge of the intended goal." ) def make_data(self, n_samples): return make_blobs(n_samples=n_samples, random_state=random_state) def kmeans_predict(self, n_clusters, X): return KMeans( n_clusters=n_clusters, n_init="auto", random_state=random_state ).fit_predict(X) class AnisoDistBlobs(MixGaussianBlobs): name = "Anisotropically Distributed Blobs" description = ( "k-means consists of minimizing sample’s euclidean distances to the centroid of the" " cluster they are assigned to. As a consequence, k-means is more appropriate for " "clusters that are isotropic and normally distributed (i.e. spherical gaussians)" ) def make_data(self, n_samples): X, y = super().make_data(n_samples=n_samples) X = np.dot(X, transformation) return X, y class UnequalVariance(MixGaussianBlobs): name = "Unequal Variance" description = ( "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' " "of k gaussian distributions with the same variances but with possibly different " " means." ) def make_data(self, n_samples): return make_blobs( n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state ) class UnevenlySizedBlobs(MixGaussianBlobs): name = "Unevenly Sized Blobs" description = ( "There is no theoretical result about k-means that states that it requires similar" " cluster sizes to perform well, yet minimizing euclidean distances does mean that" " the more sparse and high-dimensional the problem is, the higher is the need to run " "the algorithm with different centroid seeds to ensure a global minimal inertia." ) def make_data(self, n_samples): X, y = super().make_data(n_samples=n_samples) X_filter = np.vstack( ( X[y == 0][:500], X[y == 1][:100], X[y == 2][:10], ) ) # print(len(X_filter[:, 0])) # print(len(X_filter[:, 1])) y_filter = [0] * 500 + [1] * 100 + [2] * 10 return X_filter, y_filter # Define instances of the apps _apps = [ MixGaussianBlobs(), AnisoDistBlobs(), UnequalVariance(), UnevenlySizedBlobs(), ] apps = {k.name: k for k in _apps} data_choices = [k.name for k in _apps] # Define the callback to the triggered when a button or a slider used by the user. def fn(data_choice, n_samples, n_clusters): # Find the app and create sample data based on the user choice. app = apps[data_choice] X, y = app.make_data(n_samples) fig_sample, ax_sample = plt.subplots() ax_sample.set_title(app.name) # Execute the KMeans clustering. y_pred = app.kmeans_predict(n_clusters, X) ax_sample.scatter(X[:, 0], X[:, 1], c=y) fig_pred, ax_pred = plt.subplots() ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred) ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})") return f"## {app.description}", fig_sample, fig_pred # Define the dashboard layout and buttons with gr.Blocks(title=title) as demo: gr.Markdown(f"# {title}") with gr.Row(): data_choice = gr.Radio( choices=data_choices, value=data_choices[0], ) with gr.Row(): n_samples = gr.Slider( minimum=1500, maximum=3000, step=50, label="Number of Samples" ) n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters") with gr.Accordion("Description"): description = gr.Markdown(label="Description") with gr.Row(): plot_sample = gr.Plot(label="Ground Truth Cluster") plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster") data_choice.change( fn=fn, inputs=[data_choice, n_samples, n_clusters], outputs=[description, plot_sample, plot_kmeans], ) n_samples.change( fn=fn, inputs=[data_choice, n_samples, n_clusters], outputs=[description, plot_sample, plot_kmeans], ) n_clusters.change( fn=fn, inputs=[data_choice, n_samples, n_clusters], outputs=[description, plot_sample, plot_kmeans], ) demo.launch()