ehengao's picture
add initial version for the kmeans assumption dashboard
4a27dd7
"""This dashboard is a live demonstration of the sklearn document at
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_assumptions.html#sphx-glr-auto-examples-cluster-plot-kmeans-assumptions-py
"""
import numpy as np
import typing as tp
import pandas as pd
import gradio as gr
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Heading rendered at the top of the dashboard and used as the page title.
title = "Demonstration of k-means assumptions"

# Seed passed to both the blob generator and KMeans so results are repeatable.
random_state = 170

# 2x2 matrix the anisotropic scenario multiplies its blob coordinates by.
transformation = [
    [0.60834549, -0.63667341],
    [-0.40887718, 0.85253229],
]
# Define four apps, one for each demo scenario
class App:
    """Interface shared by every demo scenario shown in the dashboard.

    Concrete scenarios provide the ``name`` and ``description`` class
    attributes and override both methods; the base implementations only
    raise ``NotImplementedError``.
    """

    # Scenario title.
    name: tp.ClassVar[str]
    # Explanatory text for the scenario.
    description: tp.ClassVar[str]

    def make_data(self, n_samples: int) -> tp.Tuple[np.ndarray, np.ndarray]:
        """Produce the sample points and their labels for ``n_samples``.

        Raises:
            NotImplementedError: always, until overridden by a subclass.
        """
        raise NotImplementedError

    def kmeans_predict(self, n_cluster: int, X: np.ndarray) -> np.ndarray:
        """Assign each row of ``X`` to one of ``n_cluster`` clusters.

        Raises:
            NotImplementedError: always, until overridden by a subclass.
        """
        raise NotImplementedError
class MixGaussianBlobs(App):
    """Baseline scenario: blobs generated by ``make_blobs`` with its defaults."""

    name = "Mixture of Gaussian Blobs"
    description = (
        "In a real setting there is no uniquely defined true number of clusters. "
        "An appropriate number of clusters has to be decided from data-based criteria"
        " and knowledge of the intended goal."
    )

    def make_data(self, n_samples):
        """Return ``(X, y)`` from ``make_blobs`` seeded with ``random_state``."""
        points, labels = make_blobs(n_samples=n_samples, random_state=random_state)
        return points, labels

    def kmeans_predict(self, n_clusters, X):
        """Fit KMeans with ``n_clusters`` clusters on ``X`` and return the labels."""
        model = KMeans(
            n_clusters=n_clusters,
            n_init="auto",
            random_state=random_state,
        )
        return model.fit_predict(X)
class AnisoDistBlobs(MixGaussianBlobs):
    """Scenario whose blobs are linearly transformed by ``transformation``."""

    name = "Anisotropically Distributed Blobs"
    description = (
        "k-means consists of minimizing sample’s euclidean distances to the centroid of the"
        " cluster they are assigned to. As a consequence, k-means is more appropriate for "
        "clusters that are isotropic and normally distributed (i.e. spherical gaussians)"
    )

    def make_data(self, n_samples):
        """Return the parent's blobs multiplied by ``transformation``, plus labels."""
        blobs, labels = super().make_data(n_samples=n_samples)
        return np.dot(blobs, transformation), labels
class UnequalVariance(MixGaussianBlobs):
    """Scenario whose blobs are generated with different standard deviations."""

    name = "Unequal Variance"
    description = (
        "k-means is equivalent to taking the maximum likelihood estimator for a 'mixture' "
        "of k gaussian distributions with the same variances but with possibly different "
        " means."
    )

    def make_data(self, n_samples):
        """Return ``(X, y)`` from ``make_blobs`` using one std value per blob."""
        per_blob_std = [1.0, 2.5, 0.5]
        return make_blobs(
            n_samples=n_samples,
            cluster_std=per_blob_std,
            random_state=random_state,
        )
class UnevenlySizedBlobs(MixGaussianBlobs):
    """Scenario that keeps a different number of points from each blob."""

    name = "Unevenly Sized Blobs"
    description = (
        "There is no theoretical result about k-means that states that it requires similar"
        " cluster sizes to perform well, yet minimizing euclidean distances does mean that"
        " the more sparse and high-dimensional the problem is, the higher is the need to run "
        "the algorithm with different centroid seeds to ensure a global minimal inertia."
    )

    # Maximum number of points kept from the blobs labelled 0, 1 and 2.
    _blob_sizes = (500, 100, 10)

    def make_data(self, n_samples):
        """Return a subsample of the parent's blobs with uneven blob sizes.

        Args:
            n_samples: number of points requested from the parent generator
                before subsampling.

        Returns:
            ``(X_filter, y_filter)`` where ``X_filter`` stacks at most
            500 / 100 / 10 points of labels 0 / 1 / 2, and ``y_filter`` is an
            integer array holding the matching label for every kept row.
        """
        X, y = super().make_data(n_samples=n_samples)
        kept = [X[y == label][:size] for label, size in enumerate(self._blob_sizes)]
        X_filter = np.vstack(kept)
        # Build the labels from the slices actually taken rather than from the
        # caps: a slice is shorter than its cap whenever the blob has fewer
        # points, and the labels must stay aligned with the rows of X_filter.
        y_filter = np.repeat(np.arange(len(kept)), [len(part) for part in kept])
        return X_filter, y_filter
# Define instances of the apps
# One instance per scenario, in the order they are offered to the user.
_apps = [
    MixGaussianBlobs(),
    AnisoDistBlobs(),
    UnequalVariance(),
    UnevenlySizedBlobs(),
]
# Lookup table from a scenario's display name to its instance.
apps = {app.name: app for app in _apps}
# Display names listed as the selectable options, in declaration order.
data_choices = [app.name for app in _apps]
# Define the callback to be triggered when the user changes the radio selection or a slider.
def fn(data_choice, n_samples, n_clusters):
    """Regenerate the description and both plots for the current controls.

    Args:
        data_choice: display name of the selected scenario (a key of ``apps``).
        n_samples: number of samples to generate.
        n_clusters: number of clusters KMeans should look for.

    Returns:
        A 3-tuple ``(markdown, fig_sample, fig_pred)``: the scenario
        description as a Markdown heading, the figure coloured by the
        generated labels, and the figure coloured by the KMeans assignment.

    Raises:
        KeyError: if ``data_choice`` is not a known scenario name.
    """
    # Find the app and create sample data based on the user choice.
    app = apps[data_choice]
    # NOTE(review): coerced defensively — slider values may arrive as floats
    # depending on the Gradio version; confirm whether this is still needed.
    n_samples = int(n_samples)
    n_clusters = int(n_clusters)
    X, y = app.make_data(n_samples)

    # Execute the KMeans clustering.
    y_pred = app.kmeans_predict(n_clusters, X)

    fig_sample, ax_sample = plt.subplots()
    ax_sample.set_title(app.name)
    ax_sample.scatter(X[:, 0], X[:, 1], c=y)

    fig_pred, ax_pred = plt.subplots()
    ax_pred.scatter(X[:, 0], X[:, 1], c=y_pred)
    ax_pred.set_title(f"Unexpected KMeans Clusters (n_cluster={n_clusters})")

    # pyplot keeps every figure it creates in a global registry until it is
    # closed; without this each callback would leak two figures. The Figure
    # objects themselves stay valid for the caller to render.
    plt.close(fig_sample)
    plt.close(fig_pred)

    return f"## {app.description}", fig_sample, fig_pred
# Define the dashboard layout and buttons
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")

    # Scenario selector.
    with gr.Row():
        data_choice = gr.Radio(
            choices=data_choices,
            value=data_choices[0],
        )

    # Numeric controls forwarded to the callback.
    with gr.Row():
        n_samples = gr.Slider(
            minimum=1500, maximum=3000, step=50, label="Number of Samples"
        )
        n_clusters = gr.Slider(minimum=2, maximum=8, step=1, label="Number of Clusters")

    with gr.Accordion("Description"):
        description = gr.Markdown(label="Description")

    with gr.Row():
        plot_sample = gr.Plot(label="Ground Truth Cluster")
        plot_kmeans = gr.Plot(label="Unexpected KMeans Cluster")

    # Every control feeds the same callback with the same inputs and outputs.
    _inputs = [data_choice, n_samples, n_clusters]
    _outputs = [description, plot_sample, plot_kmeans]
    for _control in _inputs:
        _control.change(fn=fn, inputs=_inputs, outputs=_outputs)

    # Also run the callback once when the page loads, so the description and
    # plots are populated before the user touches any control.
    demo.load(fn=fn, inputs=_inputs, outputs=_outputs)

demo.launch()