# Source: gmm-clustering / app.py
# Last commit: "Update app.py" (17faa9f), user Lam4w
# pip install gradio
# pip install pandas
# pip install scikit-learn
# pip install matplotlib
import warnings
warnings.filterwarnings('ignore')
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pathlib
import pandas as pd
import gradio as gr
import numpy as np
plt.switch_backend('agg')
def gmm_cluster(input_file, first_col, second_col, n_clusters):
    """Cluster two CSV columns with a Gaussian mixture and plot the result.

    The two selected columns are standardised with ``StandardScaler``, a
    ``GaussianMixture`` with ``n_clusters`` components is fitted on them, and
    the per-sample silhouette coefficients are computed on the standardised
    values.

    Args:
        input_file: Object whose ``name`` attribute is the path of a CSV file.
        first_col: Name of the column plotted on the x axis.
        second_col: Name of the column plotted on the y axis.
        n_clusters: Number of mixture components; converted with ``int``.

    Returns:
        A tuple ``(raw_fig, cluster_fig, score, silhouette_fig)``: a scatter
        plot of the unscaled columns, a scatter plot of the standardised
        columns coloured by predicted cluster, the mean silhouette coefficient
        as a float, and the per-cluster silhouette plot.

    Raises:
        ValueError: If ``input_file`` is ``None``, ``n_clusters`` is below 2,
            or a requested column is not present in the CSV. Errors raised by
            pandas or scikit-learn (unreadable file, non-numeric data, ...)
            propagate unchanged.
    """
    if input_file is None:
        raise ValueError("No input file: upload a CSV file first.")
    n_clusters = int(n_clusters)
    if n_clusters < 2:
        raise ValueError(
            "n_clusters must be at least 2 to compute a silhouette score, "
            f"got {n_clusters}."
        )

    data = pd.read_csv(pathlib.Path(input_file.name))
    features = [first_col, second_col]
    missing = [name for name in features if name not in data.columns]
    if missing:
        raise ValueError(f"Column(s) not found in the dataset: {missing}")

    # Work on a plain array detached from ``data``: nothing is written back
    # into the loaded frame, and the cluster labels can never end up inside
    # the feature matrix. Rows with a missing value cannot be clustered, so
    # they are dropped up front.
    raw = data[features].dropna().to_numpy()

    raw_fig, raw_ax = _new_figure()
    raw_ax.scatter(raw[:, 0], raw[:, 1])

    scaled = StandardScaler().fit_transform(raw)
    model = GaussianMixture(n_components=n_clusters)
    labels = model.fit(scaled).predict(scaled)

    # The silhouette computation is quadratic in the number of samples, so it
    # is done once; the average score is the mean of the per-sample values.
    sample_scores = silhouette_samples(scaled, labels)
    score = float(sample_scores.mean())

    cluster_fig, cluster_ax = _new_figure()
    for k in range(n_clusters):
        members = labels == k
        cluster_ax.scatter(
            scaled[members, 0],
            scaled[members, 1],
            color=_cluster_color(k, n_clusters),
        )

    silhouette_fig = _silhouette_figure(sample_scores, labels, n_clusters)
    return raw_fig, cluster_fig, score, silhouette_fig


def _new_figure(**figure_kwargs):
    """Return ``(figure, axes)`` for a figure that pyplot does not track."""
    # Figures created through ``plt.figure()`` stay registered with pyplot
    # until they are closed, which accumulates memory when this runs once per
    # request; a bare ``Figure`` is released as soon as nothing references it.
    from matplotlib.figure import Figure

    fig = Figure(**figure_kwargs)
    return fig, fig.subplots()


def _cluster_color(index, n_clusters):
    """Return the ``nipy_spectral`` colour assigned to cluster ``index``."""
    return plt.cm.nipy_spectral(float(index) / n_clusters)


def _silhouette_figure(sample_scores, labels, n_clusters):
    """Draw one horizontal silhouette band per cluster plus the mean line."""
    fig, ax = _new_figure(figsize=(6, 6))
    ax.set_xlim([-0.1, 1])
    # Leave a 10-row gap below, between and above the cluster bands.
    ax.set_ylim([0, len(sample_scores) + (n_clusters + 1) * 10])

    y_lower = 10
    for k in range(n_clusters):
        cluster_scores = np.sort(sample_scores[labels == k])
        cluster_size = cluster_scores.shape[0]
        y_upper = y_lower + cluster_size
        color = _cluster_color(k, n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            cluster_scores,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        # Label the band with its cluster index at mid-height.
        ax.text(-0.05, y_lower + 0.5 * cluster_size, str(k))
        y_lower = y_upper + 10

    ax.axvline(
        x=sample_scores.mean(),
        color="red",
        linestyle="--",
        label="Avg silhouette score",
    )
    ax.set_title("The silhouette plot")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    ax.legend()
    return fig
def show_dataset(input_file):
    """Return a preview of the uploaded CSV file.

    Args:
        input_file: Object whose ``name`` attribute is the path of a CSV
            file, or ``None`` when no file is selected.

    Returns:
        A DataFrame holding at most the first nine rows of the file, or
        ``None`` when ``input_file`` is ``None``.
    """
    # A cleared upload arrives as None; return None instead of failing on
    # ``None.name``.
    if input_file is None:
        return None
    return pd.read_csv(pathlib.Path(input_file.name)).head(9)
# Gradio UI: upload a CSV, name two feature columns, pick the number of
# clusters, and view the resulting plots and silhouette score.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a copy of this file without indentation -- confirm which components
# belong inside the column.
title = 'Gaussian Mixture Model'
with gr.Blocks(title=title) as demo:
    gr.HTML(f"{title}")
    with gr.Row():
        with gr.Column():
            inp_file = gr.File(label="Input")
            inp_col_1 = gr.Text(label='Feature 1')
            inp_col_2 = gr.Text(label='Feature 2')
            inp_num_clusters = gr.Slider(
                minimum=2,
                maximum=7,
                # Start at the minimum so the default lies inside the
                # slider's own [2, 7] range.
                value=2,
                step=1,
                label='Number of clusters'
            )
            out3 = gr.Textbox(label='Silhouette score')
        output_file = gr.Dataframe(label='Dataset')
    with gr.Row():
        out1 = gr.Plot()
        out2 = gr.Plot()
        out4 = gr.Plot()
    # Preview the dataset whenever the uploaded file changes.
    inp_file.change(fn=show_dataset, inputs=[inp_file], outputs=[output_file])
    # Clustering is re-run only when the slider moves; editing the feature
    # names alone does not trigger it.
    inp_num_clusters.change(fn=gmm_cluster, inputs=[inp_file, inp_col_1, inp_col_2, inp_num_clusters], outputs=[out1, out2, out3, out4])
demo.launch()