# pip install gradio
# pip install pandas
# pip install scikit-learn
# pip install matplotlib
import warnings
warnings.filterwarnings('ignore')

from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import StandardScaler
from matplotlib.figure import Figure
import matplotlib.pyplot as plt
import pathlib
import pandas as pd
import gradio as gr
import numpy as np

plt.switch_backend('agg')


def _cluster_color(index, n_clusters):
    """Return the nipy_spectral colour used for cluster *index*."""
    return plt.cm.nipy_spectral(float(index) / n_clusters)


def _scatter_figure(x_values, y_values):
    """Return a new figure holding a plain scatter plot of the two series."""
    # Figures are built directly instead of through pyplot so that they are
    # not registered in pyplot's global figure list and can be garbage
    # collected once the caller is done with them.
    fig = Figure()
    fig.subplots().scatter(x_values, y_values)
    return fig


def _cluster_figure(points, labels, first_col, second_col, n_clusters):
    """Return a scatter plot of *points* with one colour per cluster label."""
    fig = Figure()
    ax = fig.subplots()
    for k in range(n_clusters):
        members = points[labels == k]
        ax.scatter(
            members[first_col],
            members[second_col],
            color=_cluster_color(k, n_clusters),
        )
    return fig


def _silhouette_figure(points, labels, n_clusters, average_score):
    """Return the per-sample silhouette plot for the given clustering."""
    fig = Figure(figsize=(6, 6))
    ax = fig.subplots()
    ax.set_xlim([-0.1, 1])
    # Leave a gap of 10 rows between the bands of consecutive clusters.
    ax.set_ylim([0, len(points) + (n_clusters + 1) * 10])

    sample_values = silhouette_samples(points, labels)
    y_lower = 10
    for i in range(n_clusters):
        cluster_values = np.sort(sample_values[labels == i])
        cluster_size = cluster_values.shape[0]
        y_upper = y_lower + cluster_size
        color = _cluster_color(i, n_clusters)
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            cluster_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        ax.text(-0.05, y_lower + 0.5 * cluster_size, str(i))
        y_lower = y_upper + 10

    ax.axvline(
        x=average_score,
        color="red",
        linestyle="--",
        label="Avg silhouette score",
    )
    ax.set_title("The silhouette plot")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    ax.legend()
    return fig


def gmm_cluster(input_file, first_col, second_col, n_clusters):
    """Cluster two CSV columns with a Gaussian mixture and plot the result.

    Args:
        input_file: Uploaded file object; its ``name`` attribute is read as
            the path of the CSV file.
        first_col: Name of the first feature column.
        second_col: Name of the second feature column.
        n_clusters: Number of mixture components; converted with ``int``.

    Returns:
        A tuple ``(raw_scatter, cluster_scatter, score, silhouette_plot)``:
        the scatter plot of the unscaled features, the scatter plot of the
        standardised features coloured by cluster, the average silhouette
        score, and the per-sample silhouette figure.

    Raises:
        ValueError: If no file has been provided.
    """
    if input_file is None:
        raise ValueError("Upload a CSV file before clustering.")

    n_clusters = int(n_clusters)
    data = pd.read_csv(pathlib.Path(input_file.name))
    features = [first_col, second_col]
    raw = data[features]

    raw_scatter = _scatter_figure(raw[first_col], raw[second_col])

    # Build a fresh frame for the standardised values rather than assigning
    # into a slice of `data`. Cluster labels are kept in a separate array and
    # never attached to this frame, so the silhouette computations below see
    # only the two feature columns.
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(raw),
        columns=features,
        index=raw.index,
    )

    gm = GaussianMixture(n_components=n_clusters)
    gm.fit(scaled)
    labels = gm.predict(scaled)

    score = silhouette_score(scaled, labels)

    cluster_scatter = _cluster_figure(
        scaled, labels, first_col, second_col, n_clusters
    )
    silhouette_plot = _silhouette_figure(scaled, labels, n_clusters, score)

    return raw_scatter, cluster_scatter, score, silhouette_plot


def show_dataset(input_file):
    """Return the first nine rows of the uploaded CSV file.

    Returns ``None`` when *input_file* is ``None`` (no file selected).
    """
    if input_file is None:
        return None
    new_data = pd.read_csv(pathlib.Path(input_file.name))
    return new_data.head(9)


title = 'Gaussian Mixture Model'

# NOTE(review): the nesting of the layout containers below was reconstructed
# from a source whose indentation had been lost - confirm against the
# intended layout.
with gr.Blocks(title=title) as demo:
    gr.HTML(f"{title}")
    with gr.Row():
        with gr.Column():
            inp_file = gr.File(label="Input")
            inp_col_1 = gr.Text(label='Feature 1')
            inp_col_2 = gr.Text(label='Feature 2')
            # NOTE(review): value=0 lies below minimum=2; left unchanged -
            # confirm the intended initial slider position.
            inp_num_clusters = gr.Slider(
                minimum=2, maximum=7, value=0, step=1,
                label='Number of clusters'
            )
            out3 = gr.Textbox(label='Silhouette score')
        output_file = gr.Dataframe(label='Dataset')
    with gr.Row():
        out1 = gr.Plot()
        out2 = gr.Plot()
        out4 = gr.Plot()

    # Preview the dataset whenever the uploaded file changes.
    inp_file.change(fn=show_dataset, inputs=[inp_file], outputs=[output_file])
    # Clustering is re-run each time the slider value changes.
    inp_num_clusters.change(
        fn=gmm_cluster,
        inputs=[inp_file, inp_col_1, inp_col_2, inp_num_clusters],
        outputs=[out1, out2, out3, out4],
    )

demo.launch()