wu981526092's picture
update
4b1f0bc
raw
history blame contribute delete
No virus
3.94 kB
import streamlit as st
import pandas as pd
import plotly.express as px
from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
from pca import perform_pca, plot_pca, plot_pca_3D
from data_preprocessing import mainDataWrangling
@st.cache_data
def convert_df(df):
    """Return *df* serialized as CSV text, encoded to UTF-8 bytes.

    The CSV is produced with ``to_csv()`` defaults.  The function is wrapped
    in ``st.cache_data`` so Streamlit can reuse the result across reruns
    instead of re-serializing the same frame.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
# Streamlit page setup: global option, page title and the CSV upload widget.
# NOTE(review): the visible part of this script only renders Plotly charts via
# st.plotly_chart; confirm the pyplot deprecation toggle below is still needed
# and still accepted by the Streamlit version this app is pinned to.
st.set_option('deprecation.showPyplotGlobalUse', False)
# NOTE(review): the emoji in the two labels below were restored from
# mis-encoded (mojibake) text - confirm they are the intended glyphs.
st.title('📊 Holistic AI: Risk Mapping Data study: Optimal Cluster Analysis with PCA Visualization')
# None until the user picks a file; the rest of the app is gated on this value.
uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')
# Main application flow. Nothing below runs until a CSV has been uploaded.
# NOTE(review): the emoji in the sub-header labels were restored from
# mis-encoded (mojibake) text - confirm they are the intended glyphs.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    raw_data = mainDataWrangling(df)
    # Encode the ordinal risk labels as integers in a single pass.
    raw_data = raw_data.replace({"High": 2, "Medium": 1, "Low": 0})
    project_names = raw_data["projectName"]
    # Descriptive columns that are excluded from the clustering features but
    # are still handed to the PCA plotting helpers further down.
    information_columns = [
        "projectName", "Overall", "Financial", "Reputational", "Ethics",
        "Regulation", "Robustness", "Efficacy", "Privacy", "Bias",
        "Explainability",
    ]
    data = raw_data.drop(columns=information_columns)

    st.subheader('🔍 Data Preview')
    st.write(df)
    st.subheader('🔍 Preprocessed Data')
    st.write(raw_data)

    # Step 1: Plot Elbow Method and Silhouette Scores
    wcss = calculate_wcss(data)
    silhouette_scores = calculate_silhouette_scores(data)
    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

    # Both x-axes start at 2 clusters.
    # NOTE(review): this assumes the first entry returned by each helper
    # corresponds to k=2 - confirm against kmeans.py.
    fig = px.line(
        x=list(range(2, len(silhouette_scores) + 2)),
        y=silhouette_scores,
        labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'},
        title='Silhouette Scores',
    )
    st.plotly_chart(fig)
    fig = px.line(
        x=list(range(2, len(wcss) + 2)),
        y=wcss,
        labels={'x': 'Number of Clusters', 'y': 'WCSS'},
        title='Elbow Method',
    )
    st.plotly_chart(fig)

    # User-facing guidance text; the wording is reproduced unchanged.
    directions = (
        "\n"
        "**Directions:**\n"
        "- Select the optimum number of clusters based on Silhouette Scores and ELBOW Graph.\n"
        "- For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.\n"
        "- For the Elbow graph, we can see that the graph will rapidly change at a point and thus creating an elbow shape.\n"
        "From this point, the graph moves almost parallel to the X-axis. The K value corresponding to this point is the optimal\n"
        "value of K or an optimal number of clusters.\n"
    )
    st.markdown(directions)

    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')
    # The slider defaults to the silhouette-based suggestion; its upper bound
    # matches the largest cluster count shown on the elbow chart.
    optimal_clusters_elbow = st.slider(
        'Number of clusters (Default to optimal number from Silhouette Scores )',
        min_value=2,
        max_value=len(wcss) + 1,
        value=optimal_clusters_silhouette,
        step=1,
    )

    # Step 2: KMeans fitting and PCA
    st.header('KMeans Clustering and PCA')
    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')
    # The fitted model object is not used afterwards; only the clustered frame is.
    _kmeans_model, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

    # Add project names back to the data. The "cluster" column selected below
    # has to come from clustered_data, since project_names only contributes
    # "projectName".
    display_data = pd.concat([project_names, clustered_data], axis=1)
    st.subheader('📌 Clustered Data')
    st.write(display_data[["projectName", "cluster"]])

    principalDf = perform_pca(clustered_data, 2)
    st.subheader('📊 2D PCA Plot')
    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
    st.plotly_chart(fig2D)

    principalDf_3D = perform_pca(clustered_data, 3)
    st.subheader('📊 3D PCA Plot')
    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
    st.plotly_chart(fig3D)

    # Offer the project-name + cluster table as a downloadable CSV.
    st.subheader('📩 Data Download')
    csv = convert_df(display_data)
    st.download_button(
        label="Download clustered data as CSV",
        data=csv,
        file_name='clustered_data.csv',
        mime='text/csv',
    )