# Streamlit app: upload a risk-mapping CSV, choose a KMeans cluster count with
# the help of the silhouette and elbow plots, visualise the clusters with
# 2D/3D PCA, and download the clustered rows as CSV.

import streamlit as st
from streamlit.errors import StreamlitAPIException
import pandas as pd
import plotly.express as px
from kmeans import (
    calculate_wcss,
    fit_kmeans,
    calculate_silhouette_scores,
    get_optimal_clusters_silhouette,
    plot_elbow,
)
from pca import perform_pca, plot_pca, plot_pca_3D
from data_preprocessing import mainDataWrangling

# Descriptive columns that are removed before clustering and handed to the
# PCA plotting helpers together with the preprocessed frame.
INFORMATION_COLUMNS = [
    "projectName", "Overall", "Financial", "Reputational", "Ethics",
    "Regulation", "Robustness", "Efficacy", "Privacy", "Bias",
    "Explainability",
]

# One bullet per line so that Markdown renders the directions as a list.
DIRECTIONS_MD = '''
**Directions:**

- Select the optimum number of clusters based on Silhouette Scores and ELBOW Graph.
- For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.
- For the Elbow graph, we can see that the graph will rapidly change at a point and thus creating an elbow shape. From this point, the graph moves almost parallel to the X-axis. The K value corresponding to this point is the optimal value of K or an optimal number of clusters.
'''


@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialise ``df`` to CSV text and return it encoded as UTF-8 bytes.

    The result is what the download button at the end of the script serves.
    ``st.cache_data`` memoises the conversion for identical input frames.
    """
    return df.to_csv().encode('utf-8')


# Streamlit code
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except StreamlitAPIException:
    # NOTE(review): newer Streamlit releases appear to have removed this
    # option and reject unknown keys with StreamlitAPIException. There is
    # then no warning left to silence, so ignoring the error is safe.
    pass

st.title('📊 Holistic AI: Risk Mapping Data study: Optimal Cluster Analysis with PCA Visualization')

uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')

# Everything below needs the uploaded data, so it only runs once a file exists.
if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    raw_data = mainDataWrangling(df)
    # Map the textual risk levels onto an ordinal numeric scale.
    raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)

    project_names = raw_data["projectName"]
    information_columns = INFORMATION_COLUMNS
    # Only the remaining (non-descriptive) columns are used for clustering.
    data = raw_data.drop(columns=information_columns)

    st.subheader('🔍 Data Preview')
    st.write(df)
    st.subheader('🔍 Preprocessed Data')
    st.write(raw_data)

    # Step 1: Plot Elbow Method and Silhouette Scores
    wcss = calculate_wcss(data)
    silhouette_scores = calculate_silhouette_scores(data)

    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

    # Both x-axes start at 2 clusters; this assumes the helpers return one
    # score per cluster count beginning at k=2 (TODO confirm in kmeans.py).
    fig = px.line(
        x=list(range(2, len(silhouette_scores) + 2)),
        y=silhouette_scores,
        labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'},
        title='Silhouette Scores',
    )
    st.plotly_chart(fig)

    fig = px.line(
        x=list(range(2, len(wcss) + 2)),
        y=wcss,
        labels={'x': 'Number of Clusters', 'y': 'WCSS'},
        title='Elbow Method',
    )
    st.plotly_chart(fig)

    st.markdown(DIRECTIONS_MD)

    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')

    # The slider is pre-set to the silhouette suggestion; the user may
    # override it after inspecting the elbow plot.
    optimal_clusters_elbow = st.slider(
        'Number of clusters (Default to optimal number from Silhouette Scores )',
        min_value=2,
        max_value=len(wcss) + 1,
        value=optimal_clusters_silhouette,
        step=1,
    )

    # Step 2: KMeans fitting and PCA
    st.header('KMeans Clustering and PCA')
    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')

    kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

    # Add project names back to the data
    display_data = pd.concat([project_names, clustered_data], axis=1)

    st.subheader('📌 Clustered Data')
    # Presumably fit_kmeans adds the "cluster" column shown here; verify in
    # kmeans.py.
    st.write(display_data[["projectName", "cluster"]])

    principalDf = perform_pca(clustered_data, 2)
    st.subheader('📊 2D PCA Plot')
    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
    st.plotly_chart(fig2D)

    principalDf_3D = perform_pca(clustered_data, 3)
    st.subheader('📊 3D PCA Plot')
    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
    st.plotly_chart(fig3D)

    st.subheader('📩 Data Download')
    csv = convert_df(display_data)

    st.download_button(
        label="Download clustered data as CSV",
        data=csv,
        file_name='clustered_data.csv',
        mime='text/csv',
    )