wu981526092 committed on
Commit 760a88c
1 Parent(s): 3fae384
Files changed (4)
  1. app.py +98 -0
  2. data_preprocessing.py +123 -0
  3. kmeans.py +40 -0
  4. pca.py +66 -0
app.py ADDED
@@ -0,0 +1,98 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
+ from pca import perform_pca, plot_pca, plot_pca_3D
+ from data_preprocessing import mainDataWrangling
+
+
+ @st.cache_data
+ def convert_df(df):
+     return df.to_csv().encode('utf-8')
+
+ # Streamlit code
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+
+ st.title('📊 Holistic AI: Risk Mapping Data Study: Optimal Cluster Analysis with PCA Visualization')
+ uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')
+
+ if uploaded_file is not None:
+     df = pd.read_csv(uploaded_file)
+
+     raw_data = mainDataWrangling(df)
+     raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)
+     project_names = raw_data["projectName"]
+     information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation",
+                            "Robustness", "Efficacy", "Privacy", "Bias", "Explainability"]
+
+     data = raw_data.drop(columns=information_columns)
+     st.subheader('🔍 Data Preview')
+     st.write(df)
+
+     st.subheader('🔍 Preprocessed Data')
+     st.write(raw_data)
+
+     # Step 1: Plot Elbow Method and Silhouette Scores
+     wcss = calculate_wcss(data)
+     silhouette_scores = calculate_silhouette_scores(data)
+     st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')
+
+     fig = px.line(x=list(range(2, len(silhouette_scores) + 2)), y=silhouette_scores,
+                   labels={'x': 'Number of Clusters', 'y': 'Silhouette Score'}, title='Silhouette Scores')
+     st.plotly_chart(fig)
+
+     # wcss is computed for k = 1..10, so the x-axis starts at 1
+     fig = px.line(x=list(range(1, len(wcss) + 1)), y=wcss, labels={'x': 'Number of Clusters', 'y': 'WCSS'},
+                   title='Elbow Method')
+     st.plotly_chart(fig)
+
+     st.markdown('''
+     **Directions:**
+
+     - Select the optimal number of clusters using the silhouette scores and the elbow graph.
+
+     - For the silhouette scores, the optimal number of clusters corresponds to the peak of the plot.
+
+     - For the elbow graph, the curve drops rapidly up to a point and then flattens, creating an elbow shape.
+     Beyond this point the curve runs almost parallel to the X-axis; the K value at the elbow is the optimal
+     number of clusters.
+     ''')
+
+     optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
+     st.write(f'Optimal number of clusters based on silhouette scores: {optimal_clusters_silhouette}')
+
+     optimal_clusters_elbow = st.slider('Number of clusters (defaults to the optimum from the silhouette scores)',
+                                        min_value=2, max_value=len(wcss),
+                                        value=optimal_clusters_silhouette, step=1)
+
+     # Step 2: KMeans fitting and PCA
+     st.header('KMeans Clustering and PCA')
+     st.write('Now we fit the KMeans algorithm with your chosen number of clusters and perform PCA for visualization.')
+     kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)
+
+     # Add project names back to the data
+     display_data = pd.concat([project_names, clustered_data], axis=1)
+
+     st.subheader('📌 Clustered Data')
+     st.write(display_data[["projectName", "cluster"]])
+
+     principalDf = perform_pca(clustered_data, 2)
+
+     st.subheader('📊 2D PCA Plot')
+     fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
+     st.plotly_chart(fig2D)
+
+     principalDf_3D = perform_pca(clustered_data, 3)
+     st.subheader('📊 3D PCA Plot')
+     fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
+     st.plotly_chart(fig3D)
+
+     st.subheader('📩 Data Download')
+     csv = convert_df(display_data)
+     st.download_button(
+         label="Download clustered data as CSV",
+         data=csv,
+         file_name='clustered_data.csv',
+         mime='text/csv',
+     )
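The app is launched with `streamlit run app.py`. For reference, here is a minimal headless sketch of the same pipeline; the file name survey_export.csv is an invented placeholder for a CSV with the 58-column layout the preprocessing expects:

import pandas as pd
from data_preprocessing import mainDataWrangling
from kmeans import calculate_silhouette_scores, get_optimal_clusters_silhouette, fit_kmeans

df = pd.read_csv('survey_export.csv')  # hypothetical export in the expected survey format
raw = mainDataWrangling(df).replace("High", 2).replace("Medium", 1).replace("Low", 0)
features = raw.drop(columns=["projectName", "Overall", "Financial", "Reputational", "Ethics",
                             "Regulation", "Robustness", "Efficacy", "Privacy", "Bias",
                             "Explainability"])
k = get_optimal_clusters_silhouette(calculate_silhouette_scores(features))
kmeans, clustered = fit_kmeans(features, k)
print(clustered['cluster'].value_counts())  # projects per cluster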
data_preprocessing.py ADDED
@@ -0,0 +1,123 @@
+ import pandas as pd
+
+ enterpriseGroups = ['facialRecognition', ['safetySecurity', 'recruitment', 'biometricData']]
+ societyGroups = [['policing', 'controlAccessToServices']]
+ dataTypeGroups = [['dataTypePersonal', 'dataTypeSensistivePersonal'], ['dataTypeRestricted']]
+ capabilitiesGroups = ['decisionSupportSystems']
+
+ technicalRisks = ['Robustness', 'Efficacy', 'Privacy', 'Bias', 'Explainability']
+ governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
+ riskVerticals = ['Overall'] + governanceRisks + technicalRisks
+
+
+ def mergeCostColumns(home, commisioned, licensed):
+     if home == 'insignificant' or commisioned == 'insignificant' or licensed == 'insignificant':
+         output = 1
+     else:
+         output = 0
+     return output
+
+
+ def generateUniqueEntries(targetColumn):
+     listOfEntries = []
+     for i in targetColumn.values:
+         listOfEntries += i.split(',')
+     listOfEntries = set(listOfEntries)
+     return list(listOfEntries)
+
+
+ def generateOneHot(dataframe, targetColumn, groups):
+     for group in groups:
+         groupColumnName = ''
+         if type(group) == str:
+             groupColumnName = targetColumn + '_' + group
+         else:
+             for element in group:
+                 if groupColumnName == '':
+                     groupColumnName += targetColumn + '_' + element
+                 else:
+                     groupColumnName += '_' + element
+         dataframe[groupColumnName] = 0
+
+         for i, targetColumnData in enumerate(dataframe[targetColumn].values):
+             if type(group) == str:
+                 if group in targetColumnData.split(','):
+                     # assigning with .loc gets rid of the SettingWithCopy warning
+                     dataframe.loc[i, groupColumnName] = 1
+             else:
+                 for element in group:
+                     if element in targetColumnData.split(','):
+                         # assigning with .loc gets rid of the SettingWithCopy warning
+                         dataframe.loc[i, groupColumnName] = 1
+
+         dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))  # move the new column to the far left
+
+     dataframe.pop(targetColumn)
+
+
+ def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
+     # anything in the positive group gets assigned 1, otherwise 0
+     for i, targetColumnData in enumerate(dataframe[targetColumn].values):
+         if targetColumnData in positiveGroup:
+             # assigning with .loc gets rid of the SettingWithCopy warning
+             dataframe.loc[i, targetColumn] = 1
+         else:
+             dataframe.loc[i, targetColumn] = 0
+
+
+ def mainDataWrangling(data):
+     # 1. Throw away the columns that we don't need:
+     # only keep the columns that we think are pertinent to scoring
+     columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]
+     data = data.iloc[:, columnsToKeep]
+
+     # 2. Merge the three development cost columns to get a single column for insignificant cost
+     data['insignificant'] = data.apply(
+         lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']), axis=1)
+     data.drop(data.iloc[:, 1:4], axis=1, inplace=True)
+
+     # 3. Replace headers with their compact forms.
+     # The column names currently sit in the first row of values; we attach a name to the newly
+     # created column here before copying that value row to the headers.
+     data.iloc[0, -1] = 'insignificant'
+     data.iloc[0, 0] = 'projectName'
+     data.iloc[0, -11:-1] = data.columns[-11:-1].values  # copy the risk vertical names to the header row
+     data.columns = data.iloc[0, :].values  # copy the first value row to the headers
+     data = data.drop([0])  # remove the first value row
+     data.reset_index(drop=True, inplace=True)  # reset the row indices
+     data.insert(0, 'insignificant', data.pop('insignificant'))  # move the insignificant column to the far left
+
+     # 4. Remove/replace missing data
+     data = data.dropna(subset=data.columns[-10:].values)  # drop all samples for which risk scoring hasn't yet been done
+     data.reset_index(drop=True, inplace=True)  # reset the row indices
+     data['howEssentialHumanInTheLoop'].fillna('low', inplace=True)  # replace NaNs for the contingent question with 'low'
+
+     # 5. Perform one-hot encoding and other encodings
+     generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
+     generateOneHot(data, 'soceityLevel', societyGroups)
+     convertToBinaryColumn(data, 'externalParties', ['yes'])
+     data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
+         {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
+     generateOneHot(data, 'dataType', dataTypeGroups)
+     data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
+     data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
+     data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(
+         {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
+     data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(
+         {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
+     generateOneHot(data, 'capabilities', capabilitiesGroups)
+     data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})
+
+     # Convert the risk level columns to numbers:
+     # binary columns for Low
+     for riskColumn in data.columns[-10:]:
+         data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
+     # binary columns for High
+     for riskColumn in data.columns[-20:-10]:
+         data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})
+
+     data.insert(0, 'projectName', data.pop('projectName'))
+     data['insignificant'] = pd.to_numeric(data['insignificant'])
+     data['externalParties'] = pd.to_numeric(data['externalParties'])
+
+     return data
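A minimal sketch of how the grouped one-hot encoding behaves; the toy dataType values are invented for illustration, and it assumes data_preprocessing.py is importable:

import pandas as pd
from data_preprocessing import generateOneHot, dataTypeGroups

toy = pd.DataFrame({'dataType': ['dataTypePersonal,dataTypeRestricted', 'dataTypeRestricted', 'none']})
# groups mix single strings and lists; all members of a list share one indicator column
generateOneHot(toy, 'dataType', dataTypeGroups)
print(toy)
# the original dataType column is popped; two indicator columns remain, inserted at the left:
#   dataType_dataTypeRestricted, dataType_dataTypePersonal_dataTypeSensistivePersonal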
kmeans.py ADDED
@@ -0,0 +1,40 @@
+ from matplotlib import pyplot as plt
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+
+
+ def calculate_wcss(data):
+     wcss = []
+     for i in range(1, 11):
+         kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
+         kmeans.fit(data)
+         wcss.append(kmeans.inertia_)
+     return wcss
+
+
+ def calculate_silhouette_scores(data):
+     scores = []
+     range_values = range(2, 11)
+     for i in range_values:
+         kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
+         kmeans.fit(data)
+         score = silhouette_score(data, kmeans.labels_, metric='euclidean')
+         scores.append(score)
+     return scores
+
+
+ def plot_elbow(wcss):
+     plt.plot(range(1, 11), wcss)
+     plt.title('Elbow Method')
+     plt.xlabel('Number of clusters')
+     plt.ylabel('WCSS')
+     plt.show()
+
+
+ def get_optimal_clusters_silhouette(scores):
+     optimal_clusters = scores.index(max(scores)) + 2  # +2 because range_values starts from 2
+     print(f"Optimal number of clusters: {optimal_clusters}")
+     return optimal_clusters
+
+
+ def fit_kmeans(data, n_clusters):
+     kmeans = KMeans(n_clusters=n_clusters, random_state=0)
+     clusters = kmeans.fit_predict(data)
+     data['cluster'] = clusters  # note: mutates the passed frame in place
+     return kmeans, data
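A quick sanity check of these helpers, sketched on synthetic data with scikit-learn's make_blobs (the blob parameters are arbitrary):

import pandas as pd
from sklearn.datasets import make_blobs
from kmeans import calculate_wcss, calculate_silhouette_scores, get_optimal_clusters_silhouette, fit_kmeans

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)
data = pd.DataFrame(X, columns=['x1', 'x2'])
wcss = calculate_wcss(data)                 # inertia for k = 1..10
scores = calculate_silhouette_scores(data)  # silhouette for k = 2..10
k = get_optimal_clusters_silhouette(scores) # expected to pick 4 for well-separated blobs
kmeans, clustered = fit_kmeans(data, k)     # adds a 'cluster' column to data in place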
pca.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+ from sklearn.decomposition import PCA
+ import plotly.express as px
+
+
+ def perform_pca(data, n_components):
+     pca = PCA(n_components=n_components)
+     principalComponents = pca.fit_transform(data)
+     principalDf = pd.DataFrame(data=principalComponents,
+                                columns=[f'principal component {i+1}' for i in range(n_components)])
+     return principalDf
+
+
+ def plot_pca(clustered_data, principalDf, df, information_columns):
+     clustered_data = clustered_data.reset_index()  # to make sure the indices match with df
+     finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
+     finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
+
+     finalDf.drop(columns=['index'], inplace=True)
+     fig = px.scatter(finalDf,
+                      x='principal component 1',
+                      y='principal component 2',
+                      color='cluster',
+                      hover_data=information_columns,
+                      title='2 Component PCA',
+                      labels={'principal component 1': 'Principal Component 1',
+                              'principal component 2': 'Principal Component 2'},
+                      color_continuous_scale='viridis')
+     # fig.show()
+     return fig
+
+
+ # def get_common_features(data):
+ #     common_features = {}
+ #     for cluster in data['cluster'].unique():
+ #         cluster_data = data[data['cluster'] == cluster]
+ #         cluster_features_counts = {}
+ #         for column in cluster_data.drop(columns=["cluster"]).columns:
+ #             top_feature = cluster_data[column].mode()[0]  # use mode to find the most common category
+ #             if top_feature != 0:  # add only if the most common feature is not 0
+ #                 cluster_features_counts[column] = top_feature
+ #         common_features[cluster] = cluster_features_counts
+ #     return common_features
+
+
+ def plot_pca_3D(clustered_data, principalDf, df, information_columns):
+     clustered_data = clustered_data.reset_index()  # to make sure the indices match with df
+     finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
+     finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
+     finalDf.drop(columns=['index'], inplace=True)
+
+     # common_features = get_common_features(clustered_data)
+     # # Add the most common features to finalDf
+     # for cluster, features in common_features.items():
+     #     for feature, value in features.items():
+     #         finalDf.loc[finalDf['cluster'] == cluster, f"Most common {feature}"] = value
+
+     hover_data = information_columns  # + [col for col in finalDf.columns if "Most common" in col]
+     fig = px.scatter_3d(finalDf,
+                         x='principal component 1',
+                         y='principal component 2',
+                         z='principal component 3',
+                         color='cluster',
+                         hover_data=hover_data,
+                         title='3 Component PCA',
+                         labels={f'principal component {i+1}': f'Principal Component {i+1}' for i in range(3)},
+                         color_continuous_scale='viridis')
+     # fig.show()
+     return fig
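A minimal sketch of perform_pca on invented data:

import numpy as np
import pandas as pd
from pca import perform_pca

rng = np.random.default_rng(0)
clustered = pd.DataFrame(rng.random((50, 5)), columns=[f'f{i}' for i in range(5)])
clustered['cluster'] = rng.integers(0, 3, 50)  # invented labels, as fit_kmeans would assign

principalDf = perform_pca(clustered, 2)
print(principalDf.columns.tolist())  # ['principal component 1', 'principal component 2']

One design note: fit_kmeans in kmeans.py assigns the cluster column in place, so the frame app.py passes to perform_pca already contains the cluster label and it participates in the projection; dropping that column before PCA would be the alternative if that is unintended.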