wu981526092 committed on
Commit 760a88c
1 Parent(s): 3fae384
Files changed (4)
  1. app.py +98 -0
  2. data_preprocessing.py +123 -0
  3. kmeans.py +40 -0
  4. pca.py +66 -0
app.py ADDED
@@ -0,0 +1,98 @@
+ import streamlit as st
+ import pandas as pd
+ import plotly.express as px
+ from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
+ from pca import perform_pca, plot_pca, plot_pca_3D
+ from data_preprocessing import mainDataWrangling
+
+
+ @st.cache_data
+ def convert_df(df):
+     return df.to_csv().encode('utf-8')
+
+ # Streamlit code
+ st.set_option('deprecation.showPyplotGlobalUse', False)
+
+ st.title('📊 Holistic AI: Risk Mapping Data Study: Optimal Cluster Analysis with PCA Visualization')
+ uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')
+
+ if uploaded_file is not None:
+     df = pd.read_csv(uploaded_file)
+
+     raw_data = mainDataWrangling(df)
+     raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)
+     project_names = raw_data["projectName"]
+     information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation",
+                            "Robustness", "Efficacy", "Privacy", "Bias", "Explainability"]
+
+     data = raw_data.drop(columns=information_columns)
+     st.subheader('🔍 Data Preview')
+     st.write(df)
+
+     st.subheader('🔍 Preprocessed Data')
+     st.write(raw_data)
+
+     # Step 1: Plot Elbow Method and Silhouette Scores
+     wcss = calculate_wcss(data)
+     silhouette_scores = calculate_silhouette_scores(data)
+     st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')
+
+     fig = px.line(x=list(range(2, len(silhouette_scores) + 2)), y=silhouette_scores,
+                   labels={'x': 'Number of Clusters', 'y': 'Silhouette Score'}, title='Silhouette Scores')
+     st.plotly_chart(fig)
+
+     # wcss is computed for k = 1..10, so the x-axis starts at 1
+     fig = px.line(x=list(range(1, len(wcss) + 1)), y=wcss, labels={'x': 'Number of Clusters', 'y': 'WCSS'},
+                   title='Elbow Method')
+     st.plotly_chart(fig)
+
+     st.markdown('''
+     **Directions:**
+
+     - Select the optimal number of clusters using the silhouette scores and the elbow graph.
+
+     - For the silhouette scores, the optimal number of clusters corresponds to the peak of the plot.
+
+     - For the elbow graph, the curve drops rapidly up to a point and then flattens, creating an elbow shape.
+     Beyond this point the curve runs almost parallel to the X-axis; the K value at the elbow is the optimal
+     number of clusters.
+     ''')
+
+     optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
+     st.write(f'Optimal number of clusters based on silhouette scores: {optimal_clusters_silhouette}')
+
+     optimal_clusters_elbow = st.slider('Number of clusters (defaults to the optimum from the silhouette scores)',
+                                        min_value=2, max_value=len(wcss),
+                                        value=optimal_clusters_silhouette, step=1)
+
+     # Step 2: KMeans fitting and PCA
+     st.header('KMeans Clustering and PCA')
+     st.write('Now we fit the KMeans algorithm with your chosen number of clusters and perform PCA for visualization.')
+     kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)
+
+     # Add project names back to the data
+     display_data = pd.concat([project_names, clustered_data], axis=1)
+
+     st.subheader('📌 Clustered Data')
+     st.write(display_data[["projectName", "cluster"]])
+
+     principalDf = perform_pca(clustered_data, 2)
+
+     st.subheader('📊 2D PCA Plot')
+     fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
+     st.plotly_chart(fig2D)
+
+     principalDf_3D = perform_pca(clustered_data, 3)
+     st.subheader('📊 3D PCA Plot')
+     fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
+     st.plotly_chart(fig3D)
+
+     st.subheader('📩 Data Download')
+     csv = convert_df(display_data)
+     st.download_button(
+         label="Download clustered data as CSV",
+         data=csv,
+         file_name='clustered_data.csv',
+         mime='text/csv',
+     )
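The app is launched with `streamlit run app.py`. For reference, here is a minimal headless sketch of the same pipeline; the file name survey_export.csv is an invented placeholder for a CSV with the 58-column layout the preprocessing expects:

import pandas as pd
from data_preprocessing import mainDataWrangling
from kmeans import calculate_silhouette_scores, get_optimal_clusters_silhouette, fit_kmeans

df = pd.read_csv('survey_export.csv')  # hypothetical export in the expected survey format
raw = mainDataWrangling(df).replace("High", 2).replace("Medium", 1).replace("Low", 0)
features = raw.drop(columns=["projectName", "Overall", "Financial", "Reputational", "Ethics",
                             "Regulation", "Robustness", "Efficacy", "Privacy", "Bias",
                             "Explainability"])
k = get_optimal_clusters_silhouette(calculate_silhouette_scores(features))
kmeans, clustered = fit_kmeans(features, k)
print(clustered['cluster'].value_counts())  # projects per cluster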
data_preprocessing.py ADDED
@@ -0,0 +1,123 @@
+ import pandas as pd
+
+ enterpriseGroups = ['facialRecognition', ['safetySecurity', 'recruitment', 'biometricData']]
+ societyGroups = [['policing', 'controlAccessToServices']]
+ dataTypeGroups = [['dataTypePersonal', 'dataTypeSensistivePersonal'], ['dataTypeRestricted']]
+ capabilitiesGroups = ['decisionSupportSystems']
+
+ technicalRisks = ['Robustness', 'Efficacy', 'Privacy', 'Bias', 'Explainability']
+ governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
+ riskVerticals = ['Overall'] + governanceRisks + technicalRisks
+
+
+ def mergeCostColumns(home, commisioned, licensed):
+     if home == 'insignificant' or commisioned == 'insignificant' or licensed == 'insignificant':
+         output = 1
+     else:
+         output = 0
+     return output
+
+
+ def generateUniqueEntries(targetColumn):
+     listOfEntries = []
+     for i in targetColumn.values:
+         listOfEntries += i.split(',')
+     listOfEntries = set(listOfEntries)
+     return list(listOfEntries)
+
+
+ def generateOneHot(dataframe, targetColumn, groups):
+     for group in groups:
+         groupColumnName = ''
+         if type(group) == str:
+             groupColumnName = targetColumn + '_' + group
+         else:
+             for element in group:
+                 if groupColumnName == '':
+                     groupColumnName += targetColumn + '_' + element
+                 else:
+                     groupColumnName += '_' + element
+         dataframe[groupColumnName] = 0
+
+         for i, targetColumnData in enumerate(dataframe[targetColumn].values):
+             if type(group) == str:
+                 if group in targetColumnData.split(','):
+                     # assigning with .loc gets rid of the SettingWithCopy warning
+                     dataframe.loc[i, groupColumnName] = 1
+             else:
+                 for element in group:
+                     if element in targetColumnData.split(','):
+                         # assigning with .loc gets rid of the SettingWithCopy warning
+                         dataframe.loc[i, groupColumnName] = 1
+
+         dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))  # move the new column to the far left
+
+     dataframe.pop(targetColumn)
+
+
+ def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
+     # anything in the positive group gets assigned 1, otherwise 0
+     for i, targetColumnData in enumerate(dataframe[targetColumn].values):
+         if targetColumnData in positiveGroup:
+             # assigning with .loc gets rid of the SettingWithCopy warning
+             dataframe.loc[i, targetColumn] = 1
+         else:
+             dataframe.loc[i, targetColumn] = 0
+
+
+ def mainDataWrangling(data):
+     # 1. Throw away the columns that we don't need:
+     # only keep the columns that we think are pertinent to scoring
+     columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57]
+     data = data.iloc[:, columnsToKeep]
+
+     # 2. Merge the three development cost columns to get a single column for insignificant cost
+     data['insignificant'] = data.apply(
+         lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']), axis=1)
+     data.drop(data.iloc[:, 1:4], axis=1, inplace=True)
+
+     # 3. Replace headers with their compact forms.
+     # The column names currently sit in the first row of values; we attach a name to the newly
+     # created column here before copying that value row to the headers.
+     data.iloc[0, -1] = 'insignificant'
+     data.iloc[0, 0] = 'projectName'
+     data.iloc[0, -11:-1] = data.columns[-11:-1].values  # copy the risk vertical names to the header row
+     data.columns = data.iloc[0, :].values  # copy the first value row to the headers
+     data = data.drop([0])  # remove the first value row
+     data.reset_index(drop=True, inplace=True)  # reset the row indices
+     data.insert(0, 'insignificant', data.pop('insignificant'))  # move the insignificant column to the far left
+
+     # 4. Remove/replace missing data
+     data = data.dropna(subset=data.columns[-10:].values)  # drop all samples for which risk scoring hasn't yet been done
+     data.reset_index(drop=True, inplace=True)  # reset the row indices
+     data['howEssentialHumanInTheLoop'].fillna('low', inplace=True)  # replace NaNs for the contingent question with 'low'
+
+     # 5. Perform one-hot encoding and other encodings
+     generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
+     generateOneHot(data, 'soceityLevel', societyGroups)
+     convertToBinaryColumn(data, 'externalParties', ['yes'])
+     data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
+         {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
+     generateOneHot(data, 'dataType', dataTypeGroups)
+     data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
+     data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
+     data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(
+         {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
+     data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(
+         {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
+     generateOneHot(data, 'capabilities', capabilitiesGroups)
+     data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})
+
+     # Convert the risk level columns to numbers:
+     # binary columns for Low
+     for riskColumn in data.columns[-10:]:
+         data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
+     # binary columns for High
+     for riskColumn in data.columns[-20:-10]:
+         data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})
+
+     data.insert(0, 'projectName', data.pop('projectName'))
+     data['insignificant'] = pd.to_numeric(data['insignificant'])
+     data['externalParties'] = pd.to_numeric(data['externalParties'])
+
+     return data
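A minimal sketch of how the grouped one-hot encoding behaves; the toy dataType values are invented for illustration, and it assumes data_preprocessing.py is importable:

import pandas as pd
from data_preprocessing import generateOneHot, dataTypeGroups

toy = pd.DataFrame({'dataType': ['dataTypePersonal,dataTypeRestricted', 'dataTypeRestricted', 'none']})
# groups mix single strings and lists; all members of a list share one indicator column
generateOneHot(toy, 'dataType', dataTypeGroups)
print(toy)
# the original dataType column is popped; two indicator columns remain, inserted at the left:
#   dataType_dataTypeRestricted, dataType_dataTypePersonal_dataTypeSensistivePersonal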
kmeans.py ADDED
@@ -0,0 +1,40 @@
+ from matplotlib import pyplot as plt
+ from sklearn.cluster import KMeans
+ from sklearn.metrics import silhouette_score
+
+
+ def calculate_wcss(data):
+     wcss = []
+     for i in range(1, 11):
+         kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
+         kmeans.fit(data)
+         wcss.append(kmeans.inertia_)
+     return wcss
+
+
+ def calculate_silhouette_scores(data):
+     scores = []
+     range_values = range(2, 11)
+     for i in range_values:
+         kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
+         kmeans.fit(data)
+         score = silhouette_score(data, kmeans.labels_, metric='euclidean')
+         scores.append(score)
+     return scores
+
+
+ def plot_elbow(wcss):
+     plt.plot(range(1, 11), wcss)
+     plt.title('Elbow Method')
+     plt.xlabel('Number of clusters')
+     plt.ylabel('WCSS')
+     plt.show()
+
+
+ def get_optimal_clusters_silhouette(scores):
+     optimal_clusters = scores.index(max(scores)) + 2  # +2 because range_values starts from 2
+     print(f"Optimal number of clusters: {optimal_clusters}")
+     return optimal_clusters
+
+
+ def fit_kmeans(data, n_clusters):
+     kmeans = KMeans(n_clusters=n_clusters, random_state=0)
+     clusters = kmeans.fit_predict(data)
+     data['cluster'] = clusters  # note: mutates the passed frame in place
+     return kmeans, data
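A quick sanity check of these helpers, sketched on synthetic data with scikit-learn's make_blobs (the blob parameters are arbitrary):

import pandas as pd
from sklearn.datasets import make_blobs
from kmeans import calculate_wcss, calculate_silhouette_scores, get_optimal_clusters_silhouette, fit_kmeans

X, _ = make_blobs(n_samples=200, centers=4, random_state=0)
data = pd.DataFrame(X, columns=['x1', 'x2'])
wcss = calculate_wcss(data)                 # inertia for k = 1..10
scores = calculate_silhouette_scores(data)  # silhouette for k = 2..10
k = get_optimal_clusters_silhouette(scores) # expected to pick 4 for well-separated blobs
kmeans, clustered = fit_kmeans(data, k)     # adds a 'cluster' column to data in place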
pca.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+ from sklearn.decomposition import PCA
+ import plotly.express as px
+
+
+ def perform_pca(data, n_components):
+     pca = PCA(n_components=n_components)
+     principalComponents = pca.fit_transform(data)
+     principalDf = pd.DataFrame(data=principalComponents,
+                                columns=[f'principal component {i+1}' for i in range(n_components)])
+     return principalDf
+
+
+ def plot_pca(clustered_data, principalDf, df, information_columns):
+     clustered_data = clustered_data.reset_index()  # to make sure the indices match with df
+     finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
+     finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
+
+     finalDf.drop(columns=['index'], inplace=True)
+     fig = px.scatter(finalDf,
+                      x='principal component 1',
+                      y='principal component 2',
+                      color='cluster',
+                      hover_data=information_columns,
+                      title='2 Component PCA',
+                      labels={'principal component 1': 'Principal Component 1',
+                              'principal component 2': 'Principal Component 2'},
+                      color_continuous_scale='viridis')
+     # fig.show()
+     return fig
+
+
+ # def get_common_features(data):
+ #     common_features = {}
+ #     for cluster in data['cluster'].unique():
+ #         cluster_data = data[data['cluster'] == cluster]
+ #         cluster_features_counts = {}
+ #         for column in cluster_data.drop(columns=["cluster"]).columns:
+ #             top_feature = cluster_data[column].mode()[0]  # use mode to find the most common category
+ #             if top_feature != 0:  # add only if the most common feature is not 0
+ #                 cluster_features_counts[column] = top_feature
+ #         common_features[cluster] = cluster_features_counts
+ #     return common_features
+
+
+ def plot_pca_3D(clustered_data, principalDf, df, information_columns):
+     clustered_data = clustered_data.reset_index()  # to make sure the indices match with df
+     finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
+     finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
+     finalDf.drop(columns=['index'], inplace=True)
+
+     # common_features = get_common_features(clustered_data)
+     # # Add the most common features to finalDf
+     # for cluster, features in common_features.items():
+     #     for feature, value in features.items():
+     #         finalDf.loc[finalDf['cluster'] == cluster, f"Most common {feature}"] = value
+
+     hover_data = information_columns  # + [col for col in finalDf.columns if "Most common" in col]
+     fig = px.scatter_3d(finalDf,
+                         x='principal component 1',
+                         y='principal component 2',
+                         z='principal component 3',
+                         color='cluster',
+                         hover_data=hover_data,
+                         title='3 Component PCA',
+                         labels={f'principal component {i+1}': f'Principal Component {i+1}' for i in range(3)},
+                         color_continuous_scale='viridis')
+     # fig.show()
+     return fig
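A minimal sketch of perform_pca on invented data:

import numpy as np
import pandas as pd
from pca import perform_pca

rng = np.random.default_rng(0)
clustered = pd.DataFrame(rng.random((50, 5)), columns=[f'f{i}' for i in range(5)])
clustered['cluster'] = rng.integers(0, 3, 50)  # invented labels, as fit_kmeans would assign

principalDf = perform_pca(clustered, 2)
print(principalDf.columns.tolist())  # ['principal component 1', 'principal component 2']

One design note: fit_kmeans in kmeans.py assigns the cluster column in place, so the frame app.py passes to perform_pca already contains the cluster label and it participates in the projection; dropping that column before PCA would be the alternative if that is unintended.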