wu981526092 committed
Commit: 760a88c
Parent(s): 3fae384
update

Files changed:
- app.py +98 -0
- data_preprocessing.py +123 -0
- kmeans.py +40 -0
- pca.py +66 -0
app.py
ADDED
@@ -0,0 +1,98 @@
import streamlit as st
import pandas as pd
import plotly.express as px
from kmeans import calculate_wcss, fit_kmeans, calculate_silhouette_scores, get_optimal_clusters_silhouette, plot_elbow
from pca import perform_pca, plot_pca, plot_pca_3D
from data_preprocessing import mainDataWrangling


@st.cache_data
def convert_df(df):
    return df.to_csv().encode('utf-8')


# Streamlit code
st.set_option('deprecation.showPyplotGlobalUse', False)

st.title('📊 Holistic AI: Risk Mapping Data Study: Optimal Cluster Analysis with PCA Visualization')
uploaded_file = st.file_uploader("📤 Upload a CSV file", type='csv')

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)

    raw_data = mainDataWrangling(df)
    raw_data = raw_data.replace("High", 2).replace("Medium", 1).replace("Low", 0)
    project_names = raw_data["projectName"]
    information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation", "Robustness",
                           "Efficacy", "Privacy", "Bias", "Explainability"]

    data = raw_data.drop(columns=information_columns)
    st.subheader('🔍 Data Preview')
    st.write(df)

    st.subheader('🔍 Preprocessed Data')
    st.write(raw_data)

    # Step 1: Plot Elbow Method and Silhouette Scores
    wcss = calculate_wcss(data)
    silhouette_scores = calculate_silhouette_scores(data)
    st.header('Find Optimal Clusters: The Elbow Method and Silhouette Scores')

    fig = px.line(x=list(range(2, len(silhouette_scores) + 2)), y=silhouette_scores,
                  labels={'x': 'Number of Clusters', 'y': 'Silhouette Scores'}, title='Silhouette Scores')
    st.plotly_chart(fig)

    # WCSS is computed for k = 1..10 in calculate_wcss, so the x-axis starts at 1.
    fig = px.line(x=list(range(1, len(wcss) + 1)), y=wcss, labels={'x': 'Number of Clusters', 'y': 'WCSS'},
                  title='Elbow Method')
    st.plotly_chart(fig)

    st.markdown('''
**Directions:**

- Select the optimal number of clusters based on the Silhouette Scores and the Elbow graph.

- For the Silhouette Scores, the optimal number of clusters corresponds to the peak of the plot.

- For the Elbow graph, the curve changes rapidly at one point, creating an elbow shape. From that point on, the curve runs almost parallel to the X-axis. The K value at this point is the optimal number of clusters.
''')

    optimal_clusters_silhouette = get_optimal_clusters_silhouette(silhouette_scores)
    st.write(f'Optimal number of clusters based on Silhouette Scores is: {optimal_clusters_silhouette}')

    # Default the slider to the silhouette optimum, as described in its label.
    optimal_clusters_elbow = st.slider('Number of clusters (defaults to the optimal number from Silhouette Scores)',
                                       min_value=2, max_value=len(wcss) + 1,
                                       value=optimal_clusters_silhouette, step=1)

    # Step 2: KMeans fitting and PCA
    st.header('KMeans Clustering and PCA')
    st.write('Now we fit the KMeans algorithm with your chosen number of clusters, and perform PCA for visualization.')
    kmeans, clustered_data = fit_kmeans(data, optimal_clusters_elbow)

    # Add project names back to the data
    display_data = pd.concat([project_names, clustered_data], axis=1)

    st.subheader('📌 Clustered Data')
    st.write(display_data[["projectName", "cluster"]])

    principalDf = perform_pca(clustered_data, 2)

    st.subheader('📊 2D PCA Plot')
    fig2D = plot_pca(clustered_data, principalDf, raw_data, information_columns)
    st.plotly_chart(fig2D)

    principalDf_3D = perform_pca(clustered_data, 3)
    st.subheader('📊 3D PCA Plot')
    fig3D = plot_pca_3D(clustered_data, principalDf_3D, raw_data, information_columns)
    st.plotly_chart(fig3D)

    st.subheader('📩 Data Download')
    csv = convert_df(display_data)
    st.download_button(
        label="Download clustered data as CSV",
        data=csv,
        file_name='clustered_data.csv',
        mime='text/csv',
    )
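For reference, the same pipeline can be driven without the Streamlit UI. This is an illustrative sketch only: the CSV filename and the call sequence are assumptions based on the modules added in this commit, not part of app.py itself.

# Headless sketch of the app.py flow (the CSV path is a placeholder).
import pandas as pd
from data_preprocessing import mainDataWrangling
from kmeans import calculate_silhouette_scores, get_optimal_clusters_silhouette, fit_kmeans
from pca import perform_pca, plot_pca

df = pd.read_csv("risk_mapping_export.csv")  # hypothetical input file
raw_data = mainDataWrangling(df).replace("High", 2).replace("Medium", 1).replace("Low", 0)
information_columns = ["projectName", "Overall", "Financial", "Reputational", "Ethics", "Regulation",
                       "Robustness", "Efficacy", "Privacy", "Bias", "Explainability"]
data = raw_data.drop(columns=information_columns)

k = get_optimal_clusters_silhouette(calculate_silhouette_scores(data))  # silhouette-optimal k
_, clustered = fit_kmeans(data, k)                                      # adds a 'cluster' column
components = perform_pca(clustered, 2)                                  # 2D projection for plotting
plot_pca(clustered, components, raw_data, information_columns).show()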
data_preprocessing.py
ADDED
@@ -0,0 +1,123 @@
import pandas as pd

enterpriseGroups = ['facialRecognition', ['safetySecurity', 'recruitment', 'biometricData']]
societyGroups = [['policing', 'controlAccessToServices']]
dataTypeGroups = [['dataTypePersonal', 'dataTypeSensistivePersonal'], ['dataTypeRestricted']]
capabilitiesGroups = ['decisionSupportSystems']

technicalRisks = ['Robustness', 'Efficacy',
                  'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
riskVerticals = ['Overall'] + governanceRisks + technicalRisks


def mergeCostColumns(home, commisioned, licensed):
    if home == 'insignificant' or commisioned == 'insignificant' or licensed == 'insignificant':
        output = 1
    else:
        output = 0
    return output


def generateUniqueEntries(targetColumn):
    listOfEntries = []
    for i in targetColumn.values:
        listOfEntries += i.split(',')
    listOfEntries = set(listOfEntries)
    return list(listOfEntries)


def generateOneHot(dataframe, targetColumn, groups):
    for group in groups:
        groupColumnName = ''
        if type(group) == str:
            groupColumnName = targetColumn + '_' + group
        else:
            for element in group:
                if groupColumnName == '':
                    groupColumnName += targetColumn + '_' + element
                else:
                    groupColumnName += '_' + element
        dataframe[groupColumnName] = 0

        for i, targetColumnData in enumerate(dataframe[targetColumn].values):
            if type(group) == str:
                if group in targetColumnData.split(','):
                    dataframe.loc[i, groupColumnName] = 1  # this method of assignment gets rid of the SettingWithCopy warning
            else:
                for element in group:
                    if element in targetColumnData.split(','):
                        dataframe.loc[i, groupColumnName] = 1  # this method of assignment gets rid of the SettingWithCopy warning

        dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))  # move the new column to the far left

    dataframe.pop(targetColumn)


def convertToBinaryColumn(dataframe, targetColumn,
                          positiveGroup):  # anything in the positive group gets assigned 1, o/w zero
    for i, targetColumnData in enumerate(dataframe[targetColumn].values):
        if targetColumnData in positiveGroup:
            dataframe.loc[i, targetColumn] = 1  # this method of assignment gets rid of the SettingWithCopy warning
        else:
            dataframe.loc[i, targetColumn] = 0


def mainDataWrangling(data):
    # 1. Throw away the columns that we don't need
    columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56,
                     57]  # only keep the columns that we think are pertinent to scoring
    data = data.iloc[:, columnsToKeep]

    # 2. Merge the three development cost columns to get a single column for insignificant cost
    data['insignificant'] = data.apply(
        lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']), axis=1)
    data.drop(data.iloc[:, 1:4], axis=1, inplace=True)

    # 3. Replace headers with their compact forms
    data.iloc[0, -1] = 'insignificant'  # column names are currently in the first row of values; attach a name to the newly created column here, before copying the value row to the headers
    data.iloc[0, 0] = 'projectName'
    data.iloc[0, -11:-1] = data.columns[-11:-1].values  # copy the risk vertical names to the header row
    data.columns = data.iloc[0, :].values  # copy the first value row to the headers
    data = data.drop([0])  # remove the first value row
    data.reset_index(drop=True, inplace=True)  # reset the row indices
    data.insert(0, 'insignificant', data.pop('insignificant'))  # move the insignificant column to the far left

    # 4. Remove/replace missing data
    data = data.dropna(
        subset=data.columns[-10:].values)  # drop all the samples for which risk scoring hasn't yet been done
    data.reset_index(drop=True, inplace=True)  # reset the row indices
    data['howEssentialHumanInTheLoop'].fillna('low', inplace=True)  # replace NaNs for the contingent question with 'low'

    # 5. Perform one-hot encoding and other encoding
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # convert risk level columns to numbers
    # creation of binary columns for Low
    for riskColumn in data.columns[-10:]:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
    # creation of binary columns for High
    for riskColumn in data.columns[-20:-10]:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])

    return data
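To make the grouped one-hot behaviour concrete, here is a small sketch on a toy frame (the column names and values below are invented purely for illustration): a plain string in the groups list becomes its own indicator column, while a list of entries is pooled into a single shared indicator column.

import pandas as pd
from data_preprocessing import generateOneHot, convertToBinaryColumn

# Toy frame: one comma-separated multi-select column and one yes/no column.
toy = pd.DataFrame({
    "useCase": ["facialRecognition,recruitment", "safetySecurity", "other"],
    "externalParties": ["yes", "no", "yes"],
})

generateOneHot(toy, "useCase", ["facialRecognition", ["safetySecurity", "recruitment"]])
convertToBinaryColumn(toy, "externalParties", ["yes"])
print(toy)
# The useCase column is replaced by useCase_facialRecognition and
# useCase_safetySecurity_recruitment indicators; externalParties becomes 1/0.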
kmeans.py
ADDED
@@ -0,0 +1,40 @@
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def calculate_wcss(data):
    # Within-cluster sum of squares (inertia) for k = 1..10, used for the elbow plot.
    wcss = []
    for i in range(1, 11):
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    return wcss


def calculate_silhouette_scores(data):
    # Mean silhouette score for k = 2..10 (the silhouette is undefined for a single cluster).
    scores = []
    range_values = range(2, 11)
    for i in range_values:
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(data)
        score = silhouette_score(data, kmeans.labels_, metric='euclidean')
        scores.append(score)
    return scores


def plot_elbow(wcss):
    plt.plot(range(1, 11), wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()


def get_optimal_clusters_silhouette(scores):
    optimal_clusters = scores.index(max(scores)) + 2  # +2 because range_values starts from 2
    print(f"Optimal number of clusters: {optimal_clusters}")
    return optimal_clusters


def fit_kmeans(data, n_clusters):
    # Fit KMeans and append the assigned labels as a 'cluster' column (modifies data in place).
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(data)
    data['cluster'] = clusters
    return kmeans, data
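A minimal usage sketch of these helpers on synthetic data (scikit-learn's make_blobs and the feature names f1/f2 are used here purely for illustration):

import pandas as pd
from sklearn.datasets import make_blobs
from kmeans import calculate_wcss, calculate_silhouette_scores, get_optimal_clusters_silhouette, fit_kmeans

# Three well-separated synthetic clusters.
X, _ = make_blobs(n_samples=60, centers=3, random_state=0)
data = pd.DataFrame(X, columns=["f1", "f2"])

wcss = calculate_wcss(data)                     # inertia for k = 1..10 (elbow curve)
scores = calculate_silhouette_scores(data)      # silhouette for k = 2..10
k = get_optimal_clusters_silhouette(scores)     # position of the silhouette peak
model, labelled = fit_kmeans(data, k)           # adds a 'cluster' column to data
print(labelled["cluster"].value_counts())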
pca.py
ADDED
@@ -0,0 +1,66 @@
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px


def perform_pca(data, n_components):
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(data)
    principalDf = pd.DataFrame(data=principalComponents,
                               columns=[f'principal component {i+1}' for i in range(n_components)])
    return principalDf


def plot_pca(clustered_data, principalDf, df, information_columns):
    clustered_data = clustered_data.reset_index()  # to make sure indices match with df
    finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
    finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)

    finalDf.drop(columns=['index'], inplace=True)
    fig = px.scatter(finalDf,
                     x='principal component 1',
                     y='principal component 2',
                     color='cluster',
                     hover_data=information_columns,
                     title='2 Component PCA',
                     labels={'principal component 1': 'Principal Component 1',
                             'principal component 2': 'Principal Component 2'},
                     color_continuous_scale='viridis')
    # fig.show()
    return fig


# def get_common_features(data):
#     common_features = {}
#     for cluster in data['cluster'].unique():
#         cluster_data = data[data['cluster'] == cluster]
#         cluster_features_counts = {}
#         for column in cluster_data.drop(columns=["cluster"]).columns:
#             top_feature = cluster_data[column].mode()[0]  # Use mode to find the most common category
#             if top_feature != 0:  # Add only if most common feature is not 0
#                 cluster_features_counts[column] = top_feature
#         common_features[cluster] = cluster_features_counts
#     return common_features


def plot_pca_3D(clustered_data, principalDf, df, information_columns):
    clustered_data = clustered_data.reset_index()  # to make sure indices match with df
    finalDf = df.merge(clustered_data[['index', 'cluster']], left_index=True, right_on='index')
    finalDf = finalDf.merge(principalDf, left_index=True, right_index=True)
    finalDf.drop(columns=['index'], inplace=True)

    # common_features = get_common_features(clustered_data)
    # # Add most common features to finalDf
    # for cluster, features in common_features.items():
    #     for feature, value in features.items():
    #         finalDf.loc[finalDf['cluster'] == cluster, f"Most common {feature}"] = value

    hover_data = information_columns  # + [col for col in finalDf.columns if "Most common" in col]
    fig = px.scatter_3d(finalDf,
                        x='principal component 1',
                        y='principal component 2',
                        z='principal component 3',
                        color='cluster',
                        hover_data=hover_data,
                        title='3 Component PCA',
                        labels={f'principal component {i+1}': f'Principal Component {i+1}' for i in range(3)},
                        color_continuous_scale='viridis')
    # fig.show()
    return fig
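And a corresponding sketch for the 2D plotting helper, again on toy data (the projectName values and the fixed cluster count of 3 are invented; in the app the information columns come from the preprocessed risk data):

import pandas as pd
from sklearn.datasets import make_blobs
from kmeans import fit_kmeans
from pca import perform_pca, plot_pca

# Toy feature matrix plus a parallel "information" frame for hover labels.
X, _ = make_blobs(n_samples=30, centers=3, random_state=0)
features = pd.DataFrame(X, columns=["f1", "f2"])
info = pd.DataFrame({"projectName": [f"project_{i}" for i in range(len(features))]})

_, clustered = fit_kmeans(features, 3)     # appends the cluster labels
components = perform_pca(clustered, 2)     # 2-component projection (cluster column included, as in app.py)
fig = plot_pca(clustered, components, info, ["projectName"])
fig.show()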