# Page-scrape residue (not part of the program): "wu981526092's picture" / "update" / "760a88c"
import pandas as pd
# Group specifications handed to generateOneHot by mainDataWrangling.
# Each top-level item yields one indicator column: a bare string stands for a
# single category, a nested list pools several categories into one column.
enterpriseGroups = [
    'facialRecognition',
    ['safetySecurity', 'recruitment', 'biometricData'],
]
societyGroups = [
    ['policing', 'controlAccessToServices'],
]
dataTypeGroups = [
    ['dataTypePersonal', 'dataTypeSensistivePersonal'],
    ['dataTypeRestricted'],
]
capabilitiesGroups = ['decisionSupportSystems']

# Names of the risk verticals; riskVerticals is 'Overall' followed by the
# governance risks and then the technical risks.
technicalRisks = ['Robustness', 'Efficacy', 'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
riskVerticals = ['Overall', *governanceRisks, *technicalRisks]
def mergeCostColumns(home, commisioned, licensed):
    """Collapse three cost answers into a single 0/1 flag.

    Returns 1 if at least one of ``home``, ``commisioned`` or ``licensed``
    equals the string 'insignificant', otherwise 0.
    """
    costAnswers = (home, commisioned, licensed)
    return int(any(answer == 'insignificant' for answer in costAnswers))
def generateUniqueEntries(targetColumn):
    """Collect the distinct comma-separated tokens found in a column.

    ``targetColumn`` must expose ``.values`` yielding strings; every value is
    split on ',' and the distinct pieces are returned as a list. The list
    comes from a set, so its order is unspecified.
    """
    distinctEntries = set()
    for cell in targetColumn.values:
        distinctEntries.update(cell.split(','))
    return list(distinctEntries)
def generateOneHot(dataframe, targetColumn, groups):
    """Replace a comma-separated column with one 0/1 indicator column per group.

    Each item of ``groups`` is either a single string or a collection of
    strings. For every item a new column is added whose name is
    ``targetColumn`` followed by the item's member(s), joined with '_'.
    A row receives 1 when at least one member occurs among the
    comma-separated tokens of that row's ``targetColumn`` cell, else 0.

    The frame is modified in place and nothing is returned: each new column
    is moved to the far left (so later groups end up left of earlier ones),
    and ``targetColumn`` itself is removed once all groups are processed.

    Rows are handled by position, so the frame is not required to carry a
    default 0..n-1 index. Cells must be strings; a non-string cell (such as
    NaN) raises AttributeError from ``.split``.
    """
    cells = dataframe[targetColumn].values
    for group in groups:
        members = [group] if isinstance(group, str) else list(group)
        # An empty group historically produced a column named ''; keep that
        # rather than letting the join collapse to targetColumn itself.
        groupColumnName = '_'.join([targetColumn, *members]) if members else ''

        flags = []
        for cell in cells:
            tokens = cell.split(',')  # split once per row, reused for every member
            flags.append(int(any(member in tokens for member in members)))

        # Whole-column assignment is positional, so it cannot hit the wrong
        # row or append rows the way label-based .loc[i, ...] writes can.
        dataframe[groupColumnName] = flags
        dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))  # move the new column to the far left
    dataframe.pop(targetColumn)
def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
    """Overwrite a column with 1/0 according to membership in ``positiveGroup``.

    Every value of ``dataframe[targetColumn]`` that satisfies
    ``value in positiveGroup`` becomes 1; every other value becomes 0.
    The frame is modified in place and nothing is returned.

    The column is replaced in a single assignment, so the result does not
    depend on the frame having a default 0..n-1 index.
    """
    dataframe[targetColumn] = [
        1 if value in positiveGroup else 0
        for value in dataframe[targetColumn].values
    ]
def mainDataWrangling(data):
    """Convert the raw export into a numeric table of features and risk labels.

    Steps, in order:
      1. keep a fixed set of columns, selected by position;
      2. merge the three cost columns into one 0/1 'insignificant' column;
      3. promote the first value row to the header (the risk columns keep the
         names they already have in the real header);
      4. drop rows with any missing value in the ten risk columns and fill
         missing 'howEssentialHumanInTheLoop' answers with 'low';
      5. encode the categorical answers as indicator columns or numbers and
         add '<risk>_binaryLow' / '<risk>_binaryHigh' columns for each of
         the ten risk columns.

    NOTE(review): this assumes the input has a default 0..n-1 index, that its
    first value row holds the compact column names, and that the column
    positions listed in ``columnsToKeep`` match the export layout — confirm
    against the caller.

    Args:
        data: the raw pandas DataFrame. It is not modified; all work happens
            on a copy.

    Returns:
        The wrangled DataFrame, with 'projectName' as the first column.
    """
    # 1. Throw away the columns that we don't need
    columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56,
                     57]  # only keep the columns that we think are pertinent to scoring
    # Explicit copy: the assignments below must act on an independent frame,
    # not on a selection of the caller's frame (avoids SettingWithCopy issues).
    data = data.iloc[:, columnsToKeep].copy()

    # 2. Merge the three development cost columns to get a single column for insignificant cost
    # Stored as object dtype because step 3 writes the header label
    # 'insignificant' into this column's first row; the column is converted
    # back to numbers at the very end.
    data['insignificant'] = data.apply(
        lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']),
        axis=1).astype(object)
    # Drop the columns at positions 1-3 of the kept set
    # (presumably the three cost columns merged above — TODO confirm).
    data.drop(columns=data.columns[1:4], inplace=True)

    # 3. Replace headers with their compact forms
    # Column names are currently in the first row of values. Attach a name to
    # the newly created column here, before copying that row to the headers.
    data.iloc[0, -1] = 'insignificant'
    data.iloc[0, 0] = 'projectName'
    data.iloc[0, -11:-1] = data.columns[-11:-1].values  # copy the risk vertical names to the header row
    data.columns = data.iloc[0, :].values  # copying the first value row to the headers
    data = data.drop([0])  # remove the first value row
    data.reset_index(drop=True, inplace=True)  # reset the row indices
    data.insert(0, 'insignificant', data.pop('insignificant'))  # move the insignificant column to the far left

    # 4. Remove/replace missing data
    data = data.dropna(
        subset=data.columns[-10:].values)  # drop all the samples for which risk scoring hasn't yet been done
    data.reset_index(drop=True, inplace=True)  # reset the row indices; the encoders below rely on 0..n-1 labels
    # Reassign instead of calling fillna(inplace=True) on the selected column:
    # the chained in-place form does not update the frame under Copy-on-Write.
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].fillna(
        'low')  # replace NaNs for contingent question with 'low'

    # 5. Perform one hot encoding and other encoding
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # convert risk level columns to numbers
    # The ten risk columns are the last ten columns at this point; capture
    # them once so that appending new columns does not shift the selection.
    riskColumns = list(data.columns[-10:])
    # creation of binary columns for Low
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
    # creation of binary columns for High
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])
    return data