|
import pandas as pd |
|
|
|
# Group definitions passed to generateOneHot. A bare string is a group of its
# own; a nested list is collapsed into a single indicator column.
enterpriseGroups = [
    'facialRecognition',
    ['safetySecurity', 'recruitment', 'biometricData'],
]
societyGroups = [
    ['policing', 'controlAccessToServices'],
]
# NOTE(review): 'dataTypeSensistivePersonal' is kept verbatim; it presumably
# has to match the spelling used in the raw data - confirm before correcting.
dataTypeGroups = [
    ['dataTypePersonal', 'dataTypeSensistivePersonal'],
    ['dataTypeRestricted'],
]
capabilitiesGroups = ['decisionSupportSystems']

# Risk vertical names: 'Overall' first, then governance, then technical.
technicalRisks = ['Robustness', 'Efficacy', 'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
riskVerticals = ['Overall', *governanceRisks, *technicalRisks]
|
|
|
|
|
def mergeCostColumns(home, commisioned, licensed):
    """Return 1 if any of the three cost values equals 'insignificant', else 0.

    Args:
        home: Value from the home-built cost column.
        commisioned: Value from the commissioned cost column.
        licensed: Value from the licensed cost column.

    Returns:
        int: 1 when at least one argument compares equal to the string
        'insignificant', otherwise 0.
    """
    costs = (home, commisioned, licensed)
    # Checked left to right, stopping at the first match.
    anyInsignificant = any(cost == 'insignificant' for cost in costs)
    return 1 if anyInsignificant else 0
|
|
|
|
|
def generateUniqueEntries(targetColumn):
    """List the distinct comma-separated entries found in a column.

    Each cell is split on ',' and the pieces are de-duplicated. Entries are
    returned in the order they are first encountered, so the result is the
    same on every run (a plain ``set`` would yield an order that depends on
    string hash randomisation).

    Args:
        targetColumn: Object exposing ``.values`` (e.g. a pandas Series) whose
            cells are comma-separated strings. Cells that are not strings,
            such as NaN/None for missing data, are skipped.

    Returns:
        list: The unique entries, in first-seen order.
    """
    # A dict serves as an insertion-ordered set.
    uniqueEntries = {}
    for cell in targetColumn.values:
        if not isinstance(cell, str):
            # Missing values have nothing to split.
            continue
        uniqueEntries.update(dict.fromkeys(cell.split(',')))
    return list(uniqueEntries)
|
|
|
|
|
def generateOneHot(dataframe, targetColumn, groups):
    """Replace a comma-separated column with 0/1 indicator columns, in place.

    For every group a new integer column is created whose value is 1 on rows
    where the target cell (split on ',') contains the group's entry, or any
    of its entries when the group is a list, and 0 otherwise. Each new column
    is moved to position 0, so later groups end up to the left of earlier
    ones. The target column is removed at the end.

    Args:
        dataframe: pandas DataFrame to modify in place.
        targetColumn: Name of the column holding comma-separated strings.
            Non-string cells (e.g. NaN) are treated as having no entries.
        groups: Iterable whose items are either a single entry (str) or a
            list of entries to be merged into one indicator column. The new
            column is named ``<targetColumn>_<entry>[_<entry>...]``.

    Returns:
        None. ``dataframe`` is mutated.
    """
    # Split every cell once instead of once per group and per group member.
    cellEntries = [
        set(cell.split(',')) if isinstance(cell, str) else set()
        for cell in dataframe[targetColumn].values
    ]

    for group in groups:
        members = [group] if isinstance(group, str) else list(group)
        # An empty group keeps its historical (empty) column name.
        groupColumnName = '_'.join([targetColumn, *members]) if members else ''

        flags = [0 if entries.isdisjoint(members) else 1 for entries in cellEntries]

        # Assign the whole column positionally. Writing cell by cell with
        # .loc[i, ...] treats the enumerate position as an index *label* and
        # touches the wrong rows whenever the index is not 0..n-1.
        dataframe[groupColumnName] = flags
        dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))

    dataframe.pop(targetColumn)
|
|
|
|
|
def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
    """Overwrite a column with 1/0 flags according to membership, in place.

    Args:
        dataframe: pandas DataFrame to modify in place.
        targetColumn: Name of the column to convert.
        positiveGroup: Container tested with ``value in positiveGroup``;
            cells found in it become 1, all others become 0.

    Returns:
        None. ``dataframe[targetColumn]`` is replaced by an integer column.
    """
    # Build all flags first and assign them positionally in one step. Writing
    # with .loc[i, ...] would use the enumerate position as an index *label*,
    # which hits the wrong rows (or appends new ones) on a non-0..n-1 index.
    dataframe[targetColumn] = [
        1 if value in positiveGroup else 0
        for value in dataframe[targetColumn].values
    ]
|
|
|
|
|
def mainDataWrangling(data):
    """Turn the raw export into a numeric feature table plus binary targets.

    Steps, in order:
      1. Keep a fixed set of columns by position.
      2. Collapse the three cost columns into one 'insignificant' flag.
      3. Rename columns from the first data row (with a few overrides) and
         drop that row.
      4. Drop rows missing any of the last ten columns; default a missing
         'howEssentialHumanInTheLoop' to 'low'.
      5. Encode the categorical columns as indicators or ordinal scores.
      6. Add '<col>_binaryLow' and '<col>_binaryHigh' columns for each of the
         last ten columns.

    Args:
        data: pandas DataFrame holding the raw export. Its first row is used
            as the source of column names; the cost columns and the last ten
            kept columns are expected to already carry usable names.
            NOTE(review): the positional layout in ``columnsToKeep`` is
            assumed to match the export format - confirm against the source.

    Returns:
        pandas.DataFrame: the wrangled frame, with 'projectName' first.
    """
    columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45,
                     48, 49, 50, 51, 52, 53, 54, 55, 56, 57]
    # Work on a copy so the column assignments below never target a slice of
    # the caller's frame (avoids SettingWithCopy problems).
    data = data.iloc[:, columnsToKeep].copy()

    # One flag replaces the three cost columns, which sit at positions 1-3.
    data['insignificant'] = data.apply(
        lambda row: mergeCostColumns(
            row['homeBuiltAmount'], row['commisionedAmount'], row['licensedAmount']),
        axis=1)
    data = data.drop(columns=data.columns[1:4])

    # Build the new header from the first row, overriding the first column,
    # the ten columns before the last (they keep their current names) and the
    # freshly added flag. Building it in a list avoids writing strings into
    # the integer 'insignificant' column, which pandas deprecates.
    header = list(data.iloc[0, :].values)
    header[0] = 'projectName'
    header[-11:-1] = data.columns[-11:-1].values
    header[-1] = 'insignificant'
    data.columns = header
    data = data.iloc[1:].reset_index(drop=True)
    data.insert(0, 'insignificant', data.pop('insignificant'))

    # Rows without a value in every one of the last ten columns are unusable.
    data = data.dropna(subset=data.columns[-10:].values).reset_index(drop=True)
    # Assign the result back: a chained inplace fillna on data[col] does not
    # update the frame when pandas Copy-on-Write is active.
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].fillna('low')

    # Categorical encodings. Values absent from a mapping become NaN.
    damageScale = {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1}
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map(
        {'low': 0, 'medium': 0.5, 'high': 1})
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(damageScale)
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(damageScale)
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map(
        {'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # Binary targets for the last ten columns: all '_binaryLow' columns are
    # appended first, then all '_binaryHigh' columns.
    riskColumns = list(data.columns[-10:])
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])

    return data