"""Helpers that encode the raw project risk data into numeric columns."""
import pandas as pd

# Each entry is either a single category (str) or a list of categories that
# are merged into one indicator column by generateOneHot.
enterpriseGroups = ['facialRecognition', ['safetySecurity', 'recruitment', 'biometricData']]
societyGroups = [['policing', 'controlAccessToServices']]
dataTypeGroups = [['dataTypePersonal', 'dataTypeSensistivePersonal'], ['dataTypeRestricted']]
capabilitiesGroups = ['decisionSupportSystems']

technicalRisks = ['Robustness', 'Efficacy', 'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
riskVerticals = ['Overall'] + governanceRisks + technicalRisks


def mergeCostColumns(home, commisioned, licensed):
    """Return 1 if any of the three cost values equals 'insignificant', else 0."""
    return int('insignificant' in (home, commisioned, licensed))


def generateUniqueEntries(targetColumn):
    """Return the distinct comma-separated entries found in a column.

    Args:
        targetColumn: pandas Series whose values are comma-separated strings.

    Returns:
        A list of the unique entries; the order is not defined.
    """
    uniqueEntries = set()
    for cell in targetColumn.values:
        uniqueEntries.update(cell.split(','))
    return list(uniqueEntries)


def generateOneHot(dataframe, targetColumn, groups):
    """Replace a comma-separated column by one 0/1 indicator column per group.

    The dataframe is modified in place. For every group a column named
    ``<targetColumn>_<member1>_<member2>...`` is inserted at the far left; it
    holds 1 where the row's comma-separated value contains at least one
    member of the group. The target column itself is removed at the end.

    Args:
        dataframe: DataFrame to modify in place.
        targetColumn: name of the column holding comma-separated strings.
        groups: iterable whose items are either a single category (str) or a
            non-empty list of categories sharing one indicator column.
    """
    # Split every cell once instead of once per group and per group member.
    tokenSets = [set(cell.split(',')) for cell in dataframe[targetColumn].values]

    for group in groups:
        members = [group] if isinstance(group, str) else list(group)
        groupColumnName = '_'.join([targetColumn] + members)

        flags = [int(not tokens.isdisjoint(members)) for tokens in tokenSets]
        # Assign the whole column positionally. Writing cell by cell with
        # .loc[i, ...] is only correct when the index is 0..n-1; on any other
        # index it appends new rows instead of updating the existing ones.
        dataframe[groupColumnName] = pd.Series(flags, dtype='int64').to_numpy()
        # move the new column to the far left
        dataframe.insert(0, groupColumnName, dataframe.pop(groupColumnName))

    dataframe.pop(targetColumn)


def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
    """Overwrite a column in place with 1 for positive values and 0 otherwise.

    Args:
        dataframe: DataFrame to modify in place.
        targetColumn: name of the column to convert.
        positiveGroup: container of the values that map to 1; every other
            value maps to 0.
    """
    flags = [1 if value in positiveGroup else 0
             for value in dataframe[targetColumn].values]
    # Replace the column in one positional assignment so the result does not
    # depend on the index labels and no ints are written into a string column.
    dataframe[targetColumn] = pd.Series(flags, dtype='int64').to_numpy()


def mainDataWrangling(data):
    """Turn the raw data table into a numerically encoded DataFrame.

    Args:
        data: raw DataFrame. Its first value row holds the compact column
            names; the risk columns and the three cost columns
            ('homeBuiltAmount', 'commisionedAmount', 'licensedAmount') are
            identified through the existing headers / positions.

    Returns:
        A new DataFrame with 'projectName' as first column, the encoded
        feature columns, the original risk columns and, for each risk column,
        a '<risk>_binaryLow' and a '<risk>_binaryHigh' indicator column.
        Rows without a value in any of the risk columns are dropped.
    """
    # 1. Throw away the columns that we don't need
    # only keep the columns that we think are pertinent to scoring
    columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45,
                     48, 49, 50, 51, 52, 53, 54, 55, 56, 57]
    # Explicit copy: the frame is modified below and must not be a view of
    # the caller's data.
    data = data.iloc[:, columnsToKeep].copy()

    # 2. Merge the three development cost columns to get a single column for
    #    insignificant cost
    insignificantFlags = [
        mergeCostColumns(home, commisioned, licensed)
        for home, commisioned, licensed in zip(data['homeBuiltAmount'],
                                               data['commisionedAmount'],
                                               data['licensedAmount'])
    ]
    data['insignificant'] = pd.Series(insignificantFlags, index=data.index)
    data = data.drop(columns=data.columns[1:4])

    # 3. Replace headers with their compact forms.
    # The compact names live in the first value row. Build the new header
    # list from that row instead of writing names into the value cells, which
    # would push strings into the integer 'insignificant' column.
    newColumns = data.iloc[0].tolist()
    newColumns[0] = 'projectName'
    newColumns[-1] = 'insignificant'
    # the risk vertical names are taken from the existing headers
    newColumns[-11:-1] = list(data.columns[-11:-1])
    # remove the header row by position and reset the row indices
    data = data.iloc[1:].reset_index(drop=True)
    data.columns = newColumns
    # move the insignificant column to the far left
    data.insert(0, 'insignificant', data.pop('insignificant'))

    # 4. Remove/replace missing data
    # drop all the samples for which risk scoring hasn't yet been done
    data = data.dropna(subset=list(data.columns[-10:])).reset_index(drop=True)
    # replace NaNs for contingent question with 'low'; assign the result back
    # because an in-place fillna on a selected column does not reliably
    # update the parent frame
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].fillna('low')

    # 5. Perform one hot encoding and other encoding
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map(
        {'low': 0, 'medium': 0.5, 'high': 1})
    damageScale = {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1}
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(damageScale)
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(damageScale)
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map(
        {'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # convert risk level columns to numbers
    # The risk columns are the last ten columns at this point; capture them
    # once because the loops below append new columns.
    riskColumns = list(data.columns[-10:])
    # creation of binary columns for Low
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map(
            {'Low': 1, 'Medium': 0, 'High': 0})
    # creation of binary columns for High
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map(
            {'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])
    return data