File size: 6,096 Bytes
760a88c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import pandas as pd

# Answer-code groupings used by generateOneHot().  Each entry is either a
# single answer code (string) or a list of codes that are merged into one
# combined indicator column.
enterpriseGroups = ['facialRecognition',['safetySecurity','recruitment','biometricData']]
societyGroups = [['policing','controlAccessToServices']]
# NOTE(review): 'dataTypeSensistivePersonal' looks misspelled, but it
# presumably matches the literal answer code in the source survey data —
# confirm against the raw export before changing it.
dataTypeGroups = [['dataTypePersonal','dataTypeSensistivePersonal'],['dataTypeRestricted']]
capabilitiesGroups = ['decisionSupportSystems']

# Risk-scoring verticals: four governance + five technical, plus an
# 'Overall' score — ten columns in total (the trailing columns that
# mainDataWrangling treats as risk scores).
technicalRisks = ['Robustness', 'Efficacy',
                  'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
riskVerticals = ['Overall'] + governanceRisks + technicalRisks


def mergeCostColumns(home, commisioned, licensed):
    """Collapse the three development-cost answers into one binary flag.

    Returns 1 if any of the three cost fields equals 'insignificant',
    otherwise 0.
    """
    return 1 if 'insignificant' in (home, commisioned, licensed) else 0


def generateUniqueEntries(targetColumn):
    """Return the distinct comma-separated tokens found across a Series.

    Each cell is split on ',' and the union of all tokens is returned as a
    list (order unspecified, as with the original set-based implementation).
    """
    uniqueTokens = {token
                    for cell in targetColumn.values
                    for token in cell.split(',')}
    return list(uniqueTokens)


def generateOneHot(dataframe, targetColumn, groups):
    """One-hot encode *targetColumn* in place against *groups*.

    Each entry of *groups* is either a single answer code (str) or a list of
    answer codes merged into one indicator column named
    ``targetColumn_code[_code2_...]``.  A row gets 1 when its comma-separated
    cell contains any code of the group.  Every new column is moved to the
    far left, and the source column is dropped at the end.
    """
    for group in groups:
        # Normalise a lone code and a code list to one shape.
        codes = [group] if isinstance(group, str) else list(group)
        newColumn = targetColumn + '_' + '_'.join(codes)
        dataframe[newColumn] = 0

        for rowLabel, cellValue in enumerate(dataframe[targetColumn].values):
            answers = cellValue.split(',')
            if any(code in answers for code in codes):
                # .loc assignment avoids the SettingWithCopy warning
                dataframe.loc[rowLabel, newColumn] = 1

        # move the new column to the far left
        dataframe.insert(0, newColumn, dataframe.pop(newColumn))

    dataframe.pop(targetColumn)


def convertToBinaryColumn(dataframe, targetColumn,
                          positiveGroup):  # anything in the positive group gets assigned 1, o/w zero
    """Rewrite *targetColumn* in place as 1/0 membership in *positiveGroup*."""
    for rowLabel, cellValue in enumerate(dataframe[targetColumn].values):
        # .loc assignment avoids the SettingWithCopy warning
        dataframe.loc[rowLabel, targetColumn] = 1 if cellValue in positiveGroup else 0


def mainDataWrangling(data):
    """Transform the raw survey export into a numeric feature matrix.

    Expects *data* to be the raw pandas DataFrame where row 0 holds the
    compact column names and the remaining rows are survey responses
    (column positions hard-coded below — coupled to the export format;
    TODO confirm against the source spreadsheet).  Returns a new DataFrame
    with one-hot/ordinal encodings and binary Low/High risk columns.
    """
    # 1. Throw away the columns that we don't need
    columnsToKeep = [1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56,
                     57]  # only keep the columns that we think are pertinent to scoring
    data = data.iloc[:, columnsToKeep]

    # 2. Merge the three development cost columns to get a single column for insignificant cost
    data['insignificant'] = data.apply(
        lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']), axis=1)
    data.drop(data.iloc[:, 1:4], axis=1, inplace=True)  # the three merged cost columns

    # 3. Replace headers with their compact forms
    data.iloc[
        0, -1] = 'insignificant'  # columns are currently in the first row of values. We attach a column name to the newly created column here, before copying the value row to the headers
    data.iloc[0, 0] = 'projectName'
    data.iloc[0, -11:-1] = data.columns[-11:-1].values  # copy the risk vertical names to the header row
    data.columns = data.iloc[0, :].values  # copying the first value row to the headers
    data = data.drop([0])  # remove the first value row
    data.reset_index(drop=True, inplace=True)  # reset the row indices
    data.insert(0, 'insignificant', data.pop('insignificant'))  # move the insignificant column to the far left

    # 4. Remove/replace missing data
    data = data.dropna(
        subset=data.columns[-10:].values)  # drop all the samples for which risk scoring hasn't yet been done
    data.reset_index(drop=True, inplace=True)  # reset the row indices
    # NOTE(review): column-level fillna(inplace=True) is chained assignment and
    # is deprecated in pandas 2.x — verify it still takes effect on this version.
    data['howEssentialHumanInTheLoop'].fillna('low', inplace=True)  # replace NaNs for contingent question with 'low'

    # 5. Perform one hot encoding and other encoding
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    # NOTE(review): 'soceityLevel' spelling presumably matches the survey
    # column name — confirm before correcting.
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(
        {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1})
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # convert risk level columns to numbers
    # creation of binary columns for Low (last 10 columns are the risk verticals)
    for riskColumn in data.columns[-10:]:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
    # creation of binary columns for High (the original 10 risk columns are now at -20:-10,
    # because the ten _binaryLow columns were just appended)
    for riskColumn in data.columns[-20:-10]:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))  # project name to the far left
    # apply/map left these as object dtype; make them numeric for downstream scoring
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])

    return data