File size: 15,898 Bytes
5defafa
 
6e7dc3c
f335959
40d7b09
 
0765d8d
 
015b1a2
f335959
8a73f6f
f335959
076d436
 
 
 
 
 
 
 
 
775d0e1
076d436
 
 
 
693b882
076d436
 
 
 
 
 
 
 
 
 
 
 
775d0e1
076d436
 
 
 
693b882
076d436
 
 
 
 
 
 
 
 
 
 
3b9517e
 
 
 
 
f921051
076d436
 
 
 
 
 
 
 
0c08540
 
 
 
 
 
c00508a
 
 
 
 
f335959
6830d47
 
 
 
 
 
 
f335959
 
 
 
 
076d436
f335959
 
286c449
f335959
076d436
f335959
076d436
f335959
076d436
f335959
076d436
 
7a70a60
f921051
 
 
 
 
 
076d436
 
 
 
 
 
 
 
fd1088f
 
 
 
 
 
 
 
 
076d436
 
 
 
b25bb07
 
 
 
 
 
 
 
f335959
 
076d436
6e7dc3c
f335959
076d436
 
 
 
 
 
 
 
 
c00508a
076d436
 
 
 
 
 
 
 
 
 
 
 
c00508a
f335959
 
286c449
f335959
 
 
 
 
 
076d436
b4a154b
b25bb07
 
 
 
 
 
 
 
 
 
 
076d436
 
f335959
 
 
0765d8d
 
076d436
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6830d47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
import pandas as pd
import numpy as np
from scikit_posthocs import posthoc_nemenyi
from scipy import stats
from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
from statsmodels.stats.multicomp import MultiComparison
from scipy.stats import spearmanr, pearsonr, kendalltau, entropy
from scipy.spatial.distance import jensenshannon
from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_1samp


def test_statistic_variance_ratio(x, y):
    """Return the ratio of the sample variances (``ddof=1``) of *x* over *y*."""
    return np.var(x, ddof=1) / np.var(y, ddof=1)


def test_statistic_mean_difference(x, y):
    """Return ``mean(x) - mean(y)``."""
    return np.mean(x) - np.mean(y)


# Despite the ``test_`` prefix these two are statistics helpers, not test
# cases. Opt them out of pytest collection so that a test module importing
# them does not try to run them (and error on the unknown ``x``/``y``
# fixtures).
test_statistic_variance_ratio.__test__ = False
test_statistic_mean_difference.__test__ = False


def _abs_log(values):
    """Return ``|log(values)|`` element-wise, silencing log(0)/log(nan) warnings."""
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.abs(np.log(values))


def _permutation_test(x, y, statistic, extremeness, num_permutations):
    """Run a two-sample permutation test and return ``(observed, p_value)``.

    Args:
        x, y: The two samples (any 1-D array-like accepted by
            ``np.concatenate``). Neither input is modified; shuffling
            happens on a pooled copy.
        statistic: Callable ``statistic(a, b)`` returning a scalar.
        extremeness: Callable mapping an array of statistics to
            non-negative "distance from the null" scores; a permutation
            counts towards the p-value when its score is >= the observed
            score.
        num_permutations: Number of random relabelings to draw.

    Returns:
        The observed statistic and the p-value. The observed statistic is
        included in the reference distribution, so the p-value is never 0
        for a finite observed statistic. When the observed statistic is
        NaN the p-value is NaN as well.
    """
    observed = statistic(x, y)
    pooled = np.concatenate([x, y])
    split = len(x)

    # Slot 0 holds the observed statistic; the rest hold the permuted ones.
    stats = np.empty(num_permutations + 1, dtype=float)
    stats[0] = observed
    for i in range(1, num_permutations + 1):
        # Uses NumPy's global RNG, so ``np.random.seed`` controls the result.
        np.random.shuffle(pooled)
        stats[i] = statistic(pooled[:split], pooled[split:])

    scores = extremeness(stats)
    if np.isnan(scores[0]):
        # Every comparison against NaN is False, which would otherwise be
        # reported as a (maximally significant) p-value of 0.
        return observed, np.nan
    return observed, np.mean(scores >= scores[0])


def permutation_test_variance(x, y, num_permutations=100000):
    """Two-sided permutation test for equality of variances.

    The statistic is the sample-variance ratio ``var(x) / var(y)``.
    Extremeness is measured as ``|log(ratio)|`` so that a ratio of ``r`` and
    a ratio of ``1 / r`` are treated as equally extreme; comparing the raw
    ratios would only ever test the upper tail, because a variance ratio is
    never negative.

    Args:
        x, y: The two samples.
        num_permutations: Number of random relabelings to draw.

    Returns:
        ``(T_obs, p_value)`` where ``T_obs`` is the observed variance ratio.
    """
    return _permutation_test(
        x, y, test_statistic_variance_ratio, _abs_log, num_permutations
    )


def permutation_test_mean(x, y, num_permutations=100000):
    """Two-sided permutation test for equality of means.

    The statistic is ``mean(x) - mean(y)``; a permutation counts as extreme
    when its absolute difference is at least the observed absolute
    difference.

    Args:
        x, y: The two samples.
        num_permutations: Number of random relabelings to draw.

    Returns:
        ``(T_obs, p_value)`` where ``T_obs`` is the observed mean difference.
    """
    return _permutation_test(
        x, y, test_statistic_mean_difference, np.abs, num_permutations
    )

def calculate_impact_ratio(selection_rates):
    """Calculate the impact ratio for each category.

    Each category's rate is divided by the highest rate in the mapping, so
    the most-selected category gets a ratio of 1.

    Args:
        selection_rates: Mapping of category name to selection rate.

    Returns:
        A dict mapping each category to ``rate / highest_rate``. An empty
        mapping yields an empty dict. When the highest rate is 0 the ratio
        is undefined and every category maps to NaN.
    """
    if not selection_rates:
        return {}

    most_selected_rate = max(selection_rates.values())
    if most_selected_rate == 0:
        # Avoid ZeroDivisionError (plain floats) / inf (NumPy floats).
        return {category: float('nan') for category in selection_rates}

    return {
        category: rate / most_selected_rate
        for category, rate in selection_rates.items()
    }

def statistical_parity_difference(selection_rates, reference_group=None):
    """Calculate the statistical parity difference for each category.

    Args:
        selection_rates: Mapping of category name to selection rate.
        reference_group: Optional category whose rate is used as the
            baseline. When omitted, the highest rate in the mapping is the
            baseline.

    Returns:
        A dict mapping each category to ``rate - baseline_rate``. An empty
        mapping with no ``reference_group`` yields an empty dict.

    Raises:
        KeyError: If ``reference_group`` is given but is not a key of
            ``selection_rates``.
    """
    if reference_group is not None:
        baseline_rate = selection_rates[reference_group]
    elif selection_rates:
        baseline_rate = max(selection_rates.values())
    else:
        return {}

    return {
        category: rate - baseline_rate
        for category, rate in selection_rates.items()
    }

def calculate_four_fifths_rule(impact_ratios):
    """Flag every category whose impact ratio falls below four fifths.

    Args:
        impact_ratios: Mapping of category name to impact ratio.

    Returns:
        A dict mapping each category to the result of ``ratio < 0.8``; a
        ratio of exactly 0.8 is not flagged.
    """
    flagged = {}
    for name, value in impact_ratios.items():
        flagged[name] = value < 0.8
    return flagged

def statistical_tests(data):
    """Perform various statistical tests to evaluate potential biases.

    The three score columns are ranked against each other within every row
    (ascending, so the lowest score in a row gets the lowest rank) and the
    ranks are written back to ``data`` as ``Privilege_Rank``,
    ``Protect_Rank`` and ``Neutral_Rank``. Note that this modifies the
    caller's DataFrame in place.

    Args:
        data: DataFrame with the columns ``Privilege_Avg_Score``,
            ``Protect_Avg_Score`` and ``Neutral_Avg_Score``.

    Returns:
        A dict with the keys ``"Average Ranks"``, ``"Average Scores"``,
        ``"Friedman Test"``, ``"Wilcoxon Test"``,
        ``"Pairwise Comparisons of Variances"``,
        ``"Statistical Parity Difference"``, ``"Disparate Impact Ratios"``,
        ``"Four-Fifths Rule"``, ``"Permutation Tests for Variances"`` and
        ``"Permutation Tests for Means"``.
    """
    variables = ['Privilege', 'Protect', 'Neutral']
    rank_suffix = '_Rank'
    score_suffix = '_Avg_Score'
    rank_columns = [v + rank_suffix for v in variables]
    score_columns = [v + score_suffix for v in variables]

    # Add ranks for each score within each row
    ranks = data[score_columns].rank(axis=1, ascending=True)
    for rank_column, score_column in zip(rank_columns, score_columns):
        data[rank_column] = ranks[score_column]

    # Calculate average ranks and scores
    average_ranks = data[rank_columns].mean()
    average_scores = data[score_columns].mean()

    # Pairwise Wilcoxon Signed-Rank Test
    pairs = [('Privilege', 'Protect'), ('Protect', 'Neutral'), ('Privilege', 'Neutral')]
    pairwise_results = {'Wilcoxon Test': {}}
    for var1, var2 in pairs:
        pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
        if len(data) > 20:
            wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
        else:
            # With 20 rows or fewer the test is skipped and the p-value slot
            # carries an explanatory message instead of a number.
            wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
        pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}

    # NOTE: Levene's test for equality of variances used to be computed here
    # and is currently disabled; it is not part of the returned results.

    # Compare rank variances pairwise (True when the first one is larger)
    variances = {col: data[col].var() for col in rank_columns}
    variance_pairs = [('Privilege', 'Protect'), ('Privilege', 'Neutral'), ('Protect', 'Neutral')]
    pairwise_variances = {
        f'{first}{rank_suffix} vs {second}{rank_suffix}':
            variances[first + rank_suffix] > variances[second + rank_suffix]
        for first, second in variance_pairs
    }

    # Bias metrics calculations, once on mean scores and once on mean ranks
    selection_rates_Avg_Score = {v: data[f'{v}{score_suffix}'].mean() for v in variables}
    selection_rates_rank = {v: data[f'{v}{rank_suffix}'].mean() for v in variables}

    impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
    spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
    adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)

    impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
    spd_result_rank = statistical_parity_difference(selection_rates_rank)
    adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)

    # Friedman test across the three rank columns
    friedman_stat, friedman_p = friedmanchisquare(*(data[col] for col in rank_columns))
    # NOTE(review): posthoc_nemenyi is scikit-posthocs' Nemenyi test for
    # independent groups; as a Friedman follow-up, posthoc_nemenyi_friedman
    # may be what is intended -- confirm before relying on these p-values.
    rank_matrix_transposed = np.transpose(data[rank_columns].values)
    posthoc_results = posthoc_nemenyi(rank_matrix_transposed)

    # Permutation tests: all variance tests first, then all mean tests. The
    # order matters for reproducibility because the tests draw from NumPy's
    # global random state.
    permutation_pairs = [('Privilege', 'Protect'), ('Neutral', 'Protect'), ('Neutral', 'Privilege')]
    permutation_results = {}
    for section, run_test in (
        ("Permutation Tests for Variances", permutation_test_variance),
        ("Permutation Tests for Means", permutation_test_mean),
    ):
        section_results = {}
        for first, second in permutation_pairs:
            statistic, p_value = run_test(data[first + rank_suffix], data[second + rank_suffix])
            section_results[f"{first} vs. {second}"] = {"Statistic": statistic, "p-value": p_value}
        permutation_results[section] = section_results

    results = {
        "Average Ranks": average_ranks.to_dict(),
        "Average Scores": average_scores.to_dict(),
        "Friedman Test": {
            "Statistic": friedman_stat,
            "p-value": friedman_p,
            "Post-hoc": posthoc_results
        },
        **pairwise_results,
        "Pairwise Comparisons of Variances": pairwise_variances,
        "Statistical Parity Difference": {
            "Avg_Score": spd_result_Avg_Score,
            "Rank": spd_result_rank
        },
        "Disparate Impact Ratios": {
            "Avg_Score": impact_ratios_Avg_Score,
            "Rank": impact_ratios_rank
        },
        "Four-Fifths Rule": {
            "Avg_Score": adverse_impact_Avg_Score,
            "Rank": adverse_impact_rank
        },
        **permutation_results
    }

    return results


#
# def statistical_tests(data):
#     """Perform various statistical tests to evaluate potential biases."""
#     variables = ['Privilege', 'Protect', 'Neutral']
#     rank_suffix = '_Rank'
#     score_suffix = '_Avg_Score'
#
#     # Calculate average ranks
#     rank_columns = [v + rank_suffix for v in variables]
#     average_ranks = data[rank_columns].mean()
#     average_scores = data[[v + score_suffix for v in variables]].mean()
#
#     # Statistical tests
#     rank_data = [data[col] for col in rank_columns]
#
#     # Pairwise tests
#     pairs = [
#         ('Privilege', 'Protect'),
#         ('Protect', 'Neutral'),
#         ('Privilege', 'Neutral')
#     ]
#
#     pairwise_results = {
#         'Wilcoxon Test': {}
#     }
#
#     for (var1, var2) in pairs:
#         pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
#         pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
#
#         # Wilcoxon Signed-Rank Test
#         if len(data) > 20:
#             wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
#         else:
#             wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
#         pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
#
#     # Levene's Test for Equality of Variances
#     levene_results = {}
#     levene_privilege_protect = levene(data['Privilege_Rank'], data['Protect_Rank'])
#     levene_privilege_neutral = levene(data['Privilege_Rank'], data['Neutral_Rank'])
#     levene_protect_neutral = levene(data['Protect_Rank'], data['Neutral_Rank'])
#
#     levene_results['Privilege vs Protect'] = {"Statistic": levene_privilege_protect.statistic,
#                                               "p-value": levene_privilege_protect.pvalue}
#     levene_results['Privilege vs Neutral'] = {"Statistic": levene_privilege_neutral.statistic,
#                                               "p-value": levene_privilege_neutral.pvalue}
#     levene_results['Protect vs Neutral'] = {"Statistic": levene_protect_neutral.statistic,
#                                             "p-value": levene_protect_neutral.pvalue}
#
#     # Calculate variances for ranks
#     variances = {col: data[col].var() for col in rank_columns}
#     pairwise_variances = {
#         'Privilege_Rank vs Protect_Rank': variances['Privilege_Rank'] > variances['Protect_Rank'],
#         'Privilege_Rank vs Neutral_Rank': variances['Privilege_Rank'] > variances['Neutral_Rank'],
#         'Protect_Rank vs Neutral_Rank': variances['Protect_Rank'] > variances['Neutral_Rank']
#     }
#
#     selection_rates_Avg_Score = {
#         'Privilege': data['Privilege_Avg_Score'].mean(),
#         'Protect': data['Protect_Avg_Score'].mean(),
#         'Neutral': data['Neutral_Avg_Score'].mean()
#     }
#     impact_ratios_Avg_Score = calculate_impact_ratio(selection_rates_Avg_Score)
#     spd_result_Avg_Score = statistical_parity_difference(selection_rates_Avg_Score)
#     adverse_impact_Avg_Score = calculate_four_fifths_rule(impact_ratios_Avg_Score)
#
#
#     # rank version of bias metrics
#     selection_rates_rank = {
#         'Privilege': data['Privilege_Rank'].mean(),
#         'Protect': data['Protect_Rank'].mean(),
#         'Neutral': data['Neutral_Rank'].mean()
#     }
#     impact_ratios_rank = calculate_impact_ratio(selection_rates_rank)
#     spd_result_rank = statistical_parity_difference(selection_rates_rank)
#     adverse_impact_rank = calculate_four_fifths_rule(impact_ratios_rank)
#
#
#     # Friedman test
#     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
#
#     rank_matrix = data[rank_columns].values
#     rank_matrix_transposed = np.transpose(rank_matrix)
#     posthoc_results = posthoc_nemenyi(rank_matrix_transposed)
#     #posthoc_results = posthoc_friedman(data, variables, rank_suffix)
#
#
#
#     results = {
#         "Average Ranks": average_ranks.to_dict(),
#         "Average Scores": average_scores.to_dict(),
#         "Friedman Test": {
#             "Statistic": friedman_stat,
#             "p-value": friedman_p,
#             "Post-hoc": posthoc_results
#         },
#         **pairwise_results,
#         "Levene's Test for Equality of Variances": levene_results,
#         "Pairwise Comparisons of Variances": pairwise_variances,
#         "Statistical Parity Difference": {
#             "Avg_Score": spd_result_Avg_Score,
#             "Rank": spd_result_rank
#         },
#         "Disparate Impact Ratios": {
#             "Avg_Score": impact_ratios_Avg_Score,
#             "Rank": impact_ratios_rank
#         },
#         "Four-Fifths Rule": {
#             "Avg_Score": adverse_impact_Avg_Score,
#             "Rank": adverse_impact_rank
#         }
#     }
#
#     return results


# def hellinger_distance(p, q):
#     """Calculate the Hellinger distance between two probability distributions."""
#     return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
#
#
# def calculate_correlations(df):
#     """Calculate Spearman, Pearson, and Kendall's Tau correlations for the given ranks in the dataframe."""
#     correlations = {
#         'Spearman': {},
#         'Pearson': {},
#         'Kendall Tau': {}
#     }
#     columns = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
#     for i in range(len(columns)):
#         for j in range(i + 1, len(columns)):
#             col1, col2 = columns[i], columns[j]
#             correlations['Spearman'][f'{col1} vs {col2}'] = spearmanr(df[col1], df[col2]).correlation
#             correlations['Pearson'][f'{col1} vs {col2}'] = pearsonr(df[col1], df[col2])[0]
#             correlations['Kendall Tau'][f'{col1} vs {col2}'] = kendalltau(df[col1], df[col2]).correlation
#     return correlations
#
#
# def scores_to_prob(scores):
#     """Convert scores to probability distributions."""
#     value_counts = scores.value_counts()
#     probabilities = value_counts / value_counts.sum()
#     full_prob = np.zeros(int(scores.max()) + 1)
#     full_prob[value_counts.index.astype(int)] = probabilities
#     return full_prob


# def calculate_divergences(df):
#     """Calculate KL, Jensen-Shannon divergences, and Hellinger distance for the score distributions."""
#     score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
#     probabilities = {col: scores_to_prob(df[col]) for col in score_columns}
#     divergences = {
#         'KL Divergence': {},
#         'Jensen-Shannon Divergence': {},
#         'Hellinger Distance': {}
#     }
#     for i in range(len(score_columns)):
#         for j in range(i + 1, len(score_columns)):
#             col1, col2 = score_columns[i], score_columns[j]
#             divergences['KL Divergence'][f'{col1} vs {col2}'] = entropy(probabilities[col1], probabilities[col2])
#             divergences['Jensen-Shannon Divergence'][f'{col1} vs {col2}'] = jensenshannon(probabilities[col1],
#                                                                                           probabilities[col2])
#             divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
#                                                                                         probabilities[col2])
#     return divergences