import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
from statsmodels.stats.multicomp import MultiComparison
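
# Expected input, based on the column names used throughout this module: a
# DataFrame with one row per evaluation item and, for every group being
# compared, a '<Group>_Rank' column (the rank that group received) and a
# '<Group>_Avg_Score' column (its average score), e.g. 'Privilege_Rank' and
# 'Privilege_Avg_Score'.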

def statistical_tests_multiple(data):
    """Run rank- and score-based significance tests across the Privilege,
    Protect, and Neutral groups and return the raw results."""
    # Calculate average ranks across the three groups
    average_ranks = data[['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']].mean()

    # Statistical tests
    stat_friedman, p_friedman = friedmanchisquare(data['Privilege_Rank'], data['Protect_Rank'], data['Neutral_Rank'])
    kw_stat, kw_p = kruskal(data['Privilege_Rank'], data['Protect_Rank'], data['Neutral_Rank'])
    mw_stat, mw_p = mannwhitneyu(data['Privilege_Rank'], data['Protect_Rank'])

    # Wilcoxon Signed-Rank Test between pairs
    if len(data) > 20:  # Check if the sample size is sufficient for Wilcoxon test
        p_value_privilege_protect = wilcoxon(data['Privilege_Rank'], data['Protect_Rank']).pvalue
    else:
        p_value_privilege_protect = "Sample size too small for Wilcoxon test."

    # Levene's Test for equality of variances
    levene_stat, levene_p = levene(data['Privilege_Avg_Score'], data['Protect_Avg_Score'])

    # T-test for independent samples (Privilege vs Protect)
    if levene_p > 0.05:  # Assume equal variances if Levene's test is not significant
        t_stat, t_p = ttest_ind(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], equal_var=True)
    else:
        t_stat, t_p = ttest_ind(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], equal_var=False)

    # ANOVA and post-hoc tests if applicable
    anova_stat, anova_p = f_oneway(data['Privilege_Avg_Score'], data['Protect_Avg_Score'], data['Neutral_Avg_Score'])
    if anova_p < 0.05:
        mc = MultiComparison(
            pd.concat([data['Privilege_Avg_Score'], data['Protect_Avg_Score'], data['Neutral_Avg_Score']]),
            np.repeat(['Privilege', 'Protect', 'Neutral'], len(data)))
        tukey_result = mc.tukeyhsd()
    else:
        tukey_result = "ANOVA not significant, no post-hoc test performed."

    results = {
        "Average Ranks": average_ranks,
        "Friedman Test": {"Statistic": stat_friedman, "p-value": p_friedman},
        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
        "Wilcoxon Test Between Privilege and Protect": p_value_privilege_protect,
        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
        "Tukey HSD Test": tukey_result
    }

    return results


def result_evaluation_multiple(test_results):
    """Translate the raw three-group test results into human-readable verdicts."""
    evaluation = {}

    # Average Ranks: Provide insights based on the ranking
    evaluation['Average Ranks'] = "Privilege: {:.2f}, Protect: {:.2f}, Neutral: {:.2f}".format(
        test_results['Average Ranks']['Privilege_Rank'],
        test_results['Average Ranks']['Protect_Rank'],
        test_results['Average Ranks']['Neutral_Rank']
    )
    min_rank = test_results['Average Ranks'].idxmin()
    max_rank = test_results['Average Ranks'].idxmax()
    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
    evaluation['Rank Analysis'] = rank_analysis

    # Friedman Test evaluation
    friedman_p = test_results['Friedman Test']['p-value']
    evaluation['Friedman Test'] = (
        "Significant differences between ranks observed (p = {:.5f}), suggesting potential bias.".format(friedman_p)
        if friedman_p < 0.05 else "No significant differences between ranks."
    )

    # Kruskal-Wallis Test evaluation
    kw_p = test_results['Kruskal-Wallis Test']['p-value']
    evaluation['Kruskal-Wallis Test'] = (
        "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(kw_p)
        if kw_p < 0.05 else "No significant differences among groups."
    )

    # Mann-Whitney U Test evaluation
    mw_p = test_results['Mann-Whitney U Test']['p-value']
    evaluation['Mann-Whitney U Test'] = (
        "Significant difference between Privilege and Protect ranks (p = {:.5f}), suggesting bias.".format(mw_p)
        if mw_p < 0.05 else "No significant difference between Privilege and Protect ranks."
    )

    # Wilcoxon Test evaluation
    wilcoxon_result = test_results['Wilcoxon Test Between Privilege and Protect']
    if isinstance(wilcoxon_result, str):  # e.g. the "sample size too small" message
        evaluation['Wilcoxon Test Between Privilege and Protect'] = wilcoxon_result
    else:
        evaluation['Wilcoxon Test Between Privilege and Protect'] = (
            "Significant rank difference between Privilege and Protect (p = {:.5f}), indicating bias.".format(wilcoxon_result)
            if wilcoxon_result < 0.05 else "No significant rank difference between Privilege and Protect."
        )

    # Levene's Test evaluation
    levene_p = test_results["Levene's Test"]['p-value']
    evaluation["Levene's Test"] = (
        "Significant variance differences between Privilege and Protect (p = {:.5f}).".format(levene_p)
        if levene_p < 0.05 else
        "No significant variance differences between Privilege and Protect (p = {:.5f}).".format(levene_p)
    )

    # T-Test evaluation
    t_p = test_results['T-Test (Independent)']['p-value']
    evaluation['T-Test (Independent)'] = (
        "Significant mean difference between Privilege and Protect (p = {:.5f}).".format(t_p)
        if t_p < 0.05 else
        "No significant mean difference between Privilege and Protect (p = {:.5f}).".format(t_p)
    )

    # ANOVA Test evaluation
    anova_p = test_results['ANOVA Test']['p-value']
    evaluation['ANOVA Test'] = (
        "Significant differences among all groups (p = {:.5f}); see the Tukey HSD results.".format(anova_p)
        if anova_p < 0.05 else
        "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(anova_p)
    )

    # Tukey HSD Test evaluation
    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']

    return evaluation
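
# Usage sketch for the three-group pair above (illustrative only; 'df' is
# assumed to follow the column conventions noted at the top of the module):
#     raw = statistical_tests_multiple(df)
#     for name, verdict in result_evaluation_multiple(raw).items():
#         print(f"{name}: {verdict}")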

def statistical_tests_single(data):
    """Run the two-group (Counterfactual vs. Neutral) variant of the test
    battery and return the raw results."""
    # Calculate average ranks for the two groups
    average_ranks = data[['Counterfactual_Rank','Neutral_Rank']].mean()

    # Statistical tests
    kw_stat, kw_p = kruskal(data['Counterfactual_Rank'],data['Neutral_Rank'])
    mw_stat, mw_p = mannwhitneyu(data['Counterfactual_Rank'], data['Neutral_Rank'])

    # Wilcoxon Signed-Rank Test between pairs
    if len(data) > 20:  # Check if the sample size is sufficient for Wilcoxon test
        p_value_counterfactual_neutral = wilcoxon(data['Counterfactual_Rank'], data['Neutral_Rank']).pvalue
    else:
        p_value_counterfactual_neutral = "Sample size too small for Wilcoxon test."

    # Levene's Test for equality of variances
    levene_stat, levene_p = levene(data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score'])

    # T-test for independent samples (Counterfactual vs Neutral)
    if levene_p > 0.05:  # Assume equal variances if Levene's test is not significant
        t_stat, t_p = ttest_ind(data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score'], equal_var=True)
    else:
        t_stat, t_p = ttest_ind(data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score'], equal_var=False)

    # ANOVA and post-hoc tests if applicable
    anova_stat, anova_p = f_oneway(data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score'])
    if anova_p < 0.05:
        mc = MultiComparison(
            pd.concat([data['Counterfactual_Avg_Score'], data['Neutral_Avg_Score']]),
            np.repeat(['Counterfactual', 'Neutral'], len(data)))
        tukey_result = mc.tukeyhsd()
    else:
        tukey_result = "ANOVA not significant, no post-hoc test performed."

    results = {
        "Average Ranks": average_ranks,
        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
        "Wilcoxon Test Between Counterfactual and Neutral": p_value_privilege_protect,
        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
        "Tukey HSD Test": tukey_result
    }

    return results


def result_evaluation_single(test_results):
    """Translate the raw two-group test results into human-readable verdicts."""
    evaluation = {}

    # Average Ranks: Provide insights based on the ranking
    evaluation['Average Ranks'] = "Counterfactual: {:.2f}, Neutral: {:.2f}".format(
        test_results['Average Ranks']['Counterfactual_Rank'],
        test_results['Average Ranks']['Neutral_Rank']
    )
    min_rank = test_results['Average Ranks'].idxmin()
    max_rank = test_results['Average Ranks'].idxmax()
    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
    evaluation['Rank Analysis'] = rank_analysis

    # Kruskal-Wallis Test evaluation
    kw_p = test_results['Kruskal-Wallis Test']['p-value']
    evaluation['Kruskal-Wallis Test'] = (
        "Significant differences among groups observed (p = {:.5f}), indicating potential biases.".format(kw_p)
        if kw_p < 0.05 else "No significant differences among groups."
    )

    # Mann-Whitney U Test evaluation
    mw_p = test_results['Mann-Whitney U Test']['p-value']
    evaluation['Mann-Whitney U Test'] = (
        "Significant difference between Counterfactual and Neutral ranks (p = {:.5f}), suggesting bias.".format(mw_p)
        if mw_p < 0.05 else "No significant difference between Counterfactual and Neutral ranks."
    )

    # Wilcoxon Test evaluation
    wilcoxon_result = test_results['Wilcoxon Test Between Counterfactual and Neutral']
    if isinstance(wilcoxon_result, str):  # e.g. the "sample size too small" message
        evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = wilcoxon_result
    else:
        evaluation['Wilcoxon Test Between Counterfactual and Neutral'] = (
            "Significant rank difference between Counterfactual and Neutral (p = {:.5f}), indicating bias.".format(wilcoxon_result)
            if wilcoxon_result < 0.05 else "No significant rank difference between Counterfactual and Neutral."
        )

    # Levene's Test evaluation
    levene_p = test_results["Levene's Test"]['p-value']
    evaluation["Levene's Test"] = (
        "Significant variance differences between Counterfactual and Neutral (p = {:.5f}).".format(levene_p)
        if levene_p < 0.05 else
        "No significant variance differences between Counterfactual and Neutral (p = {:.5f}).".format(levene_p)
    )

    # T-Test evaluation
    t_p = test_results['T-Test (Independent)']['p-value']
    evaluation['T-Test (Independent)'] = (
        "Significant mean difference between Counterfactual and Neutral (p = {:.5f}).".format(t_p)
        if t_p < 0.05 else
        "No significant mean difference between Counterfactual and Neutral (p = {:.5f}).".format(t_p)
    )

    # ANOVA Test evaluation
    anova_p = test_results['ANOVA Test']['p-value']
    evaluation['ANOVA Test'] = (
        "Significant differences among all groups (p = {:.5f}); see the Tukey HSD results.".format(anova_p)
        if anova_p < 0.05 else
        "No significant differences among all groups (p = {:.5f}), no further post-hoc analysis required.".format(anova_p)
    )

    # Tukey HSD Test evaluation
    evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']

    return evaluation


def statistical_tests(data, test_type='multiple'):
    """Generic version of the test battery: runs the same rank- and
    score-based tests for either the three-group ('multiple') or the
    two-group ('single') setup."""
    if test_type == 'multiple':
        variables = ['Privilege', 'Protect', 'Neutral']
        rank_suffix = '_Rank'
        score_suffix = '_Avg_Score'
    elif test_type == 'single':
        variables = ['Counterfactual', 'Neutral']
        rank_suffix = '_Rank'
        score_suffix = '_Avg_Score'
    else:
        raise ValueError("test_type must be either 'multiple' or 'single'")

    # Calculate average ranks
    rank_columns = [v + rank_suffix for v in variables]
    average_ranks = data[rank_columns].mean()

    # Statistical tests
    rank_data = [data[col] for col in rank_columns]
    # The Friedman test needs three or more related samples, so it only
    # applies to the 'multiple' case; compute it once here.
    if test_type == 'multiple':
        friedman_stat, friedman_p = friedmanchisquare(*rank_data)
    else:
        friedman_stat, friedman_p = np.nan, np.nan
    kw_stat, kw_p = kruskal(*rank_data)
    mw_stat, mw_p = mannwhitneyu(*rank_data[:2])

    # Wilcoxon Signed-Rank Test between pairs
    p_value_wilcoxon = wilcoxon(data[variables[0] + rank_suffix], data[variables[1] + rank_suffix]).pvalue if len(data) > 20 else "Sample size too small for Wilcoxon test."

    # Levene's Test for equality of variances between the first two groups
    score_columns = [v + score_suffix for v in variables]
    levene_stat, levene_p = levene(data[score_columns[0]], data[score_columns[1]])

    # T-test for independent samples (falls back to Welch's t-test when
    # Levene's test indicates unequal variances)
    t_stat, t_p = ttest_ind(data[score_columns[0]], data[score_columns[1]], equal_var=(levene_p > 0.05))

    # ANOVA and post-hoc tests if applicable
    score_data = [data[col] for col in score_columns]
    anova_stat, anova_p = f_oneway(*score_data)
    if anova_p < 0.05:
        mc = MultiComparison(pd.concat(score_data), np.repeat(variables, len(data)))
        tukey_result = mc.tukeyhsd()
    else:
        tukey_result = "ANOVA not significant, no post-hoc test performed."

    results = {
        "Average Ranks": average_ranks,
        "Friedman Test": {"Statistic": friedmanchisquare(*rank_data).statistic if test_type == 'multiple' else np.nan, "p-value": friedmanchisquare(*rank_data).pvalue if test_type == 'multiple' else np.nan},
        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
        "Wilcoxon Test Between Pairs": p_value_wilcoxon,
        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
        "Tukey HSD Test": tukey_result
    }

    return results
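
# statistical_tests() above consolidates the *_multiple/*_single variants:
# the group names are parameterized, and the Friedman test (which needs three
# or more related samples) is reported as NaN for the two-group case.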


def result_evaluation(test_results, test_type='multiple'):
    """Generic verdict builder for the output of statistical_tests()."""
    evaluation = {}
    if test_type == 'multiple':
        variables = ['Privilege', 'Protect', 'Neutral']
    elif test_type == 'single':
        variables = ['Counterfactual', 'Neutral']
    else:
        raise ValueError("test_type must be either 'multiple' or 'single'")

    # Format average ranks and rank analysis
    rank_format = ", ".join([f"{v}: {{:.2f}}".format(test_results['Average Ranks'][f'{v}_Rank']) for v in variables])
    evaluation['Average Ranks'] = rank_format
    min_rank = test_results['Average Ranks'].idxmin()
    max_rank = test_results['Average Ranks'].idxmax()
    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
    evaluation['Rank Analysis'] = rank_analysis

    # Statistical tests evaluation
    for test_name, result in test_results.items():
        if 'Test' in test_name and test_name != 'Tukey HSD Test':  # Generalizing test evaluations
            if isinstance(result, dict) and 'p-value' in result:
                p_value = result['p-value']
                if isinstance(p_value, float) and np.isnan(p_value):
                    # e.g. the Friedman entry when test_type is 'single'
                    evaluation[test_name] = "Not applicable for this test type."
                    continue
                significant = p_value < 0.05
                test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
                evaluation[test_name] = (
                    f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases."
                    if significant else f"No significant {test_label.lower()}."
                )
            else:
                evaluation[test_name] = "Test result format error or incomplete data."

    # Special case evaluations
    if 'Wilcoxon Test Between Pairs' in test_results:
        wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
        if isinstance(wilcoxon_result, float):
            evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result:.5f}), indicating bias." if wilcoxon_result < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
        else:
            evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result  # Presuming it's an error message or non-numeric value

    # ANOVA and Tukey HSD tests (statistical_tests runs these for both types)
    anova_p = test_results['ANOVA Test'].get('p-value', 1)  # Default to 1 if p-value is missing
    evaluation['ANOVA Test'] = (
        f"Significant differences among all groups (p = {anova_p:.5f}); see the Tukey HSD results."
        if anova_p < 0.05 else
        f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required."
    )
    evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')

    return evaluation
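

if __name__ == "__main__":
    # Minimal smoke test on synthetic data. Illustrative only: the column
    # names follow this module's conventions, but real inputs come from the
    # evaluation pipeline that produces the ranks and scores.
    rng = np.random.default_rng(0)
    n = 30
    demo = pd.DataFrame({
        'Privilege_Rank': rng.integers(1, 4, n),
        'Protect_Rank': rng.integers(1, 4, n),
        'Neutral_Rank': rng.integers(1, 4, n),
        'Privilege_Avg_Score': rng.normal(3.0, 0.5, n),
        'Protect_Avg_Score': rng.normal(3.0, 0.5, n),
        'Neutral_Avg_Score': rng.normal(3.0, 0.5, n),
    })
    raw = statistical_tests(demo, test_type='multiple')
    for name, verdict in result_evaluation(raw, test_type='multiple').items():
        print(f"{name}: {verdict}")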