import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon
from scipy.stats import (
    entropy, f_oneway, friedmanchisquare, kendalltau, kruskal, levene,
    mannwhitneyu, pearsonr, spearmanr, ttest_ind, wilcoxon,
)
from statsmodels.stats.multicomp import MultiComparison


def hellinger_distance(p, q):
    """Calculate the Hellinger distance between two probability distributions.

    Both inputs must be probability vectors over the same support
    (equal length, each summing to 1).
    """
    return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
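
# Worked example (illustrative): for p = [1.0, 0.0] and q = [0.5, 0.5],
# hellinger_distance(np.array([1.0, 0.0]), np.array([0.5, 0.5])) ≈ 0.5412.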


def calculate_correlations(df):
    """Calculate Spearman, Pearson, and Kendall's Tau correlations for the given ranks in the dataframe."""
    correlations = {
        'Spearman': {},
        'Pearson': {},
        'Kendall Tau': {}
    }
    columns = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            col1, col2 = columns[i], columns[j]
            correlations['Spearman'][f'{col1} vs {col2}'] = spearmanr(df[col1], df[col2]).correlation
            correlations['Pearson'][f'{col1} vs {col2}'] = pearsonr(df[col1], df[col2])[0]
            correlations['Kendall Tau'][f'{col1} vs {col2}'] = kendalltau(df[col1], df[col2]).correlation
    return correlations
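
# A minimal usage sketch (synthetic ranks, illustrative only; real input
# must carry the three *_Rank columns used above):
#
#     df = pd.DataFrame({
#         'Privilege_Rank': [1, 2, 3, 1, 2],
#         'Protect_Rank': [3, 1, 2, 2, 3],
#         'Neutral_Rank': [2, 3, 1, 3, 1],
#     })
#     print(calculate_correlations(df)['Spearman'])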


def scores_to_prob(scores, length=None):
    """Convert integer-valued scores to a probability distribution.

    ``length`` fixes the size of the returned vector so that distributions
    built from different columns share a common support; it defaults to
    ``scores.max() + 1``.
    """
    value_counts = scores.value_counts()
    probabilities = value_counts / value_counts.sum()
    if length is None:
        length = int(scores.max()) + 1
    full_prob = np.zeros(length)
    full_prob[value_counts.index.astype(int)] = probabilities
    return full_prob
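
# Example (illustrative): scores_to_prob(pd.Series([1, 1, 2, 3])) returns
# array([0., 0.5, 0.25, 0.25]); index i holds the probability of score i.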


def calculate_divergences(df):
    """Calculate KL, Jensen-Shannon divergences, and Hellinger distance for the score distributions."""
    score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
    # Build all distributions over a common support so the pairwise
    # metrics below compare vectors of equal length.
    length = int(max(df[col].max() for col in score_columns)) + 1
    probabilities = {col: scores_to_prob(df[col], length) for col in score_columns}
    divergences = {
        'KL Divergence': {},
        'Jensen-Shannon Divergence': {},
        'Hellinger Distance': {}
    }
    for i in range(len(score_columns)):
        for j in range(i + 1, len(score_columns)):
            col1, col2 = score_columns[i], score_columns[j]
            # entropy(p, q) computes KL(p || q); it is infinite wherever q
            # assigns zero probability to an outcome that p supports.
            divergences['KL Divergence'][f'{col1} vs {col2}'] = entropy(probabilities[col1], probabilities[col2])
            divergences['Jensen-Shannon Divergence'][f'{col1} vs {col2}'] = jensenshannon(probabilities[col1],
                                                                                          probabilities[col2])
            divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
                                                                                        probabilities[col2])
    return divergences
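
# A minimal usage sketch for the divergence metrics (synthetic data,
# integer-valued scores assumed; illustrative only):
#
#     df = pd.DataFrame({
#         'Privilege_Avg_Score': [3, 4, 4, 5, 3],
#         'Protect_Avg_Score': [2, 3, 3, 4, 2],
#         'Neutral_Avg_Score': [3, 3, 4, 4, 3],
#     })
#     print(calculate_divergences(df)['Hellinger Distance'])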



def statistical_tests(data, test_type='multiple'):
    """Run rank- and score-based statistical tests on the given dataframe."""
    if test_type == 'multiple':
        variables = ['Privilege', 'Protect', 'Neutral']
    elif test_type == 'single':
        variables = ['Counterfactual', 'Neutral']
    else:
        raise ValueError("test_type must be either 'multiple' or 'single'")
    rank_suffix = '_Rank'
    score_suffix = '_Avg_Score'

    # Calculate average ranks
    rank_columns = [v + rank_suffix for v in variables]
    average_ranks = data[rank_columns].mean()

    # Statistical tests on the rank columns
    rank_data = [data[col] for col in rank_columns]
    kw_stat, kw_p = kruskal(*rank_data)
    # Mann-Whitney U compares only the first two groups.
    mw_stat, mw_p = mannwhitneyu(*rank_data[:2])

    # Wilcoxon Signed-Rank Test between the first pair of rank columns;
    # skipped when the paired sample is too small.
    if len(data) > 20:
        p_value_wilcoxon = wilcoxon(data[variables[0] + rank_suffix], data[variables[1] + rank_suffix]).pvalue
    else:
        p_value_wilcoxon = "Sample size too small for Wilcoxon test."

    # Levene's Test for equality of variances (first pair of score columns)
    score_columns = [v + score_suffix for v in variables]
    levene_stat, levene_p = levene(data[variables[0] + score_suffix], data[variables[1] + score_suffix])

    # Independent-samples t-test between the first pair; Welch's correction
    # is applied when Levene's test rejects equal variances.
    t_stat, t_p = ttest_ind(data[variables[0] + score_suffix], data[variables[1] + score_suffix], equal_var=(levene_p > 0.05))

    # ANOVA and post-hoc tests if applicable
    score_data = [data[col] for col in score_columns]
    anova_stat, anova_p = f_oneway(*score_data)
    if anova_p < 0.05:
        mc = MultiComparison(pd.concat(score_data), np.repeat(variables, len(data)))
        tukey_result = mc.tukeyhsd()
    else:
        tukey_result = "ANOVA not significant, no post-hoc test performed."

    # Friedman test requires at least three related groups, so it only
    # applies in the 'multiple' setting; compute it once and reuse.
    if test_type == 'multiple':
        friedman = friedmanchisquare(*rank_data)
        friedman_stat, friedman_p = friedman.statistic, friedman.pvalue
    else:
        friedman_stat, friedman_p = np.nan, np.nan

    results = {
        "Average Ranks": average_ranks,
        "Friedman Test": {"Statistic": friedman_stat, "p-value": friedman_p},
        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
        "Wilcoxon Test Between Pairs": p_value_wilcoxon,
        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
        "Tukey HSD Test": tukey_result
    }

    return results
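
# A minimal usage sketch (see the runnable demo at the bottom of this file):
#
#     results = statistical_tests(df, test_type='multiple')
#     print(results['Kruskal-Wallis Test'])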


def result_evaluation(test_results, test_type='multiple'):
    """Translate the raw test results into human-readable verdicts."""
    evaluation = {}
    if test_type == 'multiple':
        variables = ['Privilege', 'Protect', 'Neutral']
    elif test_type == 'single':
        variables = ['Counterfactual', 'Neutral']
    else:
        raise ValueError("test_type must be either 'multiple' or 'single'")

    # Format average ranks and rank analysis
    rank_format = ", ".join(f"{v}: {test_results['Average Ranks'][v + '_Rank']:.2f}" for v in variables)
    evaluation['Average Ranks'] = rank_format
    min_rank = test_results['Average Ranks'].idxmin()
    max_rank = test_results['Average Ranks'].idxmax()
    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
    evaluation['Rank Analysis'] = rank_analysis

    # Statistical tests evaluation
    for test_name, result in test_results.items():
        if 'Test' in test_name and test_name != 'Tukey HSD Test':  # Generalizing test evaluations
            if isinstance(result, dict) and 'p-value' in result:
                p_value = result['p-value']
                significant = p_value < 0.05
                test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
                evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
            else:
                evaluation[test_name] = "Test result format error or incomplete data."

    # Special case evaluations
    if 'Wilcoxon Test Between Pairs' in test_results:
        wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
        if isinstance(wilcoxon_result, float):
            evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result:.5f}), indicating bias." if wilcoxon_result < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
        else:
            evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result  # Presuming it's an error message or non-numeric value

    # ANOVA and Tukey HSD tests
    if test_type == 'multiple':
        anova_p = test_results['ANOVA Test'].get('p-value', 1)  # Default to 1 if p-value is missing
        if anova_p >= 0.05:
            evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required."
        else:
            evaluation['ANOVA Test'] = f"Significant differences among groups (p = {anova_p:.5f}); see the Tukey HSD results for pairwise comparisons."
        evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')

    return evaluation
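

if __name__ == '__main__':
    # Smoke-test demo on synthetic data (illustrative only; real input
    # should carry the *_Rank and *_Avg_Score columns described above).
    rng = np.random.default_rng(42)
    n = 30
    ranks = np.array([rng.permutation([1, 2, 3]) for _ in range(n)])
    demo = pd.DataFrame({
        'Privilege_Rank': ranks[:, 0],
        'Protect_Rank': ranks[:, 1],
        'Neutral_Rank': ranks[:, 2],
        'Privilege_Avg_Score': rng.integers(1, 6, size=n),
        'Protect_Avg_Score': rng.integers(1, 6, size=n),
        'Neutral_Avg_Score': rng.integers(1, 6, size=n),
    })
    print(calculate_correlations(demo))
    print(calculate_divergences(demo))
    results = statistical_tests(demo, test_type='multiple')
    for name, verdict in result_evaluation(results, test_type='multiple').items():
        print(f"{name}: {verdict}")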