job-fair / util /evaluation.py
Zekun Wu
update
ae16dbc
raw
history blame
No virus
12 kB
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
from statsmodels.stats.multicomp import MultiComparison
from scipy.stats import spearmanr, pearsonr, kendalltau, entropy
from scipy.spatial.distance import jensenshannon
from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_1samp
from scikit_posthocs import posthoc_nemenyi
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
# """Perform a bootstrapped t-test."""
# observed_t_stat, _ = ttest_ind(data1, data2)
# combined = np.concatenate([data1, data2])
# t_stats = []
#
# for _ in range(num_bootstrap):
# np.random.shuffle(combined)
# new_data1 = combined[:len(data1)]
# new_data2 = combined[len(data1):]
# t_stat, _ = ttest_ind(new_data1, new_data2)
# t_stats.append(t_stat)
#
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
# return observed_t_stat, p_value
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
# """Perform a bootstrapped paired t-test for mean difference being zero."""
# # Calculate the observed differences between paired samples
# differences = data1 - data2
# # Compute the observed t-statistic for the differences
# observed_t_stat, _ = ttest_1samp(differences, 0)
#
# t_stats = []
#
# for _ in range(num_bootstrap):
# # Resample the differences with replacement
# resampled_diffs = np.random.choice(differences, size=len(differences), replace=True)
# # Perform a one-sample t-test on the resampled differences against zero
# t_stat, _ = ttest_1samp(resampled_diffs, 0)
# # Append the t-statistic to the list
# t_stats.append(t_stat)
#
# # Calculate the p-value as the proportion of bootstrap t-statistics
# # that are as extreme as or more extreme than the observed t-statistic
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
# return observed_t_stat, p_value
# def posthoc_friedman(data, variables, rank_suffix='_Rank'):
# """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
# ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
# num_subjects = ranked_data.shape[0]
# num_conditions = ranked_data.shape[1]
# comparisons = []
#
# for i in range(num_conditions):
# for j in range(i + 1, num_conditions):
# diff = ranked_data[:, i] - ranked_data[:, j]
# abs_diff = np.abs(diff)
# avg_diff = np.mean(diff)
# se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
# z_value = avg_diff / se_diff
# p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
# comparisons.append({
# "Group1": variables[i],
# "Group2": variables[j],
# "Z": z_value,
# "p-value": p_value
# })
#
# return comparisons
def posthoc_friedman_nemenyi(data, variables, rank_suffix='_Rank'):
    """Run the Nemenyi post-hoc comparison that accompanies a Friedman test.

    Args:
        data: DataFrame containing one ``<variable><rank_suffix>`` column for
            every entry of ``variables``. Rows are treated as blocks (paired
            observations), matching how the Friedman test uses these columns.
        variables: Names of the groups whose rank columns are compared.
        rank_suffix: Suffix appended to each variable name to form its column.

    Returns:
        The pairwise p-value table produced by
        ``scikit_posthocs.posthoc_nemenyi_friedman`` (one row/column per
        entry of ``variables``, in the same order).
    """
    # Local import: the module header only imports ``posthoc_nemenyi``.
    from scikit_posthocs import posthoc_nemenyi_friedman

    ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
    # ``posthoc_nemenyi`` is the unpaired variant meant to follow a
    # Kruskal-Wallis test; it does not honour the block (row) pairing that
    # the Friedman test relies on. The Friedman-specific variant treats rows
    # as blocks and columns as groups.
    return posthoc_nemenyi_friedman(ranked_data)
def statistical_tests(data):
    """Perform various statistical tests to evaluate potential biases.

    Args:
        data: DataFrame with ``Privilege``, ``Protect`` and ``Neutral``
            columns for both the ``_Rank`` and ``_Avg_Score`` suffixes.
            Rows are treated as paired observations across the three groups.

    Returns:
        dict with the keys:
            * ``"Average Ranks"``: mean of each ``*_Rank`` column.
            * ``"Friedman Test"``: ``Statistic``, ``p-value`` and the
              ``Post-hoc`` result of ``posthoc_friedman_nemenyi``.
            * ``"Wilcoxon Signed-Rank Test"``: per score-column pair, the
              ``Statistic`` and ``p-value`` of the paired Wilcoxon test.
    """
    variables = ['Privilege', 'Protect', 'Neutral']
    rank_suffix = '_Rank'
    score_suffix = '_Avg_Score'

    # Average rank per group.
    rank_columns = [v + rank_suffix for v in variables]
    average_ranks = data[rank_columns].mean()
    rank_data = [data[col] for col in rank_columns]

    # Pairwise comparisons are run on the average scores, not on the ranks.
    pairs = [
        ('Privilege', 'Protect'),
        ('Protect', 'Neutral'),
        ('Privilege', 'Neutral'),
    ]
    pairwise_results = {'Wilcoxon Signed-Rank Test': {}}
    for var1, var2 in pairs:
        pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
        wilcoxon_stat, wilcoxon_p = wilcoxon(
            data[f'{var1}{score_suffix}'],
            data[f'{var2}{score_suffix}'],
        )
        pairwise_results['Wilcoxon Signed-Rank Test'][pair_name_score] = {
            "Statistic": wilcoxon_stat,
            "p-value": wilcoxon_p,
        }

    # Omnibus Friedman test over the three rank columns, plus its post-hoc.
    friedman_stat, friedman_p = friedmanchisquare(*rank_data)
    posthoc_results = posthoc_friedman_nemenyi(data, variables, rank_suffix)

    results = {
        "Average Ranks": average_ranks.to_dict(),
        "Friedman Test": {
            "Statistic": friedman_stat,
            "p-value": friedman_p,
            "Post-hoc": posthoc_results,
        },
        **pairwise_results,
    }
    return results
def hellinger_distance(p, q):
    """Return the Hellinger distance between distributions ``p`` and ``q``.

    Computed as ``sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2))``; ``p`` and
    ``q`` must be broadcastable against each other.
    """
    root_gap = np.sqrt(p) - np.sqrt(q)
    half_squared_sum = 0.5 * np.sum(root_gap ** 2)
    return np.sqrt(half_squared_sum)
def calculate_correlations(df):
    """Compute pairwise rank correlations between the three rank columns.

    For every unordered pair of ``Privilege_Rank``, ``Protect_Rank`` and
    ``Neutral_Rank`` the Spearman, Pearson and Kendall's Tau coefficients are
    stored under the key ``"<col1> vs <col2>"``.

    Returns:
        dict keyed by ``'Spearman'``, ``'Pearson'`` and ``'Kendall Tau'``,
        each mapping the pair label to its correlation coefficient.
    """
    rank_cols = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
    spearman, pearson, kendall = {}, {}, {}

    for position, first in enumerate(rank_cols):
        # Only pair each column with the ones after it (unordered pairs).
        for second in rank_cols[position + 1:]:
            label = f'{first} vs {second}'
            left, right = df[first], df[second]
            spearman[label] = spearmanr(left, right).correlation
            pearson[label] = pearsonr(left, right)[0]
            kendall[label] = kendalltau(left, right).correlation

    return {
        'Spearman': spearman,
        'Pearson': pearson,
        'Kendall Tau': kendall,
    }
def scores_to_prob(scores):
    """Convert scores to a probability distribution over integer bins.

    Each score is truncated to its integer part and counted; position ``k``
    of the result is the fraction of non-missing scores whose integer part
    is ``k``. The result has ``int(max score) + 1`` entries and sums to 1.

    Args:
        scores: pandas Series of non-negative numeric scores. Missing values
            are ignored.

    Returns:
        1-D float ``numpy.ndarray`` of bin probabilities.

    Raises:
        ValueError: If there are no non-missing scores, or if any score
            truncates to a negative integer (it cannot be used as a bin).
    """
    values = np.asarray(scores.dropna(), dtype=float)
    if values.size == 0:
        raise ValueError("scores_to_prob requires at least one non-missing score")

    bins = values.astype(int)  # truncation toward zero, e.g. 3.7 -> 3
    if bins.min() < 0:
        raise ValueError("scores_to_prob requires non-negative scores")

    # bincount accumulates every score that lands in the same bin. Assigning
    # per-value probabilities through a fancy index would instead overwrite
    # earlier values whenever two distinct scores share an integer part.
    counts = np.bincount(bins)
    return counts / counts.sum()
def calculate_divergences(df):
    """Calculate KL and Jensen-Shannon divergences and Hellinger distance.

    The three ``*_Avg_Score`` columns are each turned into a probability
    vector with ``scores_to_prob`` and every unordered pair of columns is
    compared.

    Args:
        df: DataFrame containing ``Privilege_Avg_Score``,
            ``Protect_Avg_Score`` and ``Neutral_Avg_Score``.

    Returns:
        dict keyed by ``'KL Divergence'``, ``'Jensen-Shannon Divergence'``
        and ``'Hellinger Distance'``, each mapping ``"<col1> vs <col2>"`` to
        the measure for that pair. KL divergence is asymmetric (first column
        relative to second) and is ``inf`` when the second distribution has a
        zero where the first does not.
    """
    score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
    raw_probabilities = {
        col: np.asarray(scores_to_prob(df[col]), dtype=float) for col in score_columns
    }

    # Each vector is sized by its own column's maximum score, so columns with
    # different maxima produce different lengths. Zero-pad to a shared support
    # so the pairwise measures compare like with like instead of failing.
    support = max(len(p) for p in raw_probabilities.values())
    probabilities = {
        col: np.pad(p, (0, support - len(p))) for col, p in raw_probabilities.items()
    }

    divergences = {
        'KL Divergence': {},
        'Jensen-Shannon Divergence': {},
        'Hellinger Distance': {},
    }
    for position, col1 in enumerate(score_columns):
        for col2 in score_columns[position + 1:]:
            label = f'{col1} vs {col2}'
            p, q = probabilities[col1], probabilities[col2]
            divergences['KL Divergence'][label] = entropy(p, q)
            # scipy's jensenshannon returns the JS *distance*, i.e. the square
            # root of the divergence; square it to report the divergence.
            divergences['Jensen-Shannon Divergence'][label] = jensenshannon(p, q) ** 2
            divergences['Hellinger Distance'][label] = hellinger_distance(p, q)
    return divergences
# def statistical_tests(data):
# """Perform various statistical tests to evaluate potential biases."""
# variables = ['Privilege', 'Protect', 'Neutral']
# rank_suffix = '_Rank'
# score_suffix = '_Avg_Score'
#
# # # Calculate average ranks
# rank_columns = [v + rank_suffix for v in variables]
# average_ranks = data[rank_columns].mean()
#
# # Statistical tests
# rank_data = [data[col] for col in rank_columns]
#
# # Pairwise tests
# pairs = [
# ('Privilege', 'Protect'),
# ('Protect', 'Neutral'),
# ('Privilege', 'Neutral')
# ]
#
# pairwise_results = {
# 'T-Test': {}
# }
#
# for (var1, var2) in pairs:
# pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
#
# # T-test for independent samples
# t_stat, t_p = ttest_ind(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
# pairwise_results['T-Test'][pair_name_score] = {"Statistic": t_stat, "p-value": t_p}
#
# results = {
# "Average Ranks": average_ranks.to_dict(),
# "Friedman Test": {
# "Statistic": friedmanchisquare(*rank_data).statistic,
# "p-value": friedmanchisquare(*rank_data).pvalue
# },
# **pairwise_results,
# }
#
# return results
def disabled_statistical_tests(data):
    """Perform the legacy set of statistical tests to evaluate potential biases.

    Args:
        data: DataFrame with ``Privilege``, ``Protect`` and ``Neutral``
            columns for both the ``_Rank`` and ``_Avg_Score`` suffixes.

    Returns:
        dict with the keys:
            * ``"Friedman Test"``: ``Statistic`` and ``p-value`` of the
              Friedman test over the three ``*_Rank`` columns.
            * ``"T-Test"``: per score-column pair, the ``Statistic`` and
              ``p-value`` of ``ttest_ind`` (independent-samples t-test).
    """
    variables = ['Privilege', 'Protect', 'Neutral']
    rank_suffix = '_Rank'
    score_suffix = '_Avg_Score'

    rank_columns = [v + rank_suffix for v in variables]
    rank_data = [data[col] for col in rank_columns]

    pairs = [
        ('Privilege', 'Protect'),
        ('Protect', 'Neutral'),
        ('Privilege', 'Neutral'),
    ]
    pairwise_results = {'T-Test': {}}
    for var1, var2 in pairs:
        pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
        # T-test for independent samples on the average scores.
        t_stat, t_p = ttest_ind(
            data[f'{var1}{score_suffix}'],
            data[f'{var2}{score_suffix}'],
        )
        pairwise_results['T-Test'][pair_name_score] = {"Statistic": t_stat, "p-value": t_p}

    # Run the Friedman test once and reuse both outputs.
    friedman_stat, friedman_p = friedmanchisquare(*rank_data)

    results = {
        "Friedman Test": {
            "Statistic": friedman_stat,
            "p-value": friedman_p,
        },
        **pairwise_results,
    }
    return results