Spaces:
Sleeping
Sleeping
Zekun Wu
committed on
Commit
•
fcfc515
1
Parent(s):
473c1df
update
Browse files- util/evaluation.py +25 -20
util/evaluation.py
CHANGED
@@ -62,12 +62,11 @@ def calculate_divergences(df):
|
|
62 |
|
63 |
|
64 |
def statistical_tests(data):
|
65 |
-
|
66 |
variables = ['Privilege', 'Protect', 'Neutral']
|
67 |
rank_suffix = '_Rank'
|
68 |
score_suffix = '_Avg_Score'
|
69 |
|
70 |
-
|
71 |
# Calculate average ranks
|
72 |
rank_columns = [v + rank_suffix for v in variables]
|
73 |
average_ranks = data[rank_columns].mean()
|
@@ -75,49 +74,55 @@ def statistical_tests(data):
|
|
75 |
# Statistical tests
|
76 |
rank_data = [data[col] for col in rank_columns]
|
77 |
kw_stat, kw_p = kruskal(*rank_data)
|
78 |
-
mw_stat, mw_p = mannwhitneyu(
|
79 |
|
80 |
# Wilcoxon Signed-Rank Test between pairs
|
81 |
-
|
|
|
|
|
|
|
82 |
|
83 |
# Levene's Test for equality of variances
|
84 |
score_columns = [v + score_suffix for v in variables]
|
85 |
-
levene_stat, levene_p = levene(data[
|
86 |
|
87 |
# T-test for independent samples
|
88 |
-
t_stat, t_p = ttest_ind(data[
|
89 |
|
90 |
# ANOVA and post-hoc tests if applicable
|
91 |
score_data = [data[col] for col in score_columns]
|
92 |
anova_stat, anova_p = f_oneway(*score_data)
|
93 |
if anova_p < 0.05:
|
94 |
-
mc = MultiComparison(
|
95 |
tukey_result = mc.tukeyhsd()
|
|
|
96 |
else:
|
97 |
-
|
98 |
|
99 |
results = {
|
100 |
-
"Average Ranks": average_ranks,
|
101 |
-
"Friedman Test": {
|
|
|
|
|
|
|
102 |
"Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
|
103 |
"Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
|
104 |
-
"Wilcoxon Test Between Pairs":
|
105 |
"Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
|
106 |
"T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
|
107 |
"ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
|
108 |
-
"Tukey HSD Test":
|
109 |
}
|
110 |
|
111 |
return results
|
112 |
|
113 |
-
|
114 |
def result_evaluation(test_results):
|
|
|
115 |
evaluation = {}
|
116 |
-
|
117 |
variables = ['Privilege', 'Protect', 'Neutral']
|
118 |
|
119 |
# Format average ranks and rank analysis
|
120 |
-
rank_format = ", ".join([f"{v}: {
|
121 |
evaluation['Average Ranks'] = rank_format
|
122 |
min_rank = test_results['Average Ranks'].idxmin()
|
123 |
max_rank = test_results['Average Ranks'].idxmax()
|
@@ -126,7 +131,7 @@ def result_evaluation(test_results):
|
|
126 |
|
127 |
# Statistical tests evaluation
|
128 |
for test_name, result in test_results.items():
|
129 |
-
if 'Test' in test_name and test_name != 'Tukey HSD Test':
|
130 |
if isinstance(result, dict) and 'p-value' in result:
|
131 |
p_value = result['p-value']
|
132 |
significant = p_value < 0.05
|
@@ -138,14 +143,14 @@ def result_evaluation(test_results):
|
|
138 |
# Special case evaluations
|
139 |
if 'Wilcoxon Test Between Pairs' in test_results:
|
140 |
wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
|
141 |
-
if isinstance(wilcoxon_result, float):
|
142 |
-
evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result:.5f}), indicating bias." if wilcoxon_result < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
|
143 |
else:
|
144 |
-
evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result # Presuming it's an error message or non-numeric value
|
145 |
|
146 |
# ANOVA and Tukey HSD tests
|
147 |
anova_p = test_results['ANOVA Test'].get('p-value', 1) # Default to 1 if p-value is missing
|
148 |
-
evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required." if anova_p >= 0.05 else
|
149 |
evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')
|
150 |
|
151 |
return evaluation
|
|
|
62 |
|
63 |
|
64 |
def statistical_tests(data):
    """Run a battery of statistical tests over group rank/score columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Assumed to contain, for each group in ``['Privilege', 'Protect',
        'Neutral']``, a ``<group>_Rank`` column and a ``<group>_Avg_Score``
        column (inferred from the column names constructed below —
        TODO confirm against the caller).

    Returns
    -------
    dict
        Test names mapped to ``{"Statistic": ..., "p-value": ...}`` dicts,
        plus the average-rank mapping and the Tukey HSD summary (HTML
        string when run, explanatory string otherwise).
    """
    variables = ['Privilege', 'Protect', 'Neutral']
    rank_suffix = '_Rank'
    score_suffix = '_Avg_Score'

    # Calculate average ranks per group.
    # NOTE(review): this is returned via .to_dict() below, but downstream
    # code appears to call .idxmin()/.idxmax() on "Average Ranks", which a
    # plain dict does not support — verify against result_evaluation().
    rank_columns = [v + rank_suffix for v in variables]
    average_ranks = data[rank_columns].mean()

    # Omnibus rank test across all three groups, plus a pairwise
    # Mann-Whitney U restricted to the first two (Privilege vs Protect).
    rank_data = [data[col] for col in rank_columns]
    kw_stat, kw_p = kruskal(*rank_data)
    mw_stat, mw_p = mannwhitneyu(rank_data[0], rank_data[1])

    # Fix: compute the Friedman test ONCE (the original evaluated
    # friedmanchisquare(*rank_data) twice — separately for .statistic
    # and .pvalue — doubling the work for identical results).
    friedman_result = friedmanchisquare(*rank_data)

    # Wilcoxon signed-rank test between the first pair; skipped for small
    # samples where the test is unreliable.
    if len(data) > 20:
        wilcoxon_stat, wilcoxon_p = wilcoxon(rank_data[0], rank_data[1])
    else:
        # The p-value slot deliberately carries an explanatory string here;
        # the consumer type-checks for float before numeric formatting.
        wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."

    # Levene's test for equality of variances (first two score columns).
    score_columns = [v + score_suffix for v in variables]
    levene_stat, levene_p = levene(data[score_columns[0]], data[score_columns[1]])

    # Independent t-test; pool variances only when Levene's test does not
    # reject equal variances.
    t_stat, t_p = ttest_ind(data[score_columns[0]],
                            data[score_columns[1]],
                            equal_var=(levene_p > 0.05))

    # ANOVA across the three score columns, with Tukey HSD post-hoc only
    # when the omnibus test is significant.
    score_data = [data[col] for col in score_columns]
    anova_stat, anova_p = f_oneway(*score_data)
    if anova_p < 0.05:
        # Fix: melt ONLY the score columns, and melt once. The original
        # called data.melt() twice on the full frame, which both repeated
        # the work and pulled the *_Rank columns into the Tukey comparison
        # even though the preceding ANOVA used the score columns alone.
        melted_scores = data[score_columns].melt()
        mc = MultiComparison(melted_scores['value'], melted_scores['variable'])
        tukey_result = mc.tukeyhsd()
        tukey_result_summary = tukey_result.summary().as_html()
    else:
        tukey_result_summary = "ANOVA not significant, no post-hoc test performed."

    results = {
        "Average Ranks": average_ranks.to_dict(),
        "Friedman Test": {
            "Statistic": friedman_result.statistic,
            "p-value": friedman_result.pvalue
        },
        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
        "Wilcoxon Test Between Pairs": {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p},
        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
        "Tukey HSD Test": tukey_result_summary
    }

    return results
|
118 |
|
|
|
119 |
def result_evaluation(test_results):
|
120 |
+
"""Evaluate the results of statistical tests to provide insights on potential biases."""
|
121 |
evaluation = {}
|
|
|
122 |
variables = ['Privilege', 'Protect', 'Neutral']
|
123 |
|
124 |
# Format average ranks and rank analysis
|
125 |
+
rank_format = ", ".join([f"{v}: {test_results['Average Ranks'][f'{v}_Rank']:.2f}" for v in variables])
|
126 |
evaluation['Average Ranks'] = rank_format
|
127 |
min_rank = test_results['Average Ranks'].idxmin()
|
128 |
max_rank = test_results['Average Ranks'].idxmax()
|
|
|
131 |
|
132 |
# Statistical tests evaluation
|
133 |
for test_name, result in test_results.items():
|
134 |
+
if 'Test' in test_name and test_name != 'Tukey HSD Test':
|
135 |
if isinstance(result, dict) and 'p-value' in result:
|
136 |
p_value = result['p-value']
|
137 |
significant = p_value < 0.05
|
|
|
143 |
# Special case evaluations
|
144 |
if 'Wilcoxon Test Between Pairs' in test_results:
|
145 |
wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
|
146 |
+
if isinstance(wilcoxon_result['p-value'], float):
|
147 |
+
evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result['p-value']:.5f}), indicating bias." if wilcoxon_result['p-value'] < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
|
148 |
else:
|
149 |
+
evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result['p-value'] # Presuming it's an error message or non-numeric value
|
150 |
|
151 |
# ANOVA and Tukey HSD tests
|
152 |
anova_p = test_results['ANOVA Test'].get('p-value', 1) # Default to 1 if p-value is missing
|
153 |
+
evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required." if anova_p >= 0.05 else f"Significant differences found among groups (p = {anova_p:.5f})."
|
154 |
evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')
|
155 |
|
156 |
return evaluation
|