Zekun Wu committed on
Commit 168431b
1 Parent(s): 54e8b17
Files changed (1)
  1. util/evaluation.py +123 -54
util/evaluation.py CHANGED
@@ -60,7 +60,6 @@ def calculate_divergences(df):
     return divergences
 
 
-
 def statistical_tests(data):
     """Perform various statistical tests to evaluate potential biases."""
     variables = ['Privilege', 'Protect', 'Neutral']
@@ -74,22 +73,40 @@ def statistical_tests(data):
     # Statistical tests
     rank_data = [data[col] for col in rank_columns]
     kw_stat, kw_p = kruskal(*rank_data)
-    mw_stat, mw_p = mannwhitneyu(rank_data[0], rank_data[1])
 
-    # Wilcoxon Signed-Rank Test between pairs
-    if len(data) > 20:
-        wilcoxon_stat, wilcoxon_p = wilcoxon(rank_data[0], rank_data[1])
-    else:
-        wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
+    # Pairwise tests
+    pairwise_results = {}
+    pairs = [
+        ('Privilege', 'Protect'),
+        ('Protect', 'Neutral'),
+        ('Privilege', 'Neutral')
+    ]
 
-    # Levene's Test for equality of variances
-    score_columns = [v + score_suffix for v in variables]
-    levene_stat, levene_p = levene(data[score_columns[0]], data[score_columns[1]])
+    for (var1, var2) in pairs:
+        pair_name = f'{var1} vs {var2}'
+
+        # Mann-Whitney U Test
+        mw_stat, mw_p = mannwhitneyu(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
+        pairwise_results[f'Mann-Whitney U Test {pair_name}'] = {"Statistic": mw_stat, "p-value": mw_p}
+
+        # Wilcoxon Signed-Rank Test
+        if len(data) > 20:
+            wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
+        else:
+            wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
+        pairwise_results[f'Wilcoxon Test {pair_name}'] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
+
+        # Levene's Test for equality of variances
+        levene_stat, levene_p = levene(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
+        pairwise_results[f'Levene\'s Test {pair_name}'] = {"Statistic": levene_stat, "p-value": levene_p}
 
-    # T-test for independent samples
-    t_stat, t_p = ttest_ind(data[score_columns[0]], data[score_columns[1]], equal_var=(levene_p > 0.05))
+        # T-test for independent samples
+        t_stat, t_p = ttest_ind(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'],
+                                equal_var=(levene_p > 0.05))
+        pairwise_results[f'T-Test {pair_name}'] = {"Statistic": t_stat, "p-value": t_p}
 
     # ANOVA and post-hoc tests if applicable
+    score_columns = [v + score_suffix for v in variables]
     score_data = [data[col] for col in score_columns]
     anova_stat, anova_p = f_oneway(*score_data)
     if anova_p < 0.05:
@@ -106,52 +123,104 @@ def statistical_tests(data):
             "p-value": friedmanchisquare(*rank_data).pvalue
         },
         "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-        "Wilcoxon Test Between Pairs": {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p},
-        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
+        **pairwise_results,
         "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
         "Tukey HSD Test": tukey_result_summary
     }
 
     return results
 
-def result_evaluation(test_results):
-    """Evaluate the results of statistical tests to provide insights on potential biases."""
-    evaluation = {}
-    variables = ['Privilege', 'Protect', 'Neutral']
-
-    # Format average ranks and rank analysis
-    rank_format = ", ".join([f"{v}: {test_results['Average Ranks'][f'{v}_Rank']:.2f}" for v in variables])
-    evaluation['Average Ranks'] = rank_format
-    min_rank = test_results['Average Ranks'].idxmin()
-    max_rank = test_results['Average Ranks'].idxmax()
-    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    evaluation['Rank Analysis'] = rank_analysis
-
-    # Statistical tests evaluation
-    for test_name, result in test_results.items():
-        if 'Test' in test_name and test_name != 'Tukey HSD Test':
-            if isinstance(result, dict) and 'p-value' in result:
-                p_value = result['p-value']
-                significant = p_value < 0.05
-                test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
-                evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
-            else:
-                evaluation[test_name] = "Test result format error or incomplete data."
-
-    # Special case evaluations
-    if 'Wilcoxon Test Between Pairs' in test_results:
-        wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
-        if isinstance(wilcoxon_result['p-value'], float):
-            evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result['p-value']:.5f}), indicating bias." if wilcoxon_result['p-value'] < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
-        else:
-            evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result['p-value']  # Presuming it's an error message or non-numeric value
-
-    # ANOVA and Tukey HSD tests
-    anova_p = test_results['ANOVA Test'].get('p-value', 1)  # Default to 1 if p-value is missing
-    evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required." if anova_p >= 0.05 else f"Significant differences found among groups (p = {anova_p:.5f})."
-    evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')
-
-    return evaluation
+# def statistical_tests(data):
+#     """Perform various statistical tests to evaluate potential biases."""
+#     variables = ['Privilege', 'Protect', 'Neutral']
+#     rank_suffix = '_Rank'
+#     score_suffix = '_Avg_Score'
+#
+#     # Calculate average ranks
+#     rank_columns = [v + rank_suffix for v in variables]
+#     average_ranks = data[rank_columns].mean()
+#
+#     # Statistical tests
+#     rank_data = [data[col] for col in rank_columns]
+#     kw_stat, kw_p = kruskal(*rank_data)
+#     mw_stat, mw_p = mannwhitneyu(rank_data[0], rank_data[1])
+#
+#     # Wilcoxon Signed-Rank Test between pairs
+#     if len(data) > 20:
+#         wilcoxon_stat, wilcoxon_p = wilcoxon(rank_data[0], rank_data[1])
+#     else:
+#         wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
+#
+#     # Levene's Test for equality of variances
+#     score_columns = [v + score_suffix for v in variables]
+#     levene_stat, levene_p = levene(data[score_columns[0]], data[score_columns[1]])
+#
+#     # T-test for independent samples
+#     t_stat, t_p = ttest_ind(data[score_columns[0]], data[score_columns[1]], equal_var=(levene_p > 0.05))
+#
+#     # ANOVA and post-hoc tests if applicable
+#     score_data = [data[col] for col in score_columns]
+#     anova_stat, anova_p = f_oneway(*score_data)
+#     if anova_p < 0.05:
+#         mc = MultiComparison(data.melt()['value'], data.melt()['variable'])
+#         tukey_result = mc.tukeyhsd()
+#         tukey_result_summary = tukey_result.summary().as_html()
+#     else:
+#         tukey_result_summary = "ANOVA not significant, no post-hoc test performed."
+#
+#     results = {
+#         "Average Ranks": average_ranks.to_dict(),
+#         "Friedman Test": {
+#             "Statistic": friedmanchisquare(*rank_data).statistic,
+#             "p-value": friedmanchisquare(*rank_data).pvalue
+#         },
+#         "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
+#         "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
+#         "Wilcoxon Test Between Pairs": {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p},
+#         "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
+#         "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
+#         "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
+#         "Tukey HSD Test": tukey_result_summary
+#     }
+#
+#     return results
+
+# def result_evaluation(test_results):
+#     """Evaluate the results of statistical tests to provide insights on potential biases."""
+#     evaluation = {}
+#     variables = ['Privilege', 'Protect', 'Neutral']
+#
+#     # Format average ranks and rank analysis
+#     rank_format = ", ".join([f"{v}: {test_results['Average Ranks'][f'{v}_Rank']:.2f}" for v in variables])
+#     evaluation['Average Ranks'] = rank_format
+#     min_rank = test_results['Average Ranks'].idxmin()
+#     max_rank = test_results['Average Ranks'].idxmax()
+#     rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
+#     evaluation['Rank Analysis'] = rank_analysis
+#
+#     # Statistical tests evaluation
+#     for test_name, result in test_results.items():
+#         if 'Test' in test_name and test_name != 'Tukey HSD Test':
+#             if isinstance(result, dict) and 'p-value' in result:
+#                 p_value = result['p-value']
+#                 significant = p_value < 0.05
+#                 test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
+#                 evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
+#             else:
+#                 evaluation[test_name] = "Test result format error or incomplete data."
+#
+#     # Special case evaluations
+#     if 'Wilcoxon Test Between Pairs' in test_results:
+#         wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
+#         if isinstance(wilcoxon_result['p-value'], float):
+#             evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result['p-value']:.5f}), indicating bias." if wilcoxon_result['p-value'] < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
+#         else:
+#             evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result['p-value']  # Presuming it's an error message or non-numeric value
+#
+#     # ANOVA and Tukey HSD tests
+#     anova_p = test_results['ANOVA Test'].get('p-value', 1)  # Default to 1 if p-value is missing
+#     evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required." if anova_p >= 0.05 else f"Significant differences found among groups (p = {anova_p:.5f})."
+#     evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')
+#
+#     return evaluation
 
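Note: the pairwise block this commit adds can be exercised in isolation. Below is a minimal, self-contained sketch of that loop, assuming the '_Rank'/'_Avg_Score' column suffixes defined in the commented-out copy of statistical_tests above; the synthetic DataFrame, the random seed, and the print loop are illustrative assumptions, not part of the commit.

# Minimal sketch of the new pairwise loop (assumptions: column names use the
# '_Rank' / '_Avg_Score' suffixes from the commented-out statistical_tests
# above; the synthetic data and the printing are illustrative only).
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, wilcoxon, levene, ttest_ind

rng = np.random.default_rng(0)
n = 30  # > 20, so the Wilcoxon branch in the commit would run rather than
        # returning the "sample size too small" note
data = pd.DataFrame({
    'Privilege_Rank': rng.integers(1, 4, n),
    'Protect_Rank': rng.integers(1, 4, n),
    'Neutral_Rank': rng.integers(1, 4, n),
    'Privilege_Avg_Score': rng.normal(3.0, 0.5, n),
    'Protect_Avg_Score': rng.normal(2.5, 0.5, n),
    'Neutral_Avg_Score': rng.normal(2.8, 0.5, n),
})

pairwise_results = {}
for var1, var2 in [('Privilege', 'Protect'), ('Protect', 'Neutral'), ('Privilege', 'Neutral')]:
    pair_name = f'{var1} vs {var2}'
    # Rank-based comparisons on the *_Rank columns
    mw_stat, mw_p = mannwhitneyu(data[f'{var1}_Rank'], data[f'{var2}_Rank'])
    pairwise_results[f'Mann-Whitney U Test {pair_name}'] = {"Statistic": mw_stat, "p-value": mw_p}
    wx_stat, wx_p = wilcoxon(data[f'{var1}_Rank'], data[f'{var2}_Rank'])
    pairwise_results[f'Wilcoxon Test {pair_name}'] = {"Statistic": wx_stat, "p-value": wx_p}
    # Score-based comparisons on the *_Avg_Score columns; Levene's p-value
    # gates the t-test's equal-variance assumption, as in the commit
    lv_stat, lv_p = levene(data[f'{var1}_Avg_Score'], data[f'{var2}_Avg_Score'])
    pairwise_results[f"Levene's Test {pair_name}"] = {"Statistic": lv_stat, "p-value": lv_p}
    t_stat, t_p = ttest_ind(data[f'{var1}_Avg_Score'], data[f'{var2}_Avg_Score'],
                            equal_var=(lv_p > 0.05))
    pairwise_results[f'T-Test {pair_name}'] = {"Statistic": t_stat, "p-value": t_p}

for name, res in pairwise_results.items():
    print(f"{name}: statistic={res['Statistic']:.3f}, p-value={res['p-value']:.4f}")

The equal_var=(levene_p > 0.05) gating mirrors the commit's logic: when Levene's test does not reject equal variances, Student's t-test is used; otherwise ttest_ind falls back to Welch's t-test. The keys of pairwise_results are exactly what **pairwise_results splices into the results dict in the new statistical_tests.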