Zekun Wu committed
Commit
40d7b09
1 Parent(s): afb51e1
Files changed (2)
  1. pages/5_Evaluation_Single.py +3 -3
  2. util/analysis.py +103 -5
pages/5_Evaluation_Single.py CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.analysis import statistical_tests_single, result_evaluation_single
+from util.analysis import statistical_tests, result_evaluation
 
 def app():
     st.title('Result Evaluation')
@@ -23,9 +23,9 @@ def app():
     # Display button to perform evaluation if data is uploaded
     if st.button('Evaluate Data'):
         with st.spinner('Evaluating data...'):
-            test_results = statistical_tests_single(df)
+            test_results = statistical_tests(df,"single")
             st.write('Test Results:', test_results)
-            evaluation_results = result_evaluation_single(test_results)
+            evaluation_results = result_evaluation(test_results,"single")
            st.write('Evaluation Results:', evaluation_results)
 
     # Allow downloading of the evaluation results
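For context, df in the hunk above is built by the page's upload step, which falls outside the changed lines. A minimal sketch of that step, assuming a CSV upload parsed via the StringIO import already present in the file; the widget label and the read_csv call are assumptions, not part of the commit:

    import pandas as pd
    import streamlit as st
    from io import StringIO

    # Hypothetical upload step (assumed, not shown in this diff): parse the
    # uploaded CSV into the DataFrame later passed to statistical_tests(df, "single").
    uploaded = st.file_uploader('Upload evaluation results (CSV)')
    if uploaded is not None:
        df = pd.read_csv(StringIO(uploaded.getvalue().decode('utf-8')))
        st.dataframe(df.head())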
util/analysis.py CHANGED
@@ -1,9 +1,7 @@
 import pandas as pd
 import numpy as np
-from scipy.stats import (friedmanchisquare, wilcoxon, kruskal, mannwhitneyu, f_oneway,
-                         ttest_ind, levene)
-from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
-
+from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
+from statsmodels.stats.multicomp import MultiComparison
 
 
 def statistical_tests_multiple(data):
     # Calculate average ranks
@@ -224,4 +222,104 @@ def result_evaluation_single(test_results):
     # Tukey HSD Test evaluation
     evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
 
-    return evaluation
+    return evaluation
+
+
+
+
+
+
+def statistical_tests(data, test_type='multiple'):
+    if test_type == 'multiple':
+        variables = ['Privilege', 'Protect', 'Neutral']
+        rank_suffix = '_Rank'
+        score_suffix = '_Avg_Score'
+    elif test_type == 'single':
+        variables = ['Counterfactual', 'Neutral']
+        rank_suffix = '_Rank'
+        score_suffix = '_Avg_Score'
+    else:
+        raise ValueError("test_type must be either 'multiple' or 'single'")
+
+    # Calculate average ranks
+    rank_columns = [v + rank_suffix for v in variables]
+    average_ranks = data[rank_columns].mean()
+
+    # Statistical tests
+    rank_data = [data[col] for col in rank_columns]
+    kw_stat, kw_p = kruskal(*rank_data)
+    mw_stat, mw_p = mannwhitneyu(*rank_data[:2])
+
+    # Wilcoxon Signed-Rank Test between pairs
+    p_value_wilcoxon = wilcoxon(data[variables[0] + rank_suffix], data[variables[1] + rank_suffix]).pvalue if len(data) > 20 else "Sample size too small for Wilcoxon test."
+
+    # Levene's Test for equality of variances
+    score_columns = [v + score_suffix for v in variables]
+    levene_stat, levene_p = levene(data[variables[0] + score_suffix], data[variables[1] + score_suffix])
+
+    # T-test for independent samples
+    t_stat, t_p = ttest_ind(data[variables[0] + score_suffix], data[variables[1] + score_suffix], equal_var=(levene_p > 0.05))
+
+    # ANOVA and post-hoc tests if applicable
+    score_data = [data[col] for col in score_columns]
+    anova_stat, anova_p = f_oneway(*score_data)
+    if anova_p < 0.05:
+        mc = MultiComparison(pd.concat(score_data), np.repeat(variables, len(data)))
+        tukey_result = mc.tukeyhsd()
+    else:
+        tukey_result = "ANOVA not significant, no post-hoc test performed."
+
+    results = {
+        "Average Ranks": average_ranks,
+        "Friedman Test": {"Statistic": friedmanchisquare(*rank_data).statistic if test_type == 'multiple' else np.nan, "p-value": friedmanchisquare(*rank_data).pvalue if test_type == 'multiple' else np.nan},
+        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
+        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
+        "Wilcoxon Test Between Pairs": p_value_wilcoxon,
+        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
+        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
+        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
+        "Tukey HSD Test": tukey_result
+    }
+
+    return results
+
+
+def result_evaluation(test_results, test_type='multiple'):
+    evaluation = {}
+    if test_type == 'multiple':
+        variables = ['Privilege', 'Protect', 'Neutral']
+    elif test_type == 'single':
+        variables = ['Counterfactual', 'Neutral']
+    else:
+        raise ValueError("test_type must be either 'multiple' or 'single'")
+
+    # Format average ranks and rank analysis
+    rank_format = ", ".join([f"{v}: {{:.2f}}".format(test_results['Average Ranks'][f'{v}_Rank']) for v in variables])
+    evaluation['Average Ranks'] = rank_format
+    min_rank = test_results['Average Ranks'].idxmin()
+    max_rank = test_results['Average Ranks'].idxmax()
+    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
+    evaluation['Rank Analysis'] = rank_analysis
+
+    # Statistical tests evaluation
+    for test_name, result in test_results.items():
+        if isinstance(result, dict) and 'Test' in test_name and test_name != 'Tukey HSD Test':  # Generalizing test evaluations
+            p_value = result['p-value']
+            significant = p_value < 0.05
+            test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
+            evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
+
+    # Special case evaluations
+    if 'Wilcoxon Test Between Pairs' in test_results:
+        if isinstance(test_results['Wilcoxon Test Between Pairs'], str):  # Handle small sample size message
+            evaluation['Wilcoxon Test Between Pairs'] = test_results['Wilcoxon Test Between Pairs']
+        else:
+            p_value = test_results['Wilcoxon Test Between Pairs']
+            evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {p_value:.5f}), indicating bias." if p_value < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
+
+    # ANOVA and Tukey HSD tests
+    if test_type == 'multiple':
+        evaluation['ANOVA Test'] = test_results['ANOVA Test']
+        evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
+
+    return evaluation
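The new functions consolidate the per-mode logic behind a single test_type switch, alongside the existing per-mode helpers. A minimal sketch of exercising them outside Streamlit, assuming a DataFrame whose columns follow the _Rank and _Avg_Score suffixes that the 'single' branch reads; the sample values are invented:

    import numpy as np
    import pandas as pd
    from util.analysis import statistical_tests, result_evaluation

    # Toy data with the column names the 'single' mode expects.
    rng = np.random.default_rng(0)
    n = 30  # above the >20 threshold, so the Wilcoxon branch runs
    df = pd.DataFrame({
        'Counterfactual_Rank': rng.integers(1, 3, n),
        'Neutral_Rank': rng.integers(1, 3, n),
        'Counterfactual_Avg_Score': rng.normal(3.0, 0.5, n),
        'Neutral_Avg_Score': rng.normal(3.2, 0.5, n),
    })

    test_results = statistical_tests(df, 'single')        # same call the page now makes
    evaluation = result_evaluation(test_results, 'single')
    for name, verdict in evaluation.items():
        print(name, '->', verdict)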