saattrupdan committed on
Commit
576340d
1 Parent(s): ada1f6c

chore: Revert last change

Browse files
Files changed (1) hide show
  1. app.py +6 -50
app.py CHANGED
@@ -555,19 +555,18 @@ def produce_radial_plot(
555
  for language in languages:
556
  if model_id not in results_dfs_filtered[language].index:
557
  continue
558
-
559
  score_list = results_dfs_filtered[language].loc[model_id][task]
560
- if all(score < 1 for score in score_list):
561
- score_list = [100 * score for score in score_list]
562
-
563
  win_ratio = 100 * np.mean([
564
- scores_statistically_better(
565
- score_values_1=score_list, score_values_2=other_scores
566
- )
567
  for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
568
  ])
569
  win_ratios.append(win_ratio)
570
 
 
 
 
571
  scores.append(np.mean(score_list))
572
  if use_win_ratio:
573
  result_list.append(np.mean(win_ratios))
@@ -688,48 +687,5 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
688
 
689
  return results_dfs
690
 
691
-
692
- def scores_statistically_better(
693
- score_values_1: list[float], score_values_2: list[float]
694
- ) -> bool:
695
- """Determine whether the first score group is statistically better than the second.
696
-
697
- Args:
698
- score_values_1:
699
- The scores for the first group.
700
- score_values_2:
701
- The scores for the second group.
702
-
703
- Returns:
704
- Whether the first score group is statistically better than the second.
705
- """
706
- assert len(score_values_1) == len(score_values_2), (
707
- "The two score groups must have the same length."
708
- )
709
-
710
- # Separate the scores into groups of 10, consisting of the scores for each
711
- # dataset
712
- group_scores_1 = [
713
- score_values_1[idx:idx+10] for idx in range(0, len(score_values_1), 10)
714
- ]
715
- group_scores_2 = [
716
- score_values_2[idx:idx+10] for idx in range(0, len(score_values_2), 10)
717
- ]
718
-
719
- # Compute t-statistics for each group separately, and compute the mean
720
- # t-statistic
721
- t_statistics = [
722
- stats.ttest_ind(a=group_1, b=group_2, alternative="greater").statistic
723
- for group_1, group_2 in zip(group_scores_1, group_scores_2)
724
- ]
725
- mean_t_statistic = np.mean(t_statistics)
726
-
727
- # Compute the p-value for the mean t-statistic, where the null hypothesis is
728
- # that the first group does not have a larger mean score than the second group
729
- degrees_of_freedom = len(score_values_1) - 1
730
- p_value = 1 - stats.t.cdf(abs(mean_t_statistic), degrees_of_freedom)
731
-
732
- return p_value < 0.05
733
-
734
  if __name__ == "__main__":
735
  main()
 
555
  for language in languages:
556
  if model_id not in results_dfs_filtered[language].index:
557
  continue
 
558
  score_list = results_dfs_filtered[language].loc[model_id][task]
 
 
 
559
  win_ratio = 100 * np.mean([
560
+ stats.ttest_rel(
561
+ a=score_list, b=other_scores, alternative="greater"
562
+ ).pvalue < 0.05
563
  for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id)
564
  ])
565
  win_ratios.append(win_ratio)
566
 
567
+ if all(score < 1 for score in score_list):
568
+ score_list = [100 * score for score in score_list]
569
+
570
  scores.append(np.mean(score_list))
571
  if use_win_ratio:
572
  result_list.append(np.mean(win_ratios))
 
687
 
688
  return results_dfs
689
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690
  if __name__ == "__main__":
691
  main()