saattrupdan committed on
Commit 3a84b3b
1 Parent(s): 5f70754

chore: Small update

Files changed (1)
  1. app.py +12 -14
app.py CHANGED
@@ -45,12 +45,12 @@ the models 10 times with bootstrapped test sets and different few-shot examples
 iteration. This allows us to better measure the uncertainty of the results. We use the
 uncertainty in the radial plot when we compute the rank scores for the models. Namely,
 we compute the rank score by firstly computing the rank of the model on each task,
-where two models are considered to have the same rank if they have there is not a
-statistically significant difference between their scores (one-tailed t-test with p <
-0.05). We next apply a logaritmic transformation to the ranks, to downplay the
-importance of the poorly performing models. Lastly, we invert and normalise the
-logaritmic ranks to the range [0, 1], resulting in the best performing models having
-large rank scores and the worst performing models having small rank scores.
+where two models are considered to have the same rank if there is not a statistically
+significant difference between their scores (one-tailed t-test with p < 0.05). We next
+apply a logarithmic transformation to the ranks, to downplay the importance of the
+poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
+range [0, 1], resulting in the best performing models having rank scores close to 1 and
+the worst performing models having rank scores close to 0.
 
 ## The Benchmark Datasets
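The prose above only describes the ranking procedure at a high level. As a rough illustration (not the leaderboard's actual code), the significance-based ranking could look like the sketch below, which assumes a hypothetical `bootstrap_scores` mapping from model IDs to arrays of bootstrapped scores (higher is better) and uses `scipy.stats.ttest_ind` for the one-tailed test:

```python
# Minimal sketch of significance-aware ranking; the names and the exact tie
# logic are assumptions, not the leaderboard's actual implementation.
import numpy as np
from scipy import stats


def significance_ranks(bootstrap_scores: dict[str, np.ndarray]) -> dict[str, int]:
    """Rank models, sharing a rank when the score difference is not significant."""
    # Sort models from best to worst by their mean bootstrapped score.
    model_ids = sorted(
        bootstrap_scores, key=lambda m: bootstrap_scores[m].mean(), reverse=True
    )
    ranks: dict[str, int] = {}
    rank = 1
    reference = bootstrap_scores[model_ids[0]]
    ranks[model_ids[0]] = rank
    for model_id in model_ids[1:]:
        scores = bootstrap_scores[model_id]
        # One-tailed t-test: is the reference model significantly better?
        p_value = stats.ttest_ind(reference, scores, alternative="greater").pvalue
        if p_value < 0.05:
            # Significant gap: this model starts a new (worse) rank.
            rank += 1
            reference = scores
        ranks[model_id] = rank
    return ranks
```

Here a model only starts a new rank when it is significantly worse than the model anchoring the current rank; the app itself may compare adjacent models instead.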
 
@@ -551,9 +551,9 @@ def produce_radial_plot(
     ranks.append(rank)
 
 log_ranks = np.log(ranks)
-scores = log_ranks / log_ranks.max()
+scores = 1 - (log_ranks / log_ranks.max())
 for model_id, score in zip(model_ids_sorted, scores):
-    all_rank_scores[task][language][model_id] = 1 - score
+    all_rank_scores[task][language][model_id] = score
 logger.info("Successfully computed rank scores.")
 
 # Add all the evaluation results for each model
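This hunk folds the inversion into the normalisation, so the stored rank score is already in [0, 1] with 1 for the best model. A small worked example (with made-up ranks) of what `1 - log_ranks / log_ranks.max()` produces:

```python
# Worked example of the log-rank normalisation; the ranks are invented.
import numpy as np

ranks = np.array([1, 2, 3, 4])              # 1 = best model, 4 = worst
log_ranks = np.log(ranks)                   # [0.0, 0.693, 1.099, 1.386]
scores = 1 - log_ranks / log_ranks.max()
print(np.round(scores, 2).tolist())         # [1.0, 0.5, 0.21, 0.0]
```

The log transform compresses the gaps between the low-ranked models, which is the "downplay the importance of the poorly performing models" step described in the prose above.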
@@ -568,15 +568,13 @@ def produce_radial_plot(
 if model_id not in results_dfs_filtered[language].index:
     continue
 
-score_list = results_dfs_filtered[language].loc[model_id][task]
-
 rank_score = 100 * all_rank_scores[task][language][model_id]
 rank_scores.append(rank_score)
 
-if np.mean(score_list) < 1:
-    score_list = [100 * score for score in score_list]
-
-scores.append(np.mean(score_list))
+score_arr = np.array(results_dfs_filtered[language].loc[model_id][task])
+if score_arr.mean() < 1:
+    score_arr *= 100
+scores.append(score_arr.mean())
 if use_rank_score:
     result_list.append(np.mean(rank_scores))
 else:
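The last hunk swaps the per-element list comprehension for a NumPy array, so the fraction-to-percentage conversion and the mean happen in one place. A toy example with invented scores:

```python
# Toy example of the score handling above; the values are invented.
import numpy as np

score_list = [0.81, 0.79, 0.84]    # bootstrapped scores on a 0-1 scale
score_arr = np.array(score_list)
if score_arr.mean() < 1:           # heuristic from the diff: means below 1 are fractions
    score_arr *= 100               # convert to percentages
print(round(score_arr.mean(), 2))  # 81.33
```

Metrics that are already reported as percentages (mean of 1 or above) are left unchanged, so all scores end up on the same 0 to 100 scale before averaging.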
 