saattrupdan committed
Commit 5f70754
Parent(s): 734648f

feat: Update app with log rank scores

Files changed (1)
  1. app.py +32 -27
app.py CHANGED
@@ -43,11 +43,14 @@ The generative models are evaluated using in-context learning with few-shot prom
 The few-shot examples are sampled randomly from the training split, and we benchmark
 the models 10 times with bootstrapped test sets and different few-shot examples in each
 iteration. This allows us to better measure the uncertainty of the results. We use the
- uncertainty in the radial plot when we compute the win ratios (i.e., the percentage of
- other models that a model beats on a task). Namely, we compute the win ratio as the
- percentage of other models that a model _significantly_ beats on a task, where we use a
- paired t-test with a significance level of 0.05 to determine whether a model
- significantly beats another model.
+ uncertainty in the radial plot when we compute the rank scores for the models. Namely,
+ we compute the rank score by first computing the rank of the model on each task,
+ where two models are considered to have the same rank if there is no statistically
+ significant difference between their scores (one-tailed t-test with p < 0.05). We then
+ apply a logarithmic transformation to the ranks to downplay the importance of the
+ poorly performing models. Lastly, we invert and normalise the logarithmic ranks to the
+ range [0, 1], so that the best-performing models end up with large rank scores and the
+ worst-performing models with small rank scores.
 
 ## The Benchmark Datasets
 
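To make the added description concrete, here is a minimal, self-contained sketch of that pipeline. The function name `compute_rank_scores`, the input format (one list of bootstrapped scores per model) and the use of scipy's independent one-tailed t-test are illustrative assumptions; the app's own ranking loop lives outside the hunks shown in this commit and may differ in detail.

```python
import numpy as np
from scipy import stats


def compute_rank_scores(bootstrap_scores: dict[str, list[float]]) -> dict[str, float]:
    """Turn bootstrapped scores per model into rank scores in [0, 1] (sketch)."""
    # Sort the models from best to worst mean score
    model_ids = sorted(bootstrap_scores, key=lambda m: -np.mean(bootstrap_scores[m]))

    # Assign ranks, giving a model the same rank as the current rank group when
    # the group's representative does not significantly beat it
    # (one-tailed t-test, p < 0.05)
    ranks: list[int] = []
    rank = 1
    best_scores = bootstrap_scores[model_ids[0]]
    for model_id in model_ids:
        scores = bootstrap_scores[model_id]
        p_value = stats.ttest_ind(best_scores, scores, alternative="greater").pvalue
        if p_value < 0.05:
            rank += 1
            best_scores = scores
        ranks.append(rank)

    # Log-transform the ranks to downplay poorly performing models, normalise by
    # the largest log-rank and invert, so rank 1 maps to a rank score of 1
    log_ranks = np.log(ranks)
    if log_ranks.max() == 0:  # edge case: every model is tied at rank 1
        return {model_id: 1.0 for model_id in model_ids}
    rank_scores = 1 - log_ranks / log_ranks.max()
    return dict(zip(model_ids, rank_scores))
```

With, say, three models where the top two are statistically tied and the third is clearly worse, this yields rank scores of 1.0, 1.0 and 0.0, which the radial plot then scales to percentages and averages over languages, as the later hunks below show.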
@@ -276,8 +279,9 @@ def main() -> None:
 scale=2,
 )
 with gr.Row():
- use_win_ratio_checkbox = gr.Checkbox(
- label="Compare models with win ratios (as opposed to raw scores)",
+ use_rank_score_checkbox = gr.Checkbox(
+ label="Compare models with rank scores (as opposed to raw "
+ "scores)",
 value=True,
 interactive=True,
 scale=1,
@@ -316,7 +320,7 @@ def main() -> None:
 value=produce_radial_plot(
 model_ids_dropdown.value,
 language_names=language_names_dropdown.value,
- use_win_ratio=use_win_ratio_checkbox.value,
+ use_rank_score=use_rank_score_checkbox.value,
 show_scale=show_scale_checkbox.value,
 plot_width=plot_width_slider.value,
 plot_height=plot_height_slider.value,
@@ -346,7 +350,7 @@ def main() -> None:
 inputs=[
 model_ids_dropdown,
 language_names_dropdown,
- use_win_ratio_checkbox,
+ use_rank_score_checkbox,
 show_scale_checkbox,
 plot_width_slider,
 plot_height_slider,
@@ -355,7 +359,7 @@ def main() -> None:
 )
 language_names_dropdown.change(**update_plot_kwargs)
 model_ids_dropdown.change(**update_plot_kwargs)
- use_win_ratio_checkbox.change(**update_plot_kwargs)
+ use_rank_score_checkbox.change(**update_plot_kwargs)
 show_scale_checkbox.change(**update_plot_kwargs)
 plot_width_slider.change(**update_plot_kwargs)
 plot_height_slider.change(**update_plot_kwargs)
@@ -453,7 +457,7 @@ def update_model_ids_dropdown(
 def produce_radial_plot(
 model_ids: list[str],
 language_names: list[str],
- use_win_ratio: bool,
+ use_rank_score: bool,
 show_scale: bool,
 plot_width: int,
 plot_height: int,
@@ -466,8 +470,8 @@ def produce_radial_plot(
 The ids of the models to include in the plot.
 language_names:
 The names of the languages to include in the plot.
- use_win_ratio:
- Whether to use win ratios (as opposed to raw scores).
+ use_rank_score:
+ Whether to use rank scores (as opposed to raw scores).
 show_scale:
 Whether to show the scale on the plot.
 plot_width:
@@ -515,8 +519,8 @@ def produce_radial_plot(
 ]
 
 
- logger.info("Computing win ratios...")
- all_win_ratios: dict[Task, dict[Language, dict[str, float]]] = {
+ logger.info("Computing rank scores...")
+ all_rank_scores: dict[Task, dict[Language, dict[str, float]]] = {
 task: {
 language: dict()
 for language in languages
@@ -546,10 +550,11 @@ def produce_radial_plot(
 best_scores = scores
 ranks.append(rank)
 
- for model_id, rank in zip(model_ids_sorted, ranks):
- pct_models_with_higher_rank = np.mean(np.asarray(ranks) >= rank)
- all_win_ratios[task][language][model_id] = pct_models_with_higher_rank
- logger.info("Successfully computed win ratios.")
+ log_ranks = np.log(ranks)
+ scores = log_ranks / log_ranks.max()
+ for model_id, score in zip(model_ids_sorted, scores):
+ all_rank_scores[task][language][model_id] = 1 - score
+ logger.info("Successfully computed rank scores.")
 
 # Add all the evaluation results for each model
 results: list[list[float]] = list()
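For intuition, the added lines map ranks to scores as in the short sketch below; the rank values are made up for illustration.

```python
import numpy as np

# Hypothetical ranks for four models: the second and third are statistically
# tied, and the fourth is significantly worse again
ranks = [1, 2, 2, 4]

log_ranks = np.log(ranks)             # ≈ [0., 0.693, 0.693, 1.386]
scores = log_ranks / log_ranks.max()  # ≈ [0., 0.5, 0.5, 1.]
print(1 - scores)                     # -> [1.  0.5 0.5 0. ]
```

The log transform means that dropping from rank 1 to rank 2 costs much more rank score than dropping from rank 9 to rank 10, which matches the stated goal of downplaying the poorly performing models.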
@@ -557,7 +562,7 @@ def produce_radial_plot(
 result_list = list()
 for task in tasks:
 
- win_ratios = list()
+ rank_scores = list()
 scores = list()
 for language in languages:
 if model_id not in results_dfs_filtered[language].index:
@@ -565,15 +570,15 @@ def produce_radial_plot(
 
 score_list = results_dfs_filtered[language].loc[model_id][task]
 
- win_ratio = 100 * all_win_ratios[task][language][model_id]
- win_ratios.append(win_ratio)
+ rank_score = 100 * all_rank_scores[task][language][model_id]
+ rank_scores.append(rank_score)
 
 if np.mean(score_list) < 1:
 score_list = [100 * score for score in score_list]
 
 scores.append(np.mean(score_list))
- if use_win_ratio:
- result_list.append(np.mean(win_ratios))
+ if use_rank_score:
+ result_list.append(np.mean(rank_scores))
 else:
 result_list.append(np.mean(scores))
 results.append(result_list)
@@ -616,10 +621,10 @@ def produce_radial_plot(
 languages_str += " and "
 languages_str += languages[-1].name
 
- if use_win_ratio:
- title = f'Win Ratio on on {languages_str} Language Tasks'
+ if use_rank_score:
+ title = f'Rank Score on {languages_str} Language Tasks'
 else:
- title = f'LLM Score on on {languages_str} Language Tasks'
+ title = f'Raw Score on {languages_str} Language Tasks'
 
 # Builds the radial plot from the results
 fig.update_layout(
 