|
"""Script to produce radial plots.""" |
|
|
|
from functools import partial |
|
import plotly.graph_objects as go |
|
import json |
|
import numpy as np |
|
from collections import defaultdict |
|
import pandas as pd |
|
from pydantic import BaseModel |
|
import gradio as gr |
|
import requests |
|
import random |
|
import logging |
|
import datetime as dt |
|
import scipy.stats as stats |
|
import itertools as it |
|
|
|
|
|
fmt = "%(asctime)s [%(levelname)s] <%(name)s> %(message)s" |
|
logging.basicConfig(level=logging.INFO, format=fmt) |
|
logger = logging.getLogger("radial_plot_generator") |
|
|
|
|
|
INTRO_MARKDOWN = """ |
|
# Radial Plot Generator |
|
|
|
This demo allows you to generate a radial plot comparing the performance of different |
|
language models on different tasks. It is based on the generative results from the |
|
[ScandEval benchmark](https://scandeval.com). |
|
""" |
|
|
|
|
|
ABOUT_MARKDOWN = """ |
|
## About the ScandEval Benchmark |
|
|
|
The [ScandEval benchmark](https://scandeval.com) is used compare pretrained language |
|
models on tasks in Danish, Swedish, Norwegian Bokmål, Norwegian Nynorsk, Icelandic, |
|
Faroese, German, Dutch and English. The benchmark supports both encoder models (such as |
|
BERT) and generative models (such as GPT), and leaderboards for both kinds [are |
|
available](https://scandeval.com). |
|
|
|
The generative models are evaluated using in-context learning with few-shot prompts. |
|
The few-shot examples are sampled randomly from the training split, and we benchmark |
|
the models 10 times with bootstrapped test sets and different few-shot examples in each |
|
iteration. This allows us to better measure the uncertainty of the results. We use the |
|
uncertainty in the radial plot when we compute the win ratios (i.e., the percentage of |
|
other models that a model beats on a task). Namely, we compute the win ratio as the |
|
percentage of other models that a model _significantly_ beats on a task, where we use a |
|
paired t-test with a significance level of 0.05 to determine whether a model |
|
significantly beats another model. |
|
|
|
## The Benchmark Datasets |
|
|
|
The ScandEval generative benchmark currently covers the languages Danish, Swedish, |
|
Norwegian, Icelandic, German, Dutch and English. For each language, the benchmark |
|
consists of 7 different tasks, each of which consists of 1-2 datasets. The tasks are |
|
the following: |
|
|
|
### Text Classification |
|
Given a piece of text, classify it into a number of classes. For this task we extract |
|
the first token of the possible labels, and choose the label whose first token has the |
|
highest probability. All datasets in this category are currently trinary sentiment |
|
classification datasets. We use the Matthews Correlation Coefficient (MCC) as the |
|
evaluation metric. |
|
|
|
### Information Extraction |
|
Given a piece of text, extract a number of entities from the text. As the model needs |
|
to extract multiple entities, we use [structured |
|
generation](https://github.com/noamgat/lm-format-enforcer) to make the model generate a |
|
JSON dictionary with keys being the entity categories and values being lists of the |
|
identified entities. All datasets in this task are named entity recognition datasets. |
|
We use the micro-averaged F1 score as the evaluation metric, where we ignore the |
|
Miscellaneous category. |
|
|
|
### Grammar |
|
Given a piece of text, determine whether it is grammatically correct or not. All |
|
datasets in this task are built from the dependency treebanks of the languages, where |
|
words are removed or swapped, in a way that makes the sentence ungrammatical. We use |
|
the Matthews Correlation Coefficient (MCC) as the evaluation metric. |
|
|
|
### Question Answering |
|
Given a question and a piece of text, extract the answer to the question from the text. |
|
All datasets in this task are extractive question answering datasets. We use the exact |
|
match (EM) score as the evaluation metric. |
|
|
|
### Summarisation |
|
Given a piece of text, generate a summary of the text. All the datasets come from |
|
either news articles or WikiHow articles. We use the BERTScore metric as the evaluation |
|
metric, where the encoder model used is |
|
[microsoft/mdeberta-v3-base](https://huggingface.co/microsoft/mdeberta-v3-base). |
|
|
|
### Knowledge |
|
Given a trivia-style question with multiple choice answers, choose the correct answer. |
|
As with text classification, we use the probabilities of the answer letter (a, b, c or |
|
d) to choose the answer. The datasets in this task are machine translated versions of |
|
the [MMLU](https://doi.org/10.48550/arXiv.2009.03300) and |
|
[ARC](https://allenai.org/data/arc) datasets. We use the Matthews Correlation |
|
Coefficient (MCC) as the evaluation metric. |
|
|
|
### Reasoning |
|
Given a scenario and multiple possible endings, choose the correct ending. As with text |
|
classification, we use the probabilities of the answer letter (a, b, c or d) to choose |
|
the answer. The datasets in this task are machine translated versions of the |
|
[HellaSwag](https://rowanzellers.com/hellaswag/) dataset. We use the Matthews |
|
Correlation Coefficient (MCC) as the evaluation metric. |
|
|
|
|
|
## Citation |
|
|
|
If you use the ScandEval benchmark in your work, please cite [the |
|
paper](https://aclanthology.org/2023.nodalida-1.20): |
|
|
|
``` |
|
@inproceedings{nielsen2023scandeval, |
|
title={ScandEval: A Benchmark for Scandinavian Natural Language Processing}, |
|
author={Nielsen, Dan}, |
|
booktitle={Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)}, |
|
pages={185--201}, |
|
year={2023} |
|
} |
|
``` |
|
""" |
|
|
|
|
|
UPDATE_FREQUENCY_MINUTES = 30 |
|
MIN_COLOUR_DISTANCE_BETWEEN_MODELS = 200 |
|
|
|
|
|
class Task(BaseModel): |
|
"""Class to hold task information.""" |
|
|
|
name: str |
|
metric: str |
|
|
|
def __hash__(self): |
|
return hash(self.name) |
|
|
|
|
|
class Language(BaseModel): |
|
"""Class to hold language information.""" |
|
|
|
code: str |
|
name: str |
|
|
|
def __hash__(self): |
|
return hash(self.code) |
|
|
|
|
|
class Dataset(BaseModel): |
|
"""Class to hold dataset information.""" |
|
|
|
name: str |
|
language: Language |
|
task: Task |
|
|
|
def __hash__(self): |
|
return hash(self.name) |
|
|
|
|
|
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc") |
|
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc") |
|
GRAMMAR = Task(name="grammar", metric="mcc") |
|
QUESTION_ANSWERING = Task(name="question answering", metric="em") |
|
SUMMARISATION = Task(name="summarisation", metric="bertscore") |
|
KNOWLEDGE = Task(name="knowledge", metric="mcc") |
|
REASONING = Task(name="reasoning", metric="mcc") |
|
ALL_TASKS = [obj for obj in globals().values() if isinstance(obj, Task)] |
|
|
|
|
|
DANISH = Language(code="da", name="Danish") |
|
NORWEGIAN = Language(code="no", name="Norwegian") |
|
SWEDISH = Language(code="sv", name="Swedish") |
|
ICELANDIC = Language(code="is", name="Icelandic") |
|
GERMAN = Language(code="de", name="German") |
|
DUTCH = Language(code="nl", name="Dutch") |
|
ENGLISH = Language(code="en", name="English") |
|
ALL_LANGUAGES = { |
|
obj.name: obj for obj in globals().values() if isinstance(obj, Language) |
|
} |
|
|
|
DATASETS = [ |
|
Dataset(name="swerec", language=SWEDISH, task=TEXT_CLASSIFICATION), |
|
Dataset(name="angry-tweets", language=DANISH, task=TEXT_CLASSIFICATION), |
|
Dataset(name="norec", language=NORWEGIAN, task=TEXT_CLASSIFICATION), |
|
Dataset(name="sb10k", language=GERMAN, task=TEXT_CLASSIFICATION), |
|
Dataset(name="dutch-social", language=DUTCH, task=TEXT_CLASSIFICATION), |
|
Dataset(name="sst5", language=ENGLISH, task=TEXT_CLASSIFICATION), |
|
Dataset(name="suc3", language=SWEDISH, task=INFORMATION_EXTRACTION), |
|
Dataset(name="dansk", language=DANISH, task=INFORMATION_EXTRACTION), |
|
Dataset(name="norne-nb", language=NORWEGIAN, task=INFORMATION_EXTRACTION), |
|
Dataset(name="norne-nn", language=NORWEGIAN, task=INFORMATION_EXTRACTION), |
|
Dataset(name="mim-gold-ner", language=ICELANDIC, task=INFORMATION_EXTRACTION), |
|
Dataset(name="germeval", language=GERMAN, task=INFORMATION_EXTRACTION), |
|
Dataset(name="conll-nl", language=DUTCH, task=INFORMATION_EXTRACTION), |
|
Dataset(name="conll-en", language=ENGLISH, task=INFORMATION_EXTRACTION), |
|
Dataset(name="scala-sv", language=SWEDISH, task=GRAMMAR), |
|
Dataset(name="scala-da", language=DANISH, task=GRAMMAR), |
|
Dataset(name="scala-nb", language=NORWEGIAN, task=GRAMMAR), |
|
Dataset(name="scala-nn", language=NORWEGIAN, task=GRAMMAR), |
|
Dataset(name="scala-is", language=ICELANDIC, task=GRAMMAR), |
|
Dataset(name="scala-de", language=GERMAN, task=GRAMMAR), |
|
Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR), |
|
Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR), |
|
Dataset(name="scandiqa-da", language=DANISH, task=QUESTION_ANSWERING), |
|
Dataset(name="norquad", language=NORWEGIAN, task=QUESTION_ANSWERING), |
|
Dataset(name="scandiqa-sv", language=SWEDISH, task=QUESTION_ANSWERING), |
|
Dataset(name="nqii", language=ICELANDIC, task=QUESTION_ANSWERING), |
|
Dataset(name="germanquad", language=GERMAN, task=QUESTION_ANSWERING), |
|
Dataset(name="squad", language=ENGLISH, task=QUESTION_ANSWERING), |
|
Dataset(name="squad-nl", language=DUTCH, task=QUESTION_ANSWERING), |
|
Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION), |
|
Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION), |
|
Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION), |
|
Dataset(name="no-sammendrag", language=NORWEGIAN, task=SUMMARISATION), |
|
Dataset(name="wiki-lingua-nl", language=DUTCH, task=SUMMARISATION), |
|
Dataset(name="swedn", language=SWEDISH, task=SUMMARISATION), |
|
Dataset(name="cnn-dailymail", language=ENGLISH, task=SUMMARISATION), |
|
Dataset(name="mmlu-da", language=DANISH, task=KNOWLEDGE), |
|
Dataset(name="mmlu-no", language=NORWEGIAN, task=KNOWLEDGE), |
|
Dataset(name="mmlu-sv", language=SWEDISH, task=KNOWLEDGE), |
|
Dataset(name="mmlu-is", language=ICELANDIC, task=KNOWLEDGE), |
|
Dataset(name="mmlu-de", language=GERMAN, task=KNOWLEDGE), |
|
Dataset(name="mmlu-nl", language=DUTCH, task=KNOWLEDGE), |
|
Dataset(name="mmlu", language=ENGLISH, task=KNOWLEDGE), |
|
Dataset(name="arc-da", language=DANISH, task=KNOWLEDGE), |
|
Dataset(name="arc-no", language=NORWEGIAN, task=KNOWLEDGE), |
|
Dataset(name="arc-sv", language=SWEDISH, task=KNOWLEDGE), |
|
Dataset(name="arc-is", language=ICELANDIC, task=KNOWLEDGE), |
|
Dataset(name="arc-de", language=GERMAN, task=KNOWLEDGE), |
|
Dataset(name="arc-nl", language=DUTCH, task=KNOWLEDGE), |
|
Dataset(name="arc", language=ENGLISH, task=KNOWLEDGE), |
|
Dataset(name="hellaswag-da", language=DANISH, task=REASONING), |
|
Dataset(name="hellaswag-no", language=NORWEGIAN, task=REASONING), |
|
Dataset(name="hellaswag-sv", language=SWEDISH, task=REASONING), |
|
Dataset(name="hellaswag-is", language=ICELANDIC, task=REASONING), |
|
Dataset(name="hellaswag-de", language=GERMAN, task=REASONING), |
|
Dataset(name="hellaswag-nl", language=DUTCH, task=REASONING), |
|
Dataset(name="hellaswag", language=ENGLISH, task=REASONING), |
|
] |
|
|
|
|
|
def main() -> None: |
|
"""Produce a radial plot.""" |
|
|
|
global last_fetch |
|
results_dfs = fetch_results() |
|
last_fetch = dt.datetime.now() |
|
|
|
all_languages = [language.name for language in ALL_LANGUAGES.values()] |
|
danish_models = list({model_id for model_id in results_dfs[DANISH].index}) |
|
|
|
|
|
all_models = list( |
|
{model_id for df in results_dfs.values() for model_id in df.index} |
|
) |
|
colour_mapping: dict[str, tuple[int, int, int]] = dict() |
|
|
|
for i in it.count(): |
|
min_colour_distance = MIN_COLOUR_DISTANCE_BETWEEN_MODELS - i |
|
|
|
if i > 0: |
|
logger.info( |
|
f"All retries failed. Trying again with min colour distance " |
|
f"{min_colour_distance}." |
|
) |
|
|
|
random.seed(4242 + i) |
|
retries_left = 10 * len(all_models) |
|
for model_id in all_models: |
|
r, g, b = 0, 0, 0 |
|
too_bright, similar_to_other_model = True, True |
|
while (too_bright or similar_to_other_model) and retries_left > 0: |
|
r, g, b = tuple(random.randint(0, 255) for _ in range(3)) |
|
too_bright = np.min([r, g, b]) > 200 |
|
similar_to_other_model = any( |
|
np.abs( |
|
np.array(colour) - np.array([r, g, b]) |
|
).sum() < min_colour_distance |
|
for colour in colour_mapping.values() |
|
) |
|
retries_left -= 1 |
|
logger.info(f"Retries left to find a colour mapping: {retries_left}") |
|
colour_mapping[model_id] = (r, g, b) |
|
|
|
if retries_left: |
|
logger.info( |
|
f"Successfully found a colour mapping with min colour distance " |
|
f"{min_colour_distance}." |
|
) |
|
break |
|
|
|
with gr.Blocks(theme=gr.themes.Monochrome()) as demo: |
|
gr.Markdown(INTRO_MARKDOWN) |
|
|
|
with gr.Tab(label="Build a Radial Plot"): |
|
with gr.Column(): |
|
with gr.Row(): |
|
language_names_dropdown = gr.Dropdown( |
|
choices=all_languages, |
|
multiselect=True, |
|
label="Languages", |
|
value=["Danish"], |
|
interactive=True, |
|
scale=2, |
|
) |
|
model_ids_dropdown = gr.Dropdown( |
|
choices=danish_models, |
|
multiselect=True, |
|
label="Models", |
|
value=["gpt-4-0613", "mistralai/Mistral-7B-v0.1"], |
|
interactive=True, |
|
scale=2, |
|
) |
|
with gr.Row(): |
|
use_win_ratio_checkbox = gr.Checkbox( |
|
label="Compare models with win ratios (as opposed to raw scores)", |
|
value=True, |
|
interactive=True, |
|
scale=1, |
|
) |
|
show_scale_checkbox = gr.Checkbox( |
|
label="Show the scale on the plot (always 0-100)", |
|
value=False, |
|
interactive=True, |
|
scale=1, |
|
) |
|
plot_width_slider = gr.Slider( |
|
label="Plot width", |
|
minimum=600, |
|
maximum=1000, |
|
step=10, |
|
value=800, |
|
interactive=True, |
|
scale=1, |
|
) |
|
plot_height_slider = gr.Slider( |
|
label="Plot height", |
|
minimum=300, |
|
maximum=700, |
|
step=10, |
|
value=500, |
|
interactive=True, |
|
scale=1, |
|
) |
|
with gr.Row(): |
|
plot = gr.Plot( |
|
value=produce_radial_plot( |
|
model_ids_dropdown.value, |
|
language_names=language_names_dropdown.value, |
|
use_win_ratio=use_win_ratio_checkbox.value, |
|
show_scale=show_scale_checkbox.value, |
|
plot_width=plot_width_slider.value, |
|
plot_height=plot_height_slider.value, |
|
colour_mapping=colour_mapping, |
|
results_dfs=results_dfs, |
|
), |
|
) |
|
with gr.Tab(label="About"): |
|
gr.Markdown(ABOUT_MARKDOWN) |
|
|
|
gr.Markdown( |
|
"<center>Made with ❤️ by the <a href=\"https://alexandra.dk\">" |
|
"Alexandra Institute</a>.</center>" |
|
) |
|
|
|
language_names_dropdown.change( |
|
fn=partial(update_model_ids_dropdown, results_dfs=results_dfs), |
|
inputs=[language_names_dropdown, model_ids_dropdown], |
|
outputs=model_ids_dropdown, |
|
) |
|
|
|
|
|
update_plot_kwargs = dict( |
|
fn=partial( |
|
produce_radial_plot, |
|
colour_mapping=colour_mapping, |
|
results_dfs=results_dfs, |
|
), |
|
inputs=[ |
|
model_ids_dropdown, |
|
language_names_dropdown, |
|
use_win_ratio_checkbox, |
|
show_scale_checkbox, |
|
plot_width_slider, |
|
plot_height_slider, |
|
], |
|
outputs=plot, |
|
) |
|
language_names_dropdown.change(**update_plot_kwargs) |
|
model_ids_dropdown.change(**update_plot_kwargs) |
|
use_win_ratio_checkbox.change(**update_plot_kwargs) |
|
show_scale_checkbox.change(**update_plot_kwargs) |
|
plot_width_slider.change(**update_plot_kwargs) |
|
plot_height_slider.change(**update_plot_kwargs) |
|
|
|
demo.launch() |
|
|
|
|
|
def update_model_ids_dropdown( |
|
language_names: list[str], |
|
model_ids: list[str], |
|
results_dfs: dict[Language, pd.DataFrame] | None, |
|
) -> dict: |
|
"""When the language names are updated, update the model ids dropdown. |
|
|
|
Args: |
|
language_names: |
|
The names of the languages to include in the plot. |
|
model_ids: |
|
The ids of the models to include in the plot. |
|
results_dfs: |
|
The results dataframes for each language. |
|
|
|
Returns: |
|
The Gradio update to the model ids dropdown. |
|
""" |
|
global last_fetch |
|
minutes_since_last_fetch = (dt.datetime.now() - last_fetch).total_seconds() / 60 |
|
if minutes_since_last_fetch > UPDATE_FREQUENCY_MINUTES: |
|
results_dfs = fetch_results() |
|
last_fetch = dt.datetime.now() |
|
|
|
if results_dfs is None or len(language_names) == 0: |
|
if results_dfs is None: |
|
logger.info("No results fetched yet. Resetting model ids dropdown.") |
|
else: |
|
logger.info("No languages selected. Resetting model ids dropdown.") |
|
return gr.update(choices=[], value=[]) |
|
|
|
tasks = [ |
|
task |
|
for task in ALL_TASKS |
|
if all( |
|
task in df.columns |
|
for language, df in results_dfs.items() |
|
if language.name in language_names |
|
) |
|
] |
|
|
|
filtered_results_dfs = { |
|
language: df[tasks] |
|
for language, df in results_dfs.items() |
|
if language.name in language_names |
|
} |
|
|
|
unique_models = { |
|
model_id |
|
for df in filtered_results_dfs.values() |
|
for model_id in df.index |
|
} |
|
|
|
filtered_models = [ |
|
model_id |
|
for model_id in unique_models |
|
if all(model_id in df.index for df in filtered_results_dfs.values()) |
|
] |
|
|
|
if len(filtered_models) == 0: |
|
logger.info( |
|
"No valid models for the selected languages. Resetting model ids dropdown." |
|
) |
|
return gr.update(choices=[], value=[]) |
|
|
|
valid_selected_models = [ |
|
model_id for model_id in model_ids if model_id in filtered_models |
|
] |
|
if not valid_selected_models: |
|
if len(filtered_models) > 1: |
|
valid_selected_models = random.sample(filtered_models, k=2) |
|
elif len(filtered_models) == 1: |
|
valid_selected_models = random.sample(filtered_models, k=1) |
|
|
|
logger.info( |
|
f"Updated model ids dropdown with {len(filtered_models):,} valid models for " |
|
f"the selected languages, with {valid_selected_models} selected." |
|
) |
|
|
|
return gr.update(choices=filtered_models, value=valid_selected_models) |
|
|
|
|
|
def produce_radial_plot( |
|
model_ids: list[str], |
|
language_names: list[str], |
|
use_win_ratio: bool, |
|
show_scale: bool, |
|
plot_width: int, |
|
plot_height: int, |
|
colour_mapping: dict[str, tuple[int, int, int]], |
|
results_dfs: dict[Language, pd.DataFrame] | None, |
|
) -> go.Figure: |
|
"""Produce a radial plot as a plotly figure. |
|
|
|
Args: |
|
model_ids: |
|
The ids of the models to include in the plot. |
|
language_names: |
|
The names of the languages to include in the plot. |
|
use_win_ratio: |
|
Whether to use win ratios (as opposed to raw scores). |
|
show_scale: |
|
Whether to show the scale on the plot. |
|
plot_width: |
|
The width of the plot. |
|
plot_height: |
|
The height of the plot. |
|
colour_mapping: |
|
A mapping from model ids to RGB triplets. |
|
results_dfs: |
|
The results dataframes for each language. |
|
|
|
Returns: |
|
A plotly figure. |
|
""" |
|
global last_fetch |
|
minutes_since_last_fetch = (dt.datetime.now() - last_fetch).total_seconds() / 60 |
|
if minutes_since_last_fetch > UPDATE_FREQUENCY_MINUTES: |
|
results_dfs = fetch_results() |
|
last_fetch = dt.datetime.now() |
|
|
|
if results_dfs is None or len(language_names) == 0 or len(model_ids) == 0: |
|
if results_dfs is None: |
|
logger.info("No results fetched yet. Resetting plot.") |
|
elif len(language_names) == 0: |
|
logger.info("No languages selected. Resetting plot.") |
|
else: |
|
logger.info("No models selected. Resetting plot.") |
|
return go.Figure() |
|
|
|
logger.info( |
|
f"Producing radial plot for models {model_ids!r} on languages " |
|
f"{language_names!r}..." |
|
) |
|
|
|
languages = [ALL_LANGUAGES[language_name] for language_name in language_names] |
|
|
|
results_dfs_filtered = { |
|
language: df |
|
for language, df in results_dfs.items() |
|
if language.name in language_names |
|
} |
|
|
|
tasks = [ |
|
task |
|
for task in ALL_TASKS |
|
if all(task in df.columns for df in results_dfs_filtered.values()) |
|
] |
|
|
|
|
|
results: list[list[float]] = list() |
|
for model_id in model_ids: |
|
result_list = list() |
|
for task in tasks: |
|
win_ratios = list() |
|
scores = list() |
|
for language in languages: |
|
if model_id not in results_dfs_filtered[language].index: |
|
continue |
|
score_list = results_dfs_filtered[language].loc[model_id][task] |
|
win_ratio = 100 * np.mean([ |
|
stats.ttest_rel( |
|
a=score_list, b=other_scores, alternative="greater" |
|
).pvalue < 0.05 |
|
for other_scores in results_dfs_filtered[language][task].dropna().drop(index=model_id) |
|
]) |
|
win_ratios.append(win_ratio) |
|
|
|
if all(score < 1 for score in score_list): |
|
score_list = [100 * score for score in score_list] |
|
|
|
scores.append(np.mean(score_list)) |
|
if use_win_ratio: |
|
result_list.append(np.mean(win_ratios)) |
|
else: |
|
result_list.append(np.mean(scores)) |
|
results.append(result_list) |
|
|
|
|
|
|
|
result_matrix = np.array(results) |
|
num_models = result_matrix.shape[0] |
|
num_tasks = result_matrix.shape[1] |
|
num_models_beaten = np.zeros((num_models, num_tasks)) |
|
for i in range(num_models): |
|
for j in range(num_tasks): |
|
num_models_beaten[i, j] = np.sum( |
|
result_matrix[i, j] > result_matrix[:, j] |
|
) |
|
|
|
|
|
|
|
|
|
sorted_idxs = num_models_beaten.sum(axis=1).argsort()[::-1] |
|
model_ids = np.asarray(model_ids)[sorted_idxs].tolist() |
|
results = result_matrix[sorted_idxs].tolist() |
|
|
|
|
|
fig = go.Figure() |
|
for model_id, result_list in zip(model_ids, results): |
|
r, g, b = colour_mapping[model_id] |
|
fig.add_trace(go.Scatterpolar( |
|
r=result_list, |
|
theta=[task.name for task in tasks], |
|
name=model_id, |
|
fill='toself', |
|
fillcolor=f'rgba({r}, {g}, {b}, 0.6)', |
|
line=dict(color=f'rgb({r}, {g}, {b})'), |
|
)) |
|
|
|
languages_str = "" |
|
if len(languages) > 1: |
|
languages_str = ", ".join([language.name for language in languages[:-1]]) |
|
languages_str += " and " |
|
languages_str += languages[-1].name |
|
|
|
if use_win_ratio: |
|
title = f'Win Ratio on on {languages_str} Language Tasks' |
|
else: |
|
title = f'LLM Score on on {languages_str} Language Tasks' |
|
|
|
|
|
fig.update_layout( |
|
polar=dict(radialaxis=dict(visible=show_scale, range=[0, 100])), |
|
showlegend=True, |
|
title=title, |
|
width=plot_width, |
|
height=plot_height, |
|
) |
|
|
|
logger.info("Successfully produced radial plot.") |
|
|
|
return fig |
|
|
|
def fetch_results() -> dict[Language, pd.DataFrame]: |
|
"""Fetch the results from the ScandEval benchmark. |
|
|
|
Returns: |
|
A dictionary of languages -> results-dataframes, whose indices are the |
|
models and columns are the tasks. |
|
""" |
|
logger.info("Fetching results from ScandEval benchmark...") |
|
|
|
response = requests.get( |
|
"https://www.scandeval.com/scandeval_benchmark_results.jsonl" |
|
) |
|
response.raise_for_status() |
|
records = [ |
|
json.loads(dct_str) |
|
for dct_str in response.text.split("\n") |
|
if dct_str.strip("\n") |
|
] |
|
|
|
|
|
|
|
results_dfs = dict() |
|
for language in {dataset.language for dataset in DATASETS}: |
|
possible_dataset_names = { |
|
dataset.name for dataset in DATASETS if dataset.language == language |
|
} |
|
data_dict = defaultdict(dict) |
|
for record in records: |
|
model_name = record["model"] |
|
dataset_name = record["dataset"] |
|
if dataset_name in possible_dataset_names: |
|
dataset = next( |
|
dataset for dataset in DATASETS if dataset.name == dataset_name |
|
) |
|
scores = [ |
|
test_score_dict.get( |
|
f"test_{dataset.task.metric}", |
|
test_score_dict.get(dataset.task.metric) |
|
) |
|
for test_score_dict in record["results"]["raw"]["test"] |
|
] |
|
if dataset.task in data_dict[model_name]: |
|
data_dict[model_name][dataset.task].append(scores) |
|
else: |
|
data_dict[model_name][dataset.task] = [scores] |
|
results_df = pd.DataFrame(data_dict).T.map( |
|
lambda lists_or_nan: |
|
list(it.chain(lists_or_nan)) |
|
if lists_or_nan == lists_or_nan |
|
else lists_or_nan |
|
).dropna().map(lambda lst: lst[0]) |
|
results_dfs[language] = results_df |
|
|
|
logger.info("Successfully fetched results from ScandEval benchmark.") |
|
|
|
return results_dfs |
|
|
|
if __name__ == "__main__": |
|
main() |
|
|