import json
from collections import defaultdict
from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path
import numpy as np
import pandas as pd
import gradio as gr
from pandas import DataFrame
from pandas.io.formats.style import Styler
from content import *
# Benchmark identifiers; these are the task-name prefixes expected in the
# result keys of the evaluation files (e.g. "arc_nl").
ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]
# Metric read for each benchmark. Index-aligned with BENCHMARKS: the metric for
# BENCHMARKS[i] is METRICS[i], so keep both lists in the same order.
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

# Column headers used for the leaderboard table.
MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
# NOTE(review): this label appears to end with an invisible character after
# the closing parenthesis; it is preserved as-is -- confirm whether intended.
HELLASWAG_COL = "HellaSwag (10-shot)️"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"
@dataclass
class Result:
    """
    Benchmark scores and metadata for a single model.

    The four benchmark scores default to 0.0 when not given.
    """
    train_type: str
    num_parameters: int
    arc: float = 0.0
    hellaswag: float = 0.0
    mmlu: float = 0.0
    truthfulqa: float = 0.0

    @cached_property
    def num_parameters_kmb(self) -> str:
        """
        :return: the parameter count formatted by `convert_number_to_kmb`
        """
        return convert_number_to_kmb(self.num_parameters)

    @cached_property
    def average(self) -> float:
        """
        :return: the arithmetic mean of the four benchmark scores
        """
        # Parenthesise the sum: without the parentheses only `truthfulqa` is
        # divided by 4 because `/` binds tighter than `+`.
        return (self.arc + self.hellaswag + self.mmlu + self.truthfulqa) / 4
def convert_number_to_kmb(number: int) -> str:
    """
    Abbreviates a number with a K (thousands), M (millions) or B (billions) suffix
    :param number: the number to abbreviate
    :return: the scaled number rounded to one decimal followed by its suffix, e.g. "7.0B"; numbers below
    1000 are returned as a plain string without suffix
    """
    # Ordered from largest to smallest so the first matching scale wins.
    scales = (
        (1_000_000_000, "B"),
        (1_000_000, "M"),
        (1_000, "K"),
    )
    for threshold, suffix in scales:
        if number >= threshold:
            scaled = round(number / threshold, 1)
            return f"{scaled}{suffix}"
    return str(number)
def collect_results() -> dict[tuple[str, str], dict[str, float]]:
    """
    Collects results from the evals folder and returns a dictionary of results.

    Every ``*.json`` file below ``evals`` (relative to the current working directory) is read. Files that are
    not result files are skipped: those whose top level is not a JSON object, those lacking a "results" or
    "config" entry, and those whose config "model_args" does not contain exactly one ``pretrained=...`` item.
    :return: a dictionary of results where the keys are tuples of (model_name, language) and the values are
    dictionaries of the form {benchmark_name: performance_score}. Scores are the benchmark's metric multiplied
    by 100 and rounded to one decimal. The model name is the last "/"-separated part of the pretrained path
    :raises ValueError: if a result key is not of the form "<benchmark>_<language>" or names an unknown benchmark
    """
    performance_dict = defaultdict(dict)
    for pfin in Path("evals").rglob("*.json"):
        data = json.loads(pfin.read_text(encoding="utf-8"))
        # The folder can also hold JSON files that are not evaluation output; skip them.
        if not isinstance(data, dict) or "results" not in data or "config" not in data:
            continue

        config = data["config"]
        if "model_args" not in config:
            continue

        pretrained_args = [arg for arg in config["model_args"].split(",") if arg.startswith("pretrained=")]
        if len(pretrained_args) != 1:
            continue
        # Strip only the leading key so a "=" inside the path is not cut off,
        # then keep the final path component as the model name.
        model_name = pretrained_args[0].removeprefix("pretrained=").split("/")[-1]

        for lang_task, perfs in data["results"].items():
            parts = lang_task.split("_")
            if len(parts) != 2:
                raise ValueError(
                    f"Unexpected result key {lang_task!r} in {pfin}: expected '<benchmark>_<language>'"
                )
            task, lang = parts
            # Explicit check instead of `assert`, which is removed when Python runs with -O.
            if task not in BENCHMARKS:
                raise ValueError(
                    f"Unknown benchmark {task!r} in {pfin}: expected one of {BENCHMARKS}"
                )
            if not lang:
                continue

            metric = METRICS[BENCHMARKS.index(task)]
            performance_dict[(model_name, lang)][task] = round(perfs[metric] * 100, 1)

    return dict(performance_dict)
def build_performance_df(performance_dict: dict[tuple[str, str], dict[str, float]]) -> DataFrame:
    """
    Turns the collected scores into a leaderboard table
    :param performance_dict: a dictionary of results where the keys are tuples of (model_name, language) and the
    values are dictionaries of the form {benchmark_name: performance_score}
    :return: a pd.DataFrame with one row per dictionary entry and the columns given by COLS (model, training type,
    average and the four benchmark scores), sorted by the average in descending order
    """
    # Model name -> training type, read from the file next to this module.
    info_path = Path(__file__).parent / "evals/dutch_models.json"
    training_types = json.loads(info_path.read_text(encoding="utf-8"))

    def _as_row(model_name: str, scores: dict[str, float]) -> list:
        # A benchmark without a score counts as 0.0, also in the average.
        per_benchmark = [scores.get(benchmark, 0.0) for benchmark in (ARC, HELLASWAG, MMLU, TRUTHFULQA)]
        mean_score = round(sum(per_benchmark) / 4, 1)
        # Models missing from the training-info file are labelled "NA".
        return [model_name, training_types.get(model_name, "NA"), mean_score, *per_benchmark]

    rows = [_as_row(model_name, scores) for (model_name, _lang), scores in performance_dict.items()]
    table = pd.DataFrame.from_records(rows, columns=COLS)
    return table.sort_values(by=[AVERAGE_COL], ascending=False)
def style_df(df: DataFrame) -> Styler:
"""
Styles the dataframe by rounding to two decimals and putting the max value in bold per column
:param df: the dataframe to style
:return: the Styler
"""
styler = df.style.format("{:.2f}", subset=df.columns[2:])
def highlight_max(col):
return np.where(col == np.nanmax(col.to_numpy()), "font-weight: bold;", None)
styler = styler.apply(highlight_max, axis=1, subset=df.columns[2:])
styler = styler.hide()
return styler
# Column headers used for the leaderboard table.
# NOTE(review): these labels repeat the definitions earlier in this module;
# they are kept so this section stays self-contained -- consider keeping a
# single copy.
MODEL_COL = "Model"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
# NOTE(review): this label appears to end with an invisible character after
# the closing parenthesis; it is preserved as-is -- confirm whether intended.
HELLASWAG_COL = "HellaSwag (10-shot)️"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
TRAIN_TYPE_COL = "Training type"
NUM_PARAMETERS = "Num. parameters"

# Column order of the leaderboard table.
COLS = [MODEL_COL, TRAIN_TYPE_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
# One datatype per entry in COLS: the model name and training type are text,
# the average and the four benchmark scores are numeric.
TYPES = ["str", "str", "number", "number", "number", "number", "number"]
# Build the leaderboard data once, when the module is loaded.
results = collect_results()
original_df = build_performance_df(results)
styled_df = style_df(original_df)

# TITLE, INTRO_TEXT, CREDIT and CITATION are not defined in this file;
# presumably they come from `from content import *` -- confirm.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT)

    gr.Markdown("## Leaderboard\nOnly representative for the Dutch version (`*_nl`) of the benchmarks!")
    gr.components.Dataframe(
        value=original_df,
        headers=COLS,
        datatype=TYPES,
        elem_id="leaderboard-table",
    )
    # NOTE(review): this literal was broken across physical lines (a syntax
    # error); it has been rejoined as plain text. If the labels originally
    # carried inline markup (e.g. code formatting), restore it here.
    gr.Markdown(
        "Training type: PT: pretrained on only/mostly Dutch; FT: **only** finetuned on"
        " Dutch; NA: not specifically pretrained nor finetuned on Dutch but Dutch data may have been a (small)"
        " portion of the training data"
    )

    # LaTeX export of the styled table.
    gr.Markdown("## LaTeX")
    gr.Code(styled_df.to_latex(convert_css=True))

    gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

if __name__ == '__main__':
    demo.launch()