File size: 3,626 Bytes
d1253a8
 
 
 
 
f067bfb
d1253a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f067bfb
d1253a8
 
 
 
 
 
 
 
 
 
f067bfb
 
 
d1253a8
 
 
 
 
 
 
 
 
 
 
 
 
 
f067bfb
 
d1253a8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import json
import glob
from collections import defaultdict
import gradio as gr
from content import *
import glob

ARC = "arc_challenge"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa-mc"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]


def collect_results():
    performance_dict = defaultdict(dict)
    pretrained_models = set()
    for file in glob.glob('evals/*/*.json'):
        with open(file, 'r') as f:
            data = json.load(f)
        if 'results' not in data:
            continue
        if 'config' not in data:
            continue
        results = data['results']
        config = data['config']
        if 'model_args' not in config:
            continue

        model_args = config['model_args'].split(',')
        pretrained = [x for x in model_args if x.startswith('pretrained=')]
        if len(pretrained) != 1:
            continue
        pretrained = pretrained[0].split('=')[1]
        pretrained = pretrained.split('/')[-1]
        pretrained_models.add(pretrained)

        for lang_task, perfs in results.items():
            if lang_task.startswith('arc_') and lang_task.endswith('_challenge'):
                lang = lang_task.split('_')[1]
                task = ARC
            elif lang_task.startswith('hellaswag_'):
                _, lang = lang_task.split('_')
                task = HELLASWAG
            elif lang_task.startswith('mmlu_'):
                _, lang = lang_task.split('_')
                task = MMLU
            elif lang_task.startswith('truthfulqa_') and lang_task.endswith('_mc'):
                lang = lang_task.split('_')[1]
                task = TRUTHFULQA

            if lang and task:
                metric = METRICS[BENCHMARKS.index(task)]
                p = round(perfs[metric] * 100, 1)
                performance_dict[(pretrained, lang)][task] = p
    return performance_dict, pretrained_models


def get_leaderboard_df(performance_dict, pretrained_models):
    df = list()
    for (pretrained, lang), perfs in performance_dict.items():
        arc_perf = perfs.get(ARC, 0.0)
        hellaswag_perf = perfs.get(HELLASWAG, 0.0)
        mmlu_perf = perfs.get(MMLU, 0.0)
        truthfulqa_perf = perfs.get(TRUTHFULQA, 0.0)

        if arc_perf * hellaswag_perf * mmlu_perf * truthfulqa_perf == 0:
            continue
        avg = round((arc_perf + hellaswag_perf + mmlu_perf + truthfulqa_perf) / 4, 1)
        row = [pretrained, lang, avg, arc_perf, hellaswag_perf, mmlu_perf, truthfulqa_perf]
        df.append(row)
    return df


MODEL_COL = "Model"
LANG_COL = "Language"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"
HELLASWAG_COL = "HellaSwag (10-shot)️"
MMLU_COL = "MMLU (5-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"

COLS = [MODEL_COL, LANG_COL, AVERAGE_COL, ARC_COL, HELLASWAG_COL, MMLU_COL, TRUTHFULQA_COL]
TYPES = ["str", "str", "number", "number", "number", "number", "number"]

args = collect_results()
leaderboard_df = get_leaderboard_df(*args)

demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models...", show_label=False, elem_id="search-bar"
        )

        leaderboard_table = gr.components.Dataframe(
            value=leaderboard_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()