Clémentine committed · commit 84b5dfa · 1 parent: 4b2522c

POC, v0

Files changed:
- app.py (+75 -4)
- requirements.txt (+2 -1)
- src/leaderboards/get_from_hub.py (+16 -5)
- src/leaderboards/saved.py (+1 -1)
- src/static/about.py (+29 -26)
- src/static/display.py (+22 -0)
- src/static/tag_info.py (+157 -0)
app.py
CHANGED
@@ -1,31 +1,102 @@
 import gradio as gr
 from apscheduler.schedulers.background import BackgroundScheduler
 from src.static.env import API, REPO_ID, HF_TOKEN
-from src.static.about import TITLE, INTRO, ABOUT
+from src.static.about import TITLE, INTRO, ABOUT, DOCUMENTATION
 
 from src.leaderboards.get_from_hub import get_leaderboard_info
+from src.static.tag_info import *
+from src.static.display import make_clickable
 
 
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
 
-
+LEADERBOARDS_TO_INFO, INFO_TO_LEADERBOARDS = get_leaderboard_info()
+
+def update_leaderboards(show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags):
+    spaces_of_interest = []
+    if show_all:
+        spaces_of_interest = INFO_TO_LEADERBOARDS["all"]
+    else:
+        for tag in modality_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["modality"][tag.lower()])
+        for tag in submission_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["submission"][tag.lower()])
+        for tag in test_set_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["test"][tag.lower()])
+        for tag in evaluation_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["eval"][tag.lower()])
+        for tag in language_tags:
+            spaces_of_interest.extend(INFO_TO_LEADERBOARDS["language"][tag.lower()])
+
+    return "- " + "\n - ".join([
+        make_clickable(space) +
+        f"\n*Tags: {', '.join(LEADERBOARDS_TO_INFO[space])}*"
+        for space in spaces_of_interest
+    ])
+
 
 
 demo = gr.Blocks()
 with demo:
-    gr.
+    gr.Markdown(TITLE)
     gr.Markdown(INTRO, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("Search"):
             gr.Markdown("Let's look for leaderboards relevant for you! Select the categories of your choice")
-
+            with gr.Row():
+                with gr.Column():
+                    show_all = gr.Checkbox(
+                        value=False,
+                        label="Show all leaderboards"
+                    )
 
+                    modality_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in Modality],
+                        value=[],
+                        label="Modality of choice"
+                    )
+                    submission_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in SubmissionType],
+                        value=[],
+                        label="Submission type"
+                    )
+                    test_set_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in TestSetStatus],
+                        value=[],
+                        label="Test set status"
+                    )
+                with gr.Column():
+                    evaluation_tags = gr.CheckboxGroup(
+                        choices=[tag.name for tag in EvaluationCategory],
+                        value=[],
+                        label="Specific evaluation categories"
+                    )
+                    language_tags = gr.CheckboxGroup(
+                        choices=[tag.capitalize() for tag in sorted(list(INFO_TO_LEADERBOARDS["language"].keys()))],
+                        value=[],
+                        label="Specific languages"
+                    )
+            with gr.Row():
+                leaderboards = gr.Markdown(
+                    value="",
+                )
+
+            for selector in [show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags]:
+                selector.change(
+                    update_leaderboards,
+                    [show_all, modality_tags, submission_tags, test_set_tags, evaluation_tags, language_tags],
+                    leaderboards,
+                    queue=True,
+                )
 
         with gr.TabItem("About"):
             gr.Markdown(ABOUT, elem_classes="markdown-text")
 
+        with gr.TabItem("Documentation"):
+            gr.Markdown(DOCUMENTATION, elem_classes="markdown-text")
+
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=10800) # restarted every 3h
 scheduler.start()
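The Search tab wires its six inputs to `update_leaderboards`: every checkbox change re-runs the callback with all current selections, each selected tag pulls the matching Space ids out of `INFO_TO_LEADERBOARDS`, and the result is rendered as a Markdown list of links with their tags. Below is a minimal sketch of that flow outside Gradio; the two dictionaries are hypothetical sample data in the shape app.py expects from `get_leaderboard_info()`.

```python
# Standalone sketch of the update_leaderboards flow (sample data, not real leaderboards).
INFO_TO_LEADERBOARDS = {
    "all": ["org-a/leaderboard", "org-b/arena"],
    "modality": {"text": ["org-a/leaderboard", "org-b/arena"]},
    "submission": {"automatic": ["org-a/leaderboard"]},
    "test": {"private": ["org-a/leaderboard"]},
    "eval": {},
    "language": {"english": ["org-a/leaderboard"]},
}
LEADERBOARDS_TO_INFO = {
    "org-a/leaderboard": ["modality:text", "submission:automatic", "test:private"],
    "org-b/arena": ["judge:humans", "modality:text"],
}

def make_clickable(space: str) -> str:
    # Same idea as src/static/display.py: link the Space id to its page on the Hub.
    return f'<a href="https://huggingface.co/{space}">{space}</a>'

# Ticking the "text" modality checkbox selects both sample entries ...
spaces_of_interest = INFO_TO_LEADERBOARDS["modality"]["text"]
# ... and the callback returns one Markdown bullet per Space, with its tags underneath.
print("- " + "\n - ".join(
    make_clickable(space) + f"\n*Tags: {', '.join(LEADERBOARDS_TO_INFO[space])}*"
    for space in spaces_of_interest
))
```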
requirements.txt
CHANGED
@@ -1 +1,2 @@
-huggingface_hub
+huggingface_hub
+apscheduler
src/leaderboards/get_from_hub.py
CHANGED
@@ -45,22 +45,33 @@ def get_leaderboard_info() -> tuple[list, dict]:
     saved_leaderboards = [(k, v) for k, v in leaderboard_to_tags.items()]
 
     seen_leaderboards = []
-
+    leaderboard_to_info = defaultdict(list)
     info_to_leaderboard = defaultdict(lambda: defaultdict(list))
     for name, tags in leaderboards + arenas + saved_leaderboards:
+        # If we have a duplicate between the leaderboards from the hub (leaderboards, arenas)
+        # and the ones we saved manually, we use the version from the hub
         if name in seen_leaderboards:
             continue
 
         seen_leaderboards.append(name)
 
+        # If the leaderboard has its own tags on top of the ones we saved, we aggregate them
         if name in leaderboard_to_tags:
             tags += leaderboard_to_tags[name]
 
         grouped_tags = group_all_tags(tags)
-        current_info = grouped_tags
-        current_info["name"] = name
-        leaderboard_df.append(current_info)
         for category, tags in grouped_tags.items():
             for tag in tags:
                 info_to_leaderboard[category][tag].append(name)
-
+                leaderboard_to_info[name].append(f"{category}:{tag}")
+
+    # Deduplicate and sort everything
+    for leaderboard, tags in leaderboard_to_info.items():
+        leaderboard_to_info[leaderboard] = sorted(list(set(tags)))
+
+    for category, category_dict in info_to_leaderboard.items():
+        for tag, space_list in category_dict.items():
+            info_to_leaderboard[category][tag] = sorted(list(set(space_list)))
+
+    info_to_leaderboard["all"] = sorted(list(set(seen_leaderboards)))
+    return leaderboard_to_info, info_to_leaderboard
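The updated function now returns the mapping in both directions: `leaderboard_to_info` (Space id → sorted list of `category:value` strings) and `info_to_leaderboard` (category → tag → Space ids), both deduplicated. Here is a toy sketch of the grouping and dedup steps; `group_all_tags` is assumed to bucket `category:value` strings by category, since its real definition sits earlier in get_from_hub.py and is not shown in this hunk.

```python
from collections import defaultdict

def group_all_tags(tags):
    # Assumed behaviour for this sketch: bucket "category:value" strings by category.
    grouped = defaultdict(list)
    for tag in tags:
        if ":" in tag:
            category, value = tag.split(":", 1)
            grouped[category].append(value)
    return grouped

leaderboard_to_info = defaultdict(list)
info_to_leaderboard = defaultdict(lambda: defaultdict(list))

# Toy input with a duplicated tag, as can happen when hub tags and manually saved tags are concatenated.
entries = [("org-a/leaderboard", ["modality:text", "modality:text", "submission:automatic"])]
for name, tags in entries:
    for category, values in group_all_tags(tags).items():
        for value in values:
            info_to_leaderboard[category][value].append(name)
            leaderboard_to_info[name].append(f"{category}:{value}")

# Deduplicate and sort, as the new code does before returning.
for name, tags in leaderboard_to_info.items():
    leaderboard_to_info[name] = sorted(set(tags))

print(dict(leaderboard_to_info))
# {'org-a/leaderboard': ['modality:text', 'submission:automatic']}
```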
src/leaderboards/saved.py
CHANGED
@@ -11,7 +11,7 @@ leaderboard_to_tags = {
     "mteb/leaderboard": ["submission:semiautomatic", "modality:text", "Embeddings", "modality:artefacts"],
     "gaia-benchmark/leaderboard": ["submission:automatic", "test:private", "judge:auto", "modality:text", "modality:tools", "modality:text", "modality:image", "modality:video"],
     "opencompass/opencompass-llm-leaderboard": ["submission:manual", "modality:text", "language:chinese"],
-    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", ],
+    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", "language:korean"],
     "BramVanroy/open_dutch_llm_leaderboard": ["submission:manual", "judge:auto", "modality:text", "language:dutch"],
     "vectara/leaderboard": ["submission:semiautomatic", "judge:model", "modality:text", "Hallucinations"],
     "facebook/CyberSecEval": ["submission:closed", "eval:code", "eval:safety"],
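Each hand-curated entry maps a Space id to a list of `category:value` tag strings, plus a few free-form labels such as "Embeddings" or "Hallucinations" that carry no category prefix. A hypothetical helper (not part of this commit) that separates the two kinds of tag:

```python
# Hypothetical helper, not in the repo: split structured tags from free-form labels.
KNOWN_CATEGORIES = {"submission", "test", "judge", "modality", "eval", "language"}

def split_tag(tag: str):
    """Return (category, value) for structured tags, or (None, tag) for free-form labels."""
    if ":" in tag:
        category, value = tag.split(":", 1)
        if category in KNOWN_CATEGORIES:
            return category, value
    return None, tag

print(split_tag("submission:automatic"))  # ('submission', 'automatic')
print(split_tag("Embeddings"))            # (None, 'Embeddings')
```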
src/static/about.py
CHANGED
@@ -1,10 +1,12 @@
+from src.static.tag_info import *
+
 TITLE = "# Leaderboard explorer"
 
 INTRO = """
 Have you ever wondered which leaderboard would be best for your use case?
 """
 
-ABOUT = """
+ABOUT = ("""
 If you want your leaderboard to appear in our suggestions, feel free to add relevant information in its tag metadata, and it will be displayed here.
 
 # First step
@@ -21,44 +23,45 @@ tags:
 ## Submission type
 Arenas are not concerned by this category.
 
-
-
-
-- `submission:closed`: the leaderboard does not accept submissions at the moment
-
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in SubmissionType]) +
+"""
 ## Test set status
 Arenas are not concerned by this category.
 
-
-
-
-- `test:rolling`: the test sets used change regularly through time and evaluation scores are refreshed
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in TestSetStatus]) +
+"""
 
 ## Judges
-
-
-
-
+
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in Judge]) +
+"""
 
 ## Modalities
 Can be any (or several) of the following list:
-
-
-
-
-A bit outside of usual modalities
-- `modality:tools`: requires added tool usage - mostly for assistant models
-- `modality:artefacts`: the leaderboard concerns itself with machine learning artefacts as themselves, for example, quality evaluation of text embeddings.
+
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in Modality]) +
+"""
 
 ## Evaluation categories
 Can be any (or several) of the following list:
-
-
-
-
-- `eval:safety`: safety, toxicity, bias evaluations
+
+""" +
+"\n".join([f"- {s.value.key}: {s.value.usage}" for s in EvaluationCategory]) +
+"""
 
 ## Language
 You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
 At the moment, we do not support language codes, please use the language name in English.
+""")
+
+DOCUMENTATION = """
+How to create your own leaderboard?
+
+I'll make an updated documentation page here at some point, but for now, you can check our [demo leaderboard org](https://huggingface.co/demo-leaderboard-backend)!
+
+You just need to duplicate the front space (and backend if you want to run your leaderboard on spaces compute), copy the datasets to your own org, and edit the env variables.
 """
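`ABOUT` is now assembled by concatenating the static Markdown with bullet lists generated from the enums in src/static/tag_info.py, so the documentation stays in sync with the tags the app actually recognises. A short sketch of how one of those generated lists renders, assuming the module is importable:

```python
# Sketch of the list-generation pattern used to build ABOUT.
from src.static.tag_info import SubmissionType

print("\n".join(f"- {s.value.key}: {s.value.usage}" for s in SubmissionType))
# - submission:automatic: users can submit their models as such to the leaderboard, ...
# - submission:semiautomatic: the leaderboard requires the model owner to run evaluations ...
# - submission:manual: the leaderboard requires the leaderboard owner to run evaluations ...
# - submission:closed: the leaderboard does not accept submissions at the moment
```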
src/static/display.py
ADDED
@@ -0,0 +1,22 @@
+def space_html_block(space_info) -> str:
+    url = space_info.url
+
+    return f"""
+    <article class="">
+        <a href="{url}" class="relative z-0 mx-auto flex flex-col items-center justify-center bg-gradient-to-br p-4 filter from-blue-600 to-blue-600 overflow-hidden hover:brightness-110 h-40 rounded-lg">
+            <div class="absolute left-0 top-0 h-24 w-1/2 bg-gradient-to-br from-black/20 via-transparent to-transparent"></div>
+            <div class="absolute flex items-center rounded-xl top-2.5 right-4 text-sm"><svg class="mr-1.5 text-white" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32" fill="currentColor"><path d="M22.45,6a5.47,5.47,0,0,1,3.91,1.64,5.7,5.7,0,0,1,0,8L16,26.13,5.64,15.64a5.7,5.7,0,0,1,0-8,5.48,5.48,0,0,1,7.82,0L16,10.24l2.53-2.58A5.44,5.44,0,0,1,22.45,6m0-2a7.47,7.47,0,0,0-5.34,2.24L16,7.36,14.89,6.24a7.49,7.49,0,0,0-10.68,0,7.72,7.72,0,0,0,0,10.82L16,29,27.79,17.06a7.72,7.72,0,0,0,0-10.82A7.49,7.49,0,0,0,22.45,4Z"></path></svg>
+            <span class="text-white">22</span></div>
+            <div class="absolute opacity-60 text-6xl mb-1 drop-shadow-xl">{icons}</div>
+            <h4 class="z-40 max-w-full truncate text-center font-bold leading-tight text-blue-50 text-xl " style="text-shadow: 0px 1px 2px rgba(0, 0, 0, 0.25);">{name}</h4>
+        </a>
+    """
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable(space):
+    link = f"https://huggingface.co/{space}"
+
+    return model_hyperlink(link, space)
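`make_clickable` is the only helper app.py uses in this commit: it builds the Hub URL for a Space id and hands it to `model_hyperlink`, so the id renders as a dotted-underline link in the results list (`space_html_block` is not wired up anywhere yet). Usage sketch:

```python
from src.static.display import make_clickable

print(make_clickable("gaia-benchmark/leaderboard"))
# <a target="_blank" href="https://huggingface.co/gaia-benchmark/leaderboard" style="...">gaia-benchmark/leaderboard</a>
```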
src/static/tag_info.py
ADDED
@@ -0,0 +1,157 @@
+from enum import Enum
+from dataclasses import dataclass
+
+@dataclass
+class Tag:
+    key: str
+    name: str   # for display
+    usage: str  # explains usage
+    icon: str
+
+class SubmissionType(Enum):
+    automatic = Tag(
+        key="submission:automatic",
+        name="Automatic",
+        usage="users can submit their models as such to the leaderboard, and evaluation is run automatically without human intervention",
+        icon=""
+    )
+    semiautomatic = Tag(
+        key="submission:semiautomatic",
+        name="Semi Automatic",
+        usage="the leaderboard requires the model owner to run evaluations on their side and submit the results",
+        icon=""
+    )
+    manual = Tag(
+        key="submission:manual",
+        name="Manual",
+        usage="the leaderboard requires the leaderboard owner to run evaluations for new submissions",
+        icon=""
+    )
+    closed = Tag(
+        key="submission:closed",
+        name="Closed",
+        usage="the leaderboard does not accept submissions at the moment",
+        icon=""
+    )
+
+class TestSetStatus(Enum):
+    public = Tag(
+        key="test:public",
+        name="Public",
+        usage="all the test sets used are public, the evaluations are completely reproducible",
+        icon=""
+    )
+    mix = Tag(
+        key="test:mix",
+        name="Mix",
+        usage="some test sets are public and some private",
+        icon=""
+    )
+    private = Tag(
+        key="test:private",
+        name="Private",
+        usage="all the test sets used are private, the evaluations are hard to game",
+        icon=""
+    )
+    rolling = Tag(
+        key="test:rolling",
+        name="Rolling",
+        usage="the test sets used change regularly through time and evaluation scores are refreshed",
+        icon=""
+    )
+
+class Judge(Enum):
+    public = Tag(
+        key="judge:auto",
+        name="Automatic metric",
+        usage="evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`",
+        icon=""
+    )
+    model = Tag(
+        key="judge:model",
+        name="Model",
+        usage="evaluations are run using a model-as-a-judge approach to rate answers",
+        icon=""
+    )
+    humans = Tag(
+        key="judge:humans",
+        name="Human",
+        usage="evaluations are done by humans to rate answers - this is an arena",
+        icon=""
+    )
+    vibe_check = Tag(
+        key="judge:vibe_check",
+        name="Vibe check",
+        usage="evaluations are done manually by one or several humans",
+        icon=""
+    )
+
+class Modality(Enum):
+    text = Tag(
+        key="modality:text",
+        name="Text",
+        usage="",
+        icon=""
+    )
+    image = Tag(
+        key="modality:image",
+        name="Image",
+        usage="",
+        icon=""
+    )
+    audio = Tag(
+        key="modality:audio",
+        name="Audio",
+        usage="",
+        icon=""
+    )
+    video = Tag(
+        key="modality:video",
+        name="Video",
+        usage="",
+        icon=""
+    )
+    tools = Tag(
+        key="modality:tools",
+        name="Tools",
+        usage="requires added tool usage - mostly for assistant models (a bit outside of usual modalities)",
+        icon=""
+    )
+    artefacts = Tag(
+        key="modality:artefacts",
+        name="Artefacts",
+        usage="the leaderboard concerns itself with machine learning artefacts themselves, for example, quality evaluation of text embeddings (a bit outside of usual modalities)",
+        icon=""
+    )
+
+class EvaluationCategory(Enum):
+    generation = Tag(
+        key="eval:generation",
+        name="Generation",
+        usage="the evaluation looks at generation capabilities specifically (can be image generation, text generation, ...)",
+        icon=""
+    )
+    math = Tag(
+        key="eval:math",
+        name="Math",
+        usage="the evaluation tests math abilities",
+        icon=""
+    )
+    code = Tag(
+        key="eval:code",
+        name="Code",
+        usage="the evaluation tests coding capabilities",
+        icon=""
+    )
+    performance = Tag(
+        key="eval:performance",
+        name="Performance",
+        usage="model performance (speed, energy consumption, ...)",
+        icon=""
+    )
+    safety = Tag(
+        key="eval:safety",
+        name="Safety",
+        usage="the evaluation considers safety, toxicity, bias",
+        icon=""
+    )
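Each category is an `Enum` whose members carry a `Tag` dataclass, which is what lets app.py build checkbox choices from `[tag.name for tag in Modality]` and about.py pull the `key`/`usage` strings for the docs. A hypothetical lookup helper (not in this commit) that resolves a raw `category:value` string back to its `Tag` metadata:

```python
# Hypothetical helper, not part of this commit: map a raw tag string back to its Tag.
from src.static.tag_info import SubmissionType, TestSetStatus, Judge, Modality, EvaluationCategory

CATEGORIES = (SubmissionType, TestSetStatus, Judge, Modality, EvaluationCategory)

def lookup_tag(raw: str):
    for enum_cls in CATEGORIES:
        for member in enum_cls:
            if member.value.key == raw:
                return member.value
    return None

tag = lookup_tag("test:rolling")
if tag is not None:
    print(f"{tag.name}: {tag.usage}")
# Rolling: the test sets used change regularly through time and evaluation scores are refreshed
```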