Commit 4b2522c • Clémentine committed
1 parent: 509661e
need to add the selectors now
Files changed:
- .gitignore +2 -0
- README.md +47 -1
- app.py +33 -0
- leaderboards_metadata.py +0 -107
- requirements.txt +1 -0
- src/leaderboards/get_from_hub.py +66 -0
- src/leaderboards/saved.py +42 -0
- src/static/about.py +64 -0
- src/static/env.py +7 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+*.pyc
+.vscode
README.md
CHANGED
@@ -9,4 +9,50 @@ app_file: app.py
 pinned: false
 ---
 
-
+If you want your leaderboard to appear, feel free to add the relevant information to its metadata, and it will be displayed here.
+
+# Categories
+
+## Submission type
+Arenas are not concerned by this category.
+
+- `submission:automatic`: users can submit their models directly to the leaderboard, and evaluation is run automatically without human intervention
+- `submission:semiautomatic`: the leaderboard requires model owners to run evaluations on their side and submit the results
+- `submission:manual`: the leaderboard requires the leaderboard owner to run evaluations for new submissions
+- `submission:closed`: the leaderboard does not accept submissions at the moment
+
+## Test set status
+Arenas are not concerned by this category.
+
+- `test:public`: all the test sets used are public, so the evaluations are completely reproducible
+- `test:mix`: some test sets are public and some are private
+- `test:private`: all the test sets used are private, so the evaluations are hard to game
+- `test:rolling`: the test sets change regularly over time and evaluation scores are refreshed
+
+## Judges
+- `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
+- `judge:model`: evaluations are run using a model-as-a-judge approach to rate answers
+- `judge:humans`: evaluations are done by humans rating answers - this is an arena
+- `judge:vibe_check`: evaluations are done manually by one human
+
+## Modalities
+Can be any (or several) of the following:
+- `modality:text`
+- `modality:image`
+- `modality:video`
+- `modality:audio`
+A bit outside of the usual modalities:
+- `modality:tools`: requires tool usage - mostly for assistant models
+- `modality:artefacts`: the leaderboard concerns itself with machine learning artefacts themselves, for example, quality evaluation of text embeddings
+
+## Evaluation categories
+Can be any (or several) of the following:
+- `eval:generation`: the evaluation looks specifically at generation capabilities (image generation, text generation, ...)
+- `eval:math`
+- `eval:code`
+- `eval:performance`: model performance (speed, energy consumption, ...)
+- `eval:safety`: safety, toxicity, bias evaluations
+
+## Language
+You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
+At the moment, we do not support language codes; please use the language name in English.
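For reference, here is a sketch of what a tagged leaderboard Space's README metadata could look like (the title and tag values below are purely illustrative, not from this commit):

```
---
title: My Leaderboard
app_file: app.py
pinned: false
tags:
- leaderboard
- submission:automatic
- test:public
- judge:auto
- modality:text
- eval:math
- language:english
---
```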
app.py
ADDED
@@ -0,0 +1,33 @@
+import gradio as gr
+from apscheduler.schedulers.background import BackgroundScheduler
+from src.static.env import API, REPO_ID, HF_TOKEN
+from src.static.about import TITLE, INTRO, ABOUT
+
+from src.leaderboards.get_from_hub import get_leaderboard_info
+
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
+
+leaderboards_to_info, info_to_leaderboards = get_leaderboard_info()
+
+
+demo = gr.Blocks()
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRO, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("Search"):
+            gr.Markdown("Let's look for leaderboards relevant to you! Select the categories of your choice.")
+
+
+
+        with gr.TabItem("About"):
+            gr.Markdown(ABOUT, elem_classes="markdown-text")
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=10800)  # restart the space every 3h
+scheduler.start()
+
+demo.queue(default_concurrency_limit=40).launch()
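The Search tab is left empty here; per the commit message, the category selectors still need to be added. A minimal sketch of what they might look like, assuming `info_to_leaderboards` maps category to tag to a list of space ids (the widgets, helper names, and filtering logic below are illustrative, not part of this commit):

```
# Hypothetical continuation of the "Search" tab, inside the `with demo:` block above.
with gr.TabItem("Search"):
    gr.Markdown("Let's look for leaderboards relevant to you! Select the categories of your choice.")

    # One multi-select per category (submission, test, judge, modality, eval, language, ...).
    selectors = {
        category: gr.CheckboxGroup(choices=sorted(tag_map.keys()), label=category)
        for category, tag_map in info_to_leaderboards.items()
    }
    results = gr.Markdown()

    def filter_leaderboards(*selected):
        # Keep only the leaderboards matching every selected tag, across all categories.
        matching = None
        for category, values in zip(selectors.keys(), selected):
            for value in values:
                spaces = set(info_to_leaderboards[category][value])
                matching = spaces if matching is None else matching & spaces
        if matching is None:
            return "Select at least one tag."
        return "\n".join(f"- [{name}](https://huggingface.co/spaces/{name})" for name in sorted(matching))

    for selector in selectors.values():
        selector.change(filter_leaderboards, inputs=list(selectors.values()), outputs=results)
```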
leaderboards_metadata.py
DELETED
@@ -1,107 +0,0 @@
-from enum import Enum, auto
-#from dataclasses import dataclass
-
-SubmissionType = Enum(
-    "SubmissionType",
-    [
-        "Automatic",
-        "SemiAutomatic",
-        "Manual",
-        "Closed",
-        "Arena"
-    ]
-)
-
-Evaluators = Enum(
-    "Evaluators",
-    [
-        "Humans",  # Arena
-        "Automatic",
-        "Model"
-    ]
-)
-
-TestSet = Enum(
-    "TestSet",
-    [
-        "Private",
-        "Public",
-        "Mix",
-        "Rolling",
-        "N/A"
-    ]
-)
-
-Categories = Enum(
-    "Categories",
-    [
-        "Text",
-        "Image",
-        "Audio",
-        "Video",
-        "Multimodal",
-        "Generation",
-        "Math",
-        "Code",
-        "LanguageSpecific",
-        "Performance",
-        "Safety",
-        "VibeCheck",
-        "Tools",
-        "Artefacts"
-    ]
-)
-
-Languages = Enum(
-    "Languages",
-    [
-        "Chinese",
-        "Korean",
-        "Dutch",
-        "Portuguese",
-        "Italian",
-        "Malay",
-        "Polish",
-        "Turkish"
-
-    ]
-)
-
-leaderboard_to_tags = {
-    "HuggingFaceH4/open_llm_leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Public, Categories.Text, Categories.Math],
-    "bigcode/bigcode-models-leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Automatic, TestSet.Public, Categories.Code],
-    "optimum/llm-perf-leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Performance],
-    "lmsys/chatbot-arena-leaderboard": [SubmissionType.Arena, Evaluators.Humans, Categories.Text, Categories.Generation],
-    "llmonitor/benchmarks": [SubmissionType.Manual, Evaluators.Humans, Categories.Text, Categories.VibeCheck],
-    "mteb/leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, "Embeddings", Categories.Artefacts],
-    "gaia-benchmark/leaderboard": [SubmissionType.Automatic, TestSet.Private, Evaluators.Automatic, Categories.Text, Categories.Tools, Categories.Multimodal],
-    "opencompass/opencompass-llm-leaderboard": [SubmissionType.Manual, Categories.Text, Categories.LanguageSpecific, Languages.Chinese],
-    "upstage/open-ko-llm-leaderboard": [SubmissionType.Automatic, Evaluators.Automatic, TestSet.Mix, Categories.Text, Languages.Korean],
-    "BramVanroy/open_dutch_llm_leaderboard": [SubmissionType.Manual, Evaluators.Automatic, Categories.Text, Languages.Dutch],
-    "vectara/leaderboard": [SubmissionType.SemiAutomatic, Evaluators.Model, Categories.Text, "Hallucinations"],
-    "facebook/CyberSecEval": [SubmissionType.Closed, Categories.Code, Categories.Safety],
-    "mlabonne/Yet_Another_LLM_Leaderboard": [SubmissionType.Manual, Categories.Text, Evaluators.Automatic],
-    "AI-Secure/llm-trustworthy-leaderboard": [SubmissionType.Automatic, Categories.Safety, Categories.Text],
-    "AILab-CVC/EvalCrafter": [SubmissionType.Closed, Categories.Video, Categories.Generation],
-    "mike-ravkine/can-ai-code-results": [SubmissionType.Closed, Categories.Code],
-    "echo840/ocrbench-leaderboard": [SubmissionType.Closed, Categories.Image, "OCR"],
-    "NPHardEval/NPHardEval-leaderboard": [SubmissionType.Closed, Categories.Text, Categories.Math, TestSet.Rolling],
-    "HaizeLabs/red-teaming-resistance-benchmark": [SubmissionType.Manual, Categories.Safety, Categories.Text],
-    "devingulliver/subquadratic-llm-leaderboard": [SubmissionType.SemiAutomatic, Categories.Text, Categories.Math],
-    "WildVision/vision-arena": [SubmissionType.Arena, Categories.Image, Categories.Multimodal],
-    "Vchitect/VBench_Leaderboard": [SubmissionType.SemiAutomatic, Categories.Video, Categories.Generation],
-    "eduagarcia/open_pt_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Portuguese],
-    "FinancialSupport/open_ita_llm_leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Italian],
-    "mesolitica/malay-llm-leaderboard": [Categories.Text, Categories.LanguageSpecific, Languages.Malay],
-    "TIGER-Lab/GenAI-Arena": [Categories.Image, Categories.Generation, Evaluators.Humans, SubmissionType.Arena],
-    "q-future/Q-Bench-Leaderboard": [Categories.Image, Evaluators.Automatic, SubmissionType.Closed],
-    "OpenGenAI/parti-prompts-leaderboard": [Categories.Image, Categories.Generation, SubmissionType.Arena, Evaluators.Humans],
-    "speakleash/open_pl_llm_leaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Polish],
-    "malhajar/OpenLLMTurkishLeaderboard": [Categories.LanguageSpecific, Categories.Text, Languages.Turkish],
-    "allenai/WildBench": [Evaluators.Humans, SubmissionType.Arena, Evaluators.Model, Categories.Text, Categories.Generation],
-    "hf-audio/open_asr_leaderboard": [Evaluators.Automatic, Categories.Audio],
-    "opencompass/open_vlm_leaderboard": [Evaluators.Automatic, Categories.Generation, Categories.Image],
-    "livecodebench/benchmarks": [Evaluators.Automatic, Categories.Code],
-    "allenai/reward-bench": [Evaluators.Automatic, Categories.Artefacts, "Models", Categories.Text],
-    "TTS-AGI/TTS-Arena": [Evaluators.Humans, Categories.Audio]
-}
requirements.txt
ADDED
@@ -0,0 +1 @@
+huggingface_hub
src/leaderboards/get_from_hub.py
ADDED
@@ -0,0 +1,66 @@
+from collections import defaultdict
+
+from src.leaderboards.saved import leaderboard_to_tags
+from src.static.env import API
+
+def group_all_tags(input_tags: list[str]) -> dict:
+    """Groups the tags by category, following the division in the README.
+
+    Args:
+        input_tags (list[str]): list of tags
+
+    Returns:
+        dict: category to tag list
+    """
+    output_tags = defaultdict(list)
+    for tag in input_tags:
+        if tag == "arena":
+            output_tags["judge"].append("humans")
+            continue
+
+        try:
+            category, value = tag.split(":")
+            output_tags[category].append(value)
+        except ValueError:
+            continue
+
+    return output_tags
+
+
+def get_leaderboard_info() -> tuple[list, dict]:
+    """Looks up all spaces tagged as leaderboards or arenas on the hub,
+    and homogenizes their tags.
+
+    Returns:
+        tuple: the list of leaderboard info dicts, and a mapping from category and tag to leaderboard names
+    """
+    leaderboards = [
+        (s.id, s.tags) for s in API.list_spaces(
+            filter=["leaderboard"]
+        )]
+    arenas = [
+        (s.id, s.tags) for s in API.list_spaces(
+            filter=["arena"]
+        )]
+    saved_leaderboards = [(k, v) for k, v in leaderboard_to_tags.items()]
+
+    seen_leaderboards = []
+    leaderboard_df = []
+    info_to_leaderboard = defaultdict(lambda: defaultdict(list))
+    for name, tags in leaderboards + arenas + saved_leaderboards:
+        if name in seen_leaderboards:
+            continue
+
+        seen_leaderboards.append(name)
+
+        if name in leaderboard_to_tags:
+            tags = list(dict.fromkeys(list(tags) + leaderboard_to_tags[name]))  # merge hub and saved tags without duplicates
+
+        grouped_tags = group_all_tags(tags)
+        for category, values in grouped_tags.items():
+            for value in values:
+                info_to_leaderboard[category][value].append(name)
+        current_info = dict(grouped_tags)
+        current_info["name"] = name
+        leaderboard_df.append(current_info)
+    return leaderboard_df, info_to_leaderboard
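For illustration, a sketch of what `group_all_tags` returns for a typical set of space tags (the tag list is an assumed example, not taken from a real space):

```
from src.leaderboards.get_from_hub import group_all_tags

# Assumed tags, as they might appear on a space registered with this explorer.
tags = ["leaderboard", "submission:automatic", "judge:auto", "test:public", "modality:text", "eval:math"]

grouped = group_all_tags(tags)
# grouped == {"submission": ["automatic"], "judge": ["auto"], "test": ["public"],
#             "modality": ["text"], "eval": ["math"]}
# Tags without a ":" separator (like "leaderboard" itself) are simply skipped.
```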
src/leaderboards/saved.py
ADDED
@@ -0,0 +1,42 @@
+"""
+Default leaderboards with which we initialize the space.
+"""
+
+leaderboard_to_tags = {
+    "HuggingFaceH4/open_llm_leaderboard": ["submission:automatic", "judge:auto", "test:public", "modality:text", "eval:math"],
+    "bigcode/bigcode-models-leaderboard": ["submission:semiautomatic", "judge:auto", "test:public", "eval:code"],
+    "optimum/llm-perf-leaderboard": ["submission:manual", "judge:auto", "eval:performance"],
+    "lmsys/chatbot-arena-leaderboard": ["judge:humans", "modality:text", "eval:generation"],
+    "llmonitor/benchmarks": ["submission:manual", "judge:humans", "modality:text", "judge:vibe_check"],
+    "mteb/leaderboard": ["submission:semiautomatic", "modality:text", "Embeddings", "modality:artefacts"],
+    "gaia-benchmark/leaderboard": ["submission:automatic", "test:private", "judge:auto", "modality:text", "modality:tools", "modality:image", "modality:video"],
+    "opencompass/opencompass-llm-leaderboard": ["submission:manual", "modality:text", "language:chinese"],
+    "upstage/open-ko-llm-leaderboard": ["submission:automatic", "judge:auto", "test:mix", "modality:text", "language:korean"],
+    "BramVanroy/open_dutch_llm_leaderboard": ["submission:manual", "judge:auto", "modality:text", "language:dutch"],
+    "vectara/leaderboard": ["submission:semiautomatic", "judge:model", "modality:text", "Hallucinations"],
+    "facebook/CyberSecEval": ["submission:closed", "eval:code", "eval:safety"],
+    "mlabonne/Yet_Another_LLM_Leaderboard": ["submission:manual", "modality:text", "judge:auto"],
+    "AI-Secure/llm-trustworthy-leaderboard": ["submission:automatic", "eval:safety", "modality:text"],
+    "AILab-CVC/EvalCrafter": ["submission:closed", "modality:video", "eval:generation"],
+    "mike-ravkine/can-ai-code-results": ["submission:closed", "eval:code"],
+    "echo840/ocrbench-leaderboard": ["submission:closed", "modality:image", "OCR"],
+    "NPHardEval/NPHardEval-leaderboard": ["submission:closed", "modality:text", "eval:math", "test:rolling"],
+    "HaizeLabs/red-teaming-resistance-benchmark": ["submission:manual", "eval:safety", "modality:text"],
+    "devingulliver/subquadratic-llm-leaderboard": ["submission:semiautomatic", "modality:text", "eval:math"],
+    "WildVision/vision-arena": ["modality:image", "modality:text", "judge:humans"],
+    "Vchitect/VBench_Leaderboard": ["submission:semiautomatic", "modality:video", "eval:generation"],
+    "eduagarcia/open_pt_llm_leaderboard": ["modality:text", "language:portuguese"],
+    "FinancialSupport/open_ita_llm_leaderboard": ["modality:text", "language:italian"],
+    "mesolitica/malay-llm-leaderboard": ["modality:text", "language:malay"],
+    "TIGER-Lab/GenAI-Arena": ["modality:image", "eval:generation", "judge:humans"],
+    "q-future/Q-Bench-Leaderboard": ["modality:image", "judge:auto", "submission:closed"],
+    "OpenGenAI/parti-prompts-leaderboard": ["modality:image", "eval:generation", "judge:humans"],
+    "speakleash/open_pl_llm_leaderboard": ["modality:text", "language:polish"],
+    "malhajar/OpenLLMTurkishLeaderboard": ["modality:text", "language:turkish"],
+    "allenai/WildBench": ["judge:humans", "judge:model", "modality:text", "eval:generation"],
+    "hf-audio/open_asr_leaderboard": ["judge:auto", "modality:audio"],
+    "opencompass/open_vlm_leaderboard": ["judge:auto", "eval:generation", "modality:image"],
+    "livecodebench/benchmarks": ["judge:auto", "eval:code"],
+    "allenai/reward-bench": ["judge:auto", "modality:artefacts", "Models", "modality:text"],
+    "TTS-AGI/TTS-Arena": ["judge:humans", "modality:audio"]
+}
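A leaderboard that has not tagged its own metadata yet could still be pinned through this file by adding an entry in the same format; a hypothetical sketch (the space id and tags below are placeholders):

```
# Hypothetical entry; replace with a real space id and its actual characteristics.
leaderboard_to_tags["my-org/my-favorite-leaderboard"] = [
    "submission:automatic", "judge:auto", "test:private", "modality:text", "eval:safety"
]
```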
src/static/about.py
ADDED
@@ -0,0 +1,64 @@
+TITLE = "# Leaderboard explorer"
+
+INTRO = """
+Have you ever wondered which leaderboard would be best for your use case?
+"""
+
+ABOUT = """
+If you want your leaderboard to appear in our suggestions, feel free to add the relevant information to its tag metadata, and it will be displayed here.
+
+# First step
+
+Make sure to add either the `leaderboard` or `arena` tag to your space, by adding the following to your README:
+
+```
+tags:
+- leaderboard
+```
+
+# Extra tags
+
+## Submission type
+Arenas are not concerned by this category.
+
+- `submission:automatic`: users can submit their models directly to the leaderboard, and evaluation is run automatically without human intervention
+- `submission:semiautomatic`: the leaderboard requires model owners to run evaluations on their side and submit the results
+- `submission:manual`: the leaderboard requires the leaderboard owner to run evaluations for new submissions
+- `submission:closed`: the leaderboard does not accept submissions at the moment
+
+## Test set status
+Arenas are not concerned by this category.
+
+- `test:public`: all the test sets used are public, so the evaluations are completely reproducible
+- `test:mix`: some test sets are public and some are private
+- `test:private`: all the test sets used are private, so the evaluations are hard to game
+- `test:rolling`: the test sets change regularly over time and evaluation scores are refreshed
+
+## Judges
+- `judge:auto`: evaluations are run automatically, using an evaluation suite such as `lm_eval` or `lighteval`
+- `judge:model`: evaluations are run using a model-as-a-judge approach to rate answers
+- `judge:humans`: evaluations are done by humans rating answers - this is an arena
+- `judge:vibe_check`: evaluations are done manually by one human
+
+## Modalities
+Can be any (or several) of the following:
+- `modality:text`
+- `modality:image`
+- `modality:video`
+- `modality:audio`
+A bit outside of the usual modalities:
+- `modality:tools`: requires tool usage - mostly for assistant models
+- `modality:artefacts`: the leaderboard concerns itself with machine learning artefacts themselves, for example, quality evaluation of text embeddings
+
+## Evaluation categories
+Can be any (or several) of the following:
+- `eval:generation`: the evaluation looks specifically at generation capabilities (image generation, text generation, ...)
+- `eval:math`
+- `eval:code`
+- `eval:performance`: model performance (speed, energy consumption, ...)
+- `eval:safety`: safety, toxicity, bias evaluations
+
+## Language
+You can indicate the languages covered by your benchmark like so: `language:mylanguage`.
+At the moment, we do not support language codes; please use the language name in English.
+"""
src/static/env.py
ADDED
@@ -0,0 +1,7 @@
+import os
+from huggingface_hub import HfApi
+
+REPO_ID = "clefourrier/LeaderboardFinder"
+HF_TOKEN = None  # os.getenv("HF_TOKEN")
+
+API = HfApi(token=HF_TOKEN)