choco9966 committed · commit fbb2531 · 1 parent: bc6c2c3
[update] new code
Files changed:
- app.py +4 -28
- requirements.txt +13 -8
- src/about.py +72 -0
- src/display/about.py +129 -86
- src/display/formatting.py +3 -4
- src/display/utils.py +41 -44
- src/envs.py +5 -5
- src/leaderboard/read_evals.py +13 -40
- src/submission/submit.py +7 -4
app.py
CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from gradio_space_ci import configure_space_ci
+from gradio_space_ci.webhook import configure_space_ci

 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -32,11 +32,6 @@ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PU
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 from src.tools.collections import update_collections
-from src.tools.plots import (
-    create_metric_plot_obj,
-    create_plot_df,
-    create_scores_df,
-)


 def restart_space():
@@ -59,12 +54,10 @@ except Exception:


 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-if REPO_ID == "
+if REPO_ID == "choco9966/New-Open-Ko-LLM-Leaderboard": # update only when it's from real leaderboard
     update_collections(original_df.copy())
 leaderboard_df = original_df.copy()

-plot_df = create_plot_df(create_scores_df(raw_data))
-
 (
     finished_eval_queue_df,
     running_eval_queue_df,
@@ -155,7 +148,6 @@ def filter_models(
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
     filtered_df = filtered_df.loc[mask]
-
     return filtered_df

 leaderboard_df = filter_models(leaderboard_df, [t.to_str(" : ") for t in ModelType], list(NUMERIC_INTERVALS.keys()), [i.value.name for i in Precision], False, False, False)
@@ -299,23 +291,7 @@ with demo:
                 leaderboard_table,
                 queue=True,
             )
-
-        with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
-            with gr.Row():
-                with gr.Column():
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        [AutoEvalColumn.average.name],
-                        title="Average of Top Scores Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
-                with gr.Column():
-                    chart = create_metric_plot_obj(
-                        plot_df,
-                        BENCHMARK_COLS,
-                        title="Top Scores Over Time (from last update)",
-                    )
-                    gr.Plot(value=chart, min_width=500)
+
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
@@ -383,7 +359,7 @@ with demo:
                 choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                 label="Model type",
                 multiselect=False,
-                value=ModelType.
+                value=ModelType.FT.to_str(" : "),
                 interactive=True,
             )
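The dropped plot imports and the `restart_space()` context above belong to the Space's scheduled-refresh loop. For orientation, here is a minimal sketch of that pattern; the interval, token handling, and `API` construction are assumptions, not code from this commit:

```python
# Sketch of the scheduled-restart pattern behind restart_space() (assumed details,
# not the Space's actual code): APScheduler periodically asks the Hub to restart the
# Space so newly written evaluation results are reloaded.
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

REPO_ID = "choco9966/New-Open-Ko-LLM-Leaderboard"  # value set in src/envs.py by this commit
API = HfApi()  # assumes a token with write access is configured in the environment

def restart_space():
    API.restart_space(repo_id=REPO_ID)

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)  # refresh interval is an assumption
scheduler.start()
```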
requirements.txt
CHANGED
@@ -2,17 +2,22 @@ APScheduler==3.10.1
 black==23.11.0
 click==8.1.3
 datasets==2.14.5
-gradio==4.19.2
-gradio_client==0.10.1
 huggingface-hub>=0.18.0
-matplotlib==3.
-numpy==1.
-pandas==2.
+matplotlib==3.8.4
+numpy==1.26.0
+pandas==2.2.2
 plotly==5.14.1
 python-dateutil==2.8.2
-requests==2.28.2
 sentencepiece
 tqdm==4.65.0
-transformers==4.
+transformers==4.43.1
 tokenizers>=0.15.0
-gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.
+gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 # CI !!!
+isort
+ruff
+gradio==4.31.0
+gradio[oauth]
+gradio_leaderboard==0.0.11
+requests==2.31.0
+requests-oauthlib== 1.3.1
+schedule == 1.2.2
src/about.py
ADDED
@@ -0,0 +1,72 @@
+from dataclasses import dataclass
+from enum import Enum
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+
+# Select your tasks here
+# ---------------------------------------------------
+class Tasks(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("anli_r1", "acc", "ANLI")
+    task1 = Task("logiqa", "acc_norm", "LogiQA")
+
+NUM_FEWSHOT = 0 # Change with your few shot
+# ---------------------------------------------------
+
+
+
+# Your leaderboard name
+TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+Intro text
+"""
+
+# Which evaluations are you running? how can people reproduce what you have?
+LLM_BENCHMARKS_TEXT = f"""
+## How it works
+
+## Reproducibility
+To reproduce our results, here is the commands you can run:
+
+"""
+
+EVALUATION_QUEUE_TEXT = """
+## Some good practices before submitting a model
+
+### 1) Make sure you can load your model and tokenizer using AutoClasses:
+```python
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+config = AutoConfig.from_pretrained("your model name", revision=revision)
+model = AutoModel.from_pretrained("your model name", revision=revision)
+tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
+"""
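The added src/about.py is the stock leaderboard template (demo ANLI/LogiQA tasks), apparently included as scaffolding. As a hypothetical walk-through, here is how a `Task` triple is typically consumed; the results payload and numbers below are invented for illustration:

```python
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results json
    metric: str      # metric key in the results json
    col_name: str    # column label shown on the leaderboard

class Tasks(Enum):
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

# Toy payload shaped like an lm-eval-harness output ("results" keyed by task name).
results_json = {"results": {"anli_r1": {"acc": 0.41}, "logiqa": {"acc_norm": 0.33}}}

# Each Task picks one metric out of the payload and labels it for display.
row = {
    t.value.col_name: results_json["results"][t.value.benchmark][t.value.metric] * 100.0
    for t in Tasks
}
print(row)  # {'ANLI': 41.0, 'LogiQA': 33.0}
```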
src/display/about.py
CHANGED
@@ -2,62 +2,51 @@ from src.display.utils import ModelType


 TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
-BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:
+BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_240715.png" style="width:75%;display:block;margin-left:auto;margin-right:auto">"""

 INTRODUCTION_TEXT = f"""
-
+The previous Leaderboard version is live [here](https://huggingface.co/spaces/choco9966/open-ko-llm-leaderboard-old) π

-When you submit a model on the "Submit here!" page, it is automatically evaluated.
-The data used for evaluation consists of datasets to assess reasoning, language understanding, hallucination, and commonsense.
-The evaluation dataset is exclusively private and only available for evaluation process.
-More detailed information about the benchmark dataset is provided on the “About” page.
+🚀 The Open Ko-LLM Leaderboard2 🇰🇷 objectively evaluates the performance of Korean Large Language Model (LLM). When you submit a model on the "Submit here!" page, it is automatically evaluated.

-This leaderboard is co-hosted by 
+This leaderboard is co-hosted by [Upstage](https://www.upstage.ai/), and [NIA](https://www.nia.or.kr/site/nia_kor/main.do) that provides various Korean Data Sets through [AI-Hub](https://aihub.or.kr/), and operated by [Upstage](https://www.upstage.ai/). The GPU used for evaluation is operated with the support of [KT](https://cloud.kt.com/) and [AICA](https://aica-gj.kr/main.php). If Season 1 focused on evaluating the capabilities of the LLM in terms of reasoning, language understanding, hallucination, and commonsense through academic benchmarks, Season 2 will focus on assessing the LLM's practical abilities and reliability. The datasets for this season are sponsored by [Flitto](https://www.flitto.com/portal/en), [SELECTSTAR](https://selectstar.ai/ko/), and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1). The evaluation dataset is exclusively private and only available for evaluation process. More detailed information about the benchmark dataset is provided on the “About” page.
+
+You'll notably find explanations on the evaluations we are using, reproducibility guidelines, best practices on how to submit a model, and our FAQ.
 """

 LLM_BENCHMARKS_TEXT = f"""
-# 
+# Motivation
+
 While outstanding LLM models are being released competitively, most of them are centered on English and are familiar with the English cultural sphere. We operate the Korean leaderboard, π Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of research in Korean.

-## 
-{ModelType.PT.to_str(" : ")} model
-{ModelType.IFT.to_str(" : ")} model
-{ModelType.RL.to_str(" : ")} model
-If there is no icon, it indicates that there is insufficient information about the model.
-Please provide information about the model through an issue! 🤩
+## How it works

-
-(Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
+📈 We evaluate models on 9 key benchmarks using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.

-
+- Ko-GPQA (provided by [Flitto](https://www.flitto.com/portal/en))
+- Ko-WinoGrande (provided by [Flitto](https://www.flitto.com/portal/en))
+- Ko-GSM8K (provided by [Flitto](https://www.flitto.com/portal/en))
+- Ko-EQ-Bench (provided by [Flitto](https://www.flitto.com/portal/en))
+- Ko-IFEval (provided by [Flitto](https://www.flitto.com/portal/en))
+- KorNAT-Knowledge (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
+- KorNAT-Social-Value (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
+- Ko-Harmlessness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))
+- Ko-Helpfulness (provided by [SELECTSTAR](https://selectstar.ai/ko/) and [KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1))

-
+For all these evaluations, a higher score is a better score. We chose these benchmarks as they test a variety of reasoning, harmlessness, helpfulness and general knowledge across a wide variety of fields in 0-shot and few-shot settings.

-
-- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
-- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
-- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
-- Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
-- Ko-Winogrande (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
-- Ko-GSM8k (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
-- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)
-- Ko-EQ Bench (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
-- Ko-InstFollow (provided by __[Flitto](https://www.flitto.com/portal/en)__, human translation and variation)
-- KorNAT-CKA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
-- KorNAT-SVA (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
-- Ko-Harmlessness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
-- Ko-Helpfulness (provided by __[SELECTSTAR](https://selectstar.ai/ko/)__ and __[KAIST AI](https://gsai.kaist.ac.kr/?lang=ko&ckattempt=1)__, created from scratch)
+The final score is converted to the average score from each evaluation datasets.

-
+GPUs are provided by [KT](https://cloud.kt.com/) and [AICA](https://aica-gj.kr/main.php) for the evaluations.

-
+## **Results**

-
--
-- Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests
+- Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
+- Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests

 ## More resources
-
+
+If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
 """


@@ -66,38 +55,71 @@ FAQ_TEXT = """


 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the 
-
+# Evaluation Queue for the 🤗 Open Ko-LLM Leaderboard
+
+Models added here will be automatically evaluated on the 🤗 cluster.
+
+## Submission Disclaimer

-
+**By submitting a model, you acknowledge that:**

-
-
+- We store information about who submitted each model in [Requests dataset](https://huggingface.co/datasets/open-ko-llm-leaderboard/requests).
+- This practice helps maintain the integrity of our leaderboard, prevent spam, and ensure responsible submissions.
+- Your submission will be visible to the community and you may be contacted regarding your model.
+- Please submit carefully and responsibly π
+
+## First Steps Before Submitting a Model
+
+### 1. Ensure Your Model Loads with AutoClasses
+
+Verify that you can load your model and tokenizer using AutoClasses:
+
+```jsx
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
 ```

-
+Note:

-
+- If this step fails, debug your model before submitting.
+- Ensure your model is public.
+- We are working on adding support for models requiring `use_remote_code=True`.

-
+### 2. Convert Weights to Safetensors

-
+[Safetensors](https://huggingface.co/docs/safetensors/index) is a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

-### 
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+### 3. Verify Your Model Open License

-
-
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+
+### 4. Complete Your Model Card

-### 4️⃣ Fill up your model card
 When we add extra information about models to the leaderboard, it will be automatically taken from the model card

-
-
+### 5. Select Correct Precision
+
+Choose the right precision to avoid evaluation errors:
+
+- Not all models convert properly from float16 to bfloat16.
+- Incorrect precision can cause issues (e.g., loading a bf16 model in fp16 may generate NaNs).
+
+> Important: When submitting, git branches and tags will be strictly tied to the specific commit present at the time of submission to ensure revision consistency.
+>
+
+## Model types
+
+- 🟢 : 🟢 pretrained model: new, base models, trained on a given text corpora using masked modelling
+- 🟩 : 🟩 continuously pretrained model: new, base models, continuously trained on further corpus (which may include IFT/chat data) using masked modelling
+- 🔶 : 🔶 fine-tuned on domain-specific datasets model: pretrained models finetuned on more data
+- 💬 : 💬 chat models (RLHF, DPO, IFT, ...) model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
+- 🤝 : 🤝 base merges and moerges model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
+
+Please provide information about the model through an issue! 🤩
+
+🏴‍☠️ : 🏴‍☠️ This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model. (Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
@@ -106,8 +128,10 @@ CITATION_BUTTON_TEXT = r"""
 title={Open Ko-LLM Leaderboard: Evaluating Large Language Models in Korean with Ko-H5 Benchmark},
 author={Chanjun Park and Hyeonwoo Kim and Dahyun Kim and Seonghwan Cho and Sanghoon Kim and Sukyung Lee and Yungi Kim and Hwalsuk Lee},
 year={2024},
-booktitle={ACL 
+booktitle={The 62nd Annual Meeting of the Association for Computational Linguistics (ACL 2024) }
 }
+
+
 @software{eval-harness,
 author = {Gao, Leo and
 Tow, Jonathan and
@@ -132,40 +156,59 @@ CITATION_BUTTON_TEXT = r"""
 publisher = {Zenodo},
 version = {v0.0.1},
 doi = {10.5281/zenodo.5371628},
-url = {https://doi.org/10.5281/zenodo.5371628}
+url = {https://doi.org/10.5281/zenodo.5371628},
+}
+
+@misc{rein2023gpqagraduatelevelgoogleproofqa,
+title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
+author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
+year={2023},
+eprint={2311.12022},
+archivePrefix={arXiv},
+primaryClass={cs.AI},
+url={https://arxiv.org/abs/2311.12022},
+}
+
+@article{sakaguchi2021winogrande,
+title={Winogrande: An adversarial winograd schema challenge at scale},
+author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin},
+journal={Communications of the ACM},
+volume={64},
+number={9},
+pages={99--106},
+year={2021},
+publisher={ACM New York, NY, USA}
 }
-
-
-
-
-
-
-@misc{park2023koarc,
-title={Ko-ARC},
-original_title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
-author={Hyunbyung Park, Chanjun Park},
-original_author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
-year={2023}
+
+@article{cobbe2021training,
+title={Training verifiers to solve math word problems},
+author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
+journal={arXiv preprint arXiv:2110.14168},
+year={2021}
 }
-
-
-
-
-
-
+
+article{paech2023eq,
+title={Eq-bench: An emotional intelligence benchmark for large language models},
+author={Paech, Samuel J},
+journal={arXiv preprint arXiv:2312.06281},
+year={2023}
 }
-
-
-
-
-
-
+
+
+@misc{zhou2023instructionfollowingevaluationlargelanguage,
+title={Instruction-Following Evaluation for Large Language Models},
+author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
+year={2023},
+eprint={2311.07911},
+archivePrefix={arXiv},
+primaryClass={cs.CL},
+url={https://arxiv.org/abs/2311.07911},
 }
-
-
-
-
-
-
+
+@article{lee2024kornat,
+title={KorNAT: LLM Alignment Benchmark for Korean Social Values and Common Knowledge},
+author={Lee, Jiyoung and Kim, Minwoo and Kim, Seungho and Kim, Junghwan and Won, Seunghyun and Lee, Hwaran and Choi, Edward},
+journal={arXiv preprint arXiv:2402.13605},
+year={2024}
 }
 """
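The new "Select Correct Precision" guidance above amounts to loading the checkpoint with the dtype you intend to submit. A hedged illustration (placeholder model id, not from this repository):

```python
# Illustration of the precision guidance above (placeholder model id, assumed usage):
# load the weights in the dtype you plan to submit so the evaluation run matches them.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "your-org/your-model"  # placeholder, not a real checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="main")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision="main",
    torch_dtype=torch.bfloat16,  # choose bfloat16 only if the weights really are bf16
)
```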
src/display/formatting.py
CHANGED
@@ -14,10 +14,9 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"

-    details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
-
-    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
+    # details_model_name = model_name.replace("/", "__")
+    # details_link = f"https://huggingface.co/datasets/open-ko-llm-leaderboard/details_{details_model_name}"
+    return model_hyperlink(link, model_name) # + " " + model_hyperlink(details_link, "📑")


 def styled_error(error):
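With the details link commented out, `make_clickable_model` now returns a single hyperlink to the model page. A small sketch of the resulting behaviour; the body of `model_hyperlink` is an assumption based on the hunk header, since it is not shown in this diff:

```python
# Sketch of the trimmed behaviour (model_hyperlink body is assumed, not part of this diff).
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}">{model_name}</a>'

def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)  # the dataset "details" link is now disabled

print(make_clickable_model("org/model"))
# <a target="_blank" href="https://huggingface.co/org/model">org/model</a>
```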
src/display/utils.py
CHANGED
@@ -14,19 +14,15 @@ class Task:
     col_name: str

 class Tasks(Enum):
-
-
-
-
-
-
-
-
-
-    korNatCka = Task("kor_nat_cka", "acc_norm", "KorNAT-CKA")
-    korNatSva = Task("kor_nat_sva", "acc_norm", "KorNAT-SVA")
-    harmlessness = Task("ko_harmlessness", "acc_norm", "Ko-Harmlessness")
-    helpfulness = Task("ko_helpfulness", "acc_norm", "Ko-Helpfulness")
+    gpqa = Task("ko_gpqa_diamond_zeroshot", "acc_norm,none", "Ko-GPQA")
+    winogrande = Task("ko_winogrande", "acc,none", "Ko-Winogrande")
+    gsm8k = Task("ko_gsm8k", "exact_match,strict-match", "Ko-GSM8k")
+    eqBench = Task("ko_eqbench", "eqbench,none", "Ko-EQ Bench")
+    instFollow = Task("ko_ifeval", "strict_acc,none", "Ko-IFEval")
+    korNatCka = Task("kornat_common", "acc_norm,none", "KorNAT-CKA")
+    korNatSva = Task("kornat_social", "A-SVA,none", "KorNAT-SVA")
+    harmlessness = Task("kornat_harmless", "acc_norm,none", "Ko-Harmlessness")
+    helpfulness = Task("kornat_helpful", "acc_norm,none", "Ko-Helpfulness")


 # These classes are for user facing column names,
@@ -89,26 +85,30 @@ class ModelDetails:

 class ModelType(Enum):
     PT = ModelDetails(name="pretrained", symbol="🟢")
-
-
-
-
+    CPT = ModelDetails(name="continuously pretrained", symbol="🟩")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
+    chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
+    Unknown = ModelDetails(name="other", symbol="❓")

     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"

     @staticmethod
-    def from_str(
-
-
-    if "pretrained" in 
+    def from_str(m_type):
+        if any([k for k in m_type if k in ["fine-tuned","🔶", "finetuned"]]):
+            return ModelType.FT
+        if "continuously pretrained" in m_type or "🟩" in m_type:
+            return ModelType.CPT
+        if "pretrained" in m_type or "🟢" in m_type:
            return ModelType.PT
-    if "RL-tuned" 
-        return ModelType.
-    if "
-        return ModelType.
+        if any([k in m_type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
+            return ModelType.chat
+        if "merge" in m_type or "🤝" in m_type:
+            return ModelType.merges
        return ModelType.Unknown

+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -116,12 +116,13 @@ class WeightType(Enum):

 class Precision(Enum):
     float16 = ModelDetails("float16")
-
-
-
-
+    bfloat16 = ModelDetails("bfloat16")
+    qt_8bit = ModelDetails("8bit")
+    qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

+    @staticmethod
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
             return Precision.float16
@@ -134,15 +135,10 @@ class Precision(Enum):
         if precision in ["GPTQ", "None"]:
             return Precision.qt_GPTQ
         return Precision.Unknown
-
-
-

 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)
-TYPES = [c.type for c in fields(AutoEvalColumn)
-COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
-TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]

 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
@@ -150,11 +146,12 @@ EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]

 NUMERIC_INTERVALS = {
-    "
-    "
-    "3
-    "7
-    "13
-    "35
-    "
-
+    "?": pd.Interval(-1, 0, closed="right"),
+    "~1.5": pd.Interval(0, 2, closed="right"),
+    "~3": pd.Interval(2, 4, closed="right"),
+    "~7": pd.Interval(4, 9, closed="right"),
+    "~13": pd.Interval(9, 20, closed="right"),
+    "~35": pd.Interval(20, 45, closed="right"),
+    "~60": pd.Interval(45, 70, closed="right"),
+    "70+": pd.Interval(70, 10000, closed="right"),
+}
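The rewritten NUMERIC_INTERVALS feed the parameter-size filter shown in the app.py hunk above (`numeric_interval.contains(x)`). A toy, self-contained run of that logic with made-up model sizes and an assumed column name:

```python
import pandas as pd

NUMERIC_INTERVALS = {
    "~1.5": pd.Interval(0, 2, closed="right"),
    "~3": pd.Interval(2, 4, closed="right"),
    "~7": pd.Interval(4, 9, closed="right"),
    "~13": pd.Interval(9, 20, closed="right"),
}

# Made-up leaderboard rows; "#Params (B)" stands in for AutoEvalColumn.params.name.
df = pd.DataFrame({"model": ["a", "b", "c", "d"], "#Params (B)": [1.3, 3.8, 7.2, 13.0]})

selected_buckets = ["~1.5", "~7"]
numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in selected_buckets]))
params_column = pd.to_numeric(df["#Params (B)"], errors="coerce")
mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
print(df.loc[mask])  # keeps models a (1.3B) and c (7.2B)
```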
src/envs.py
CHANGED
@@ -5,12 +5,12 @@ from huggingface_hub import HfApi
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)

-REPO_ID = "
-QUEUE_REPO = "
-RESULTS_REPO = "
+REPO_ID = "choco9966/New-Open-Ko-LLM-Leaderboard"
+QUEUE_REPO = "choco9966/new-requests"
+RESULTS_REPO = "choco9966/new-results"

-PRIVATE_QUEUE_REPO = "
-PRIVATE_RESULTS_REPO = "
+PRIVATE_QUEUE_REPO = "choco9966/private-requests"
+PRIVATE_RESULTS_REPO = "choco9966/private-results"

 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))

src/leaderboard/read_evals.py
CHANGED
@@ -48,7 +48,7 @@ class EvalResult:
         precision = Precision.from_str(config.get("model_dtype"))

         # Get model and org
-        org_and_model = config.get("model_name", 
+        org_and_model = config.get("model_name", None)
         org_and_model = org_and_model.split("/", 1)

         if len(org_and_model) == 1:
@@ -96,26 +96,19 @@
         results = {}
         for task in Tasks:
             task = task.value
-
-
-
-
-
-
-
-
-            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eq_bench", "ko_inst_follow", "kor_nat_cka", "kor_nat_sva", "ko_harmlessness", "ko_helpfulness"]:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+            if task.benchmark in ["ko_ifeval"]:
+                ko_ifeval = data["results"]["ko_ifeval"]
+                accs = np.mean([ko_ifeval["prompt_level_strict_acc,none"], ko_ifeval["inst_level_strict_acc,none"]])
+                mean_acc = np.mean(accs) * 100.0
+                results[task.benchmark] = mean_acc
+
+            if task.benchmark in ["ko_winogrande", "ko_gsm8k", "ko_eqbench", "kornat_common", "kornat_social", "kornat_harmless", "kornat_helpful", "ko_gpqa_diamond_zeroshot"]:
+                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
                 if accs.size == 0 or any([acc is None for acc in accs]):
-                    results[task.benchmark] = 0.0
                     continue
-
-
-
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
+
+                if task.benchmark not in ["ko_eqbench"]:
+                    mean_acc = accs[0] * 100.0
                 results[task.benchmark] = mean_acc

         return self(
@@ -151,27 +144,7 @@
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""

-
-        # TODO: safely remove this code when the task results are all added
-        skip_avg_len = 0
-        if self.results['ko_winogrande'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_gsm8k'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_eq_bench'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_inst_follow'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_cka'] == 0.0:
-            skip_avg_len += 1
-        if self.results['kor_nat_sva'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_harmlessness'] == 0.0:
-            skip_avg_len += 1
-        if self.results['ko_helpfulness'] == 0.0:
-            skip_avg_len += 1
-
-        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks) - skip_avg_len)
+        average = sum([v for v in self.results.values() if v is not None]) / (len(Tasks))

         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
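Two scoring changes land in this file: Ko-IFEval is scored as the mean of prompt-level and instruction-level strict accuracy, and `to_dict()` now always divides by `len(Tasks)` instead of skipping benchmarks that scored 0.0. A toy recomputation with invented numbers:

```python
import numpy as np

# Ko-IFEval: average of prompt-level and instruction-level strict accuracy, scaled to 0-100.
ko_ifeval = {"prompt_level_strict_acc,none": 0.42, "inst_level_strict_acc,none": 0.55}
ifeval_score = np.mean([ko_ifeval["prompt_level_strict_acc,none"],
                        ko_ifeval["inst_level_strict_acc,none"]]) * 100.0
print(ifeval_score)  # 48.5

# Leaderboard average: dividing by the full task count means missing benchmarks
# now pull the average down instead of being excluded (the old skip_avg_len logic).
results = {"ko_gsm8k": 31.0, "ko_winogrande": 60.0, "ko_ifeval": ifeval_score}
average = sum(v for v in results.values() if v is not None) / 9  # 9 Tasks in src/display/utils.py
print(round(average, 2))  # 15.5
```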
src/submission/submit.py
CHANGED
@@ -1,6 +1,7 @@
 import json
 import os
 from datetime import datetime, timezone
+import pandas as pd

 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
@@ -12,6 +13,7 @@ from src.submission.check_validity import (
     is_model_on_hub,
     user_submission_permission,
 )
+from src.populate import get_evaluation_queue_df

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
@@ -38,10 +40,7 @@ def add_new_eval(

     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    if True:
-        return styled_error("The current Season 1 will conclude on Friday, August 2, and the new season will commence on August 12.")
-
+
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")

@@ -100,6 +99,9 @@

     # Seems good, creating the eval
     print("Adding new eval")
+    dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, cols=["job_id"])
+    dfs = pd.concat(dfs).reset_index(drop=True)
+    max_job_id = max([int(c) for c in dfs["job_id"].values])

     eval_entry = {
         "model": model,
@@ -114,6 +116,7 @@
         "likes": model_info.likes,
         "params": model_size,
         "license": license,
+        "job_id": max_job_id+1
     }

     # Check for duplicate submission
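The submission path now derives a `job_id` by concatenating the queue dataframes and incrementing the maximum id. A toy version of that bookkeeping with fabricated queue contents (the real dataframes come from `get_evaluation_queue_df`):

```python
import pandas as pd

# Stand-ins for the dataframes returned by get_evaluation_queue_df(..., cols=["job_id"]).
finished = pd.DataFrame({"job_id": ["101", "102"]})
pending = pd.DataFrame({"job_id": ["103"]})

dfs = pd.concat([finished, pending]).reset_index(drop=True)
max_job_id = max(int(c) for c in dfs["job_id"].values)

eval_entry = {"model": "org/model", "job_id": max_job_id + 1}
print(eval_entry)  # {'model': 'org/model', 'job_id': 104}
```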