from src.display.utils import ModelType


TITLE = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/header_logo.png" style="width:30%;display:block;margin-left:auto;margin-right:auto">"""
BOTTOM_LOGO = """<img src="https://upstage-open-ko-llm-leaderboard-logos.s3.ap-northeast-2.amazonaws.com/footer_logo_1.png" style="width:50%;display:block;margin-left:auto;margin-right:auto">"""

INTRODUCTION_TEXT = f"""
🚀 The Open Ko-LLM Leaderboard 🇰🇷 objectively evaluates the performance of Korean Large Language Models (LLMs).

When you submit a model on the "Submit here!" page, it is automatically evaluated. The GPU used for evaluation is operated with the support of __[KT](https://cloud.kt.com/)__.
The data used for evaluation consists of datasets that assess reasoning, language understanding, hallucination, and common sense.
The evaluation datasets are kept private and are used only for the evaluation process.
More detailed information about the benchmark datasets is provided on the “About” page.

This leaderboard is co-hosted by __[Upstage](https://www.upstage.ai)__ and __[NIA](https://www.nia.or.kr/site/nia_kor/main.do)__, which provides various Korean datasets through __[AI-Hub](https://aihub.or.kr)__, and is operated by __[Upstage](https://www.upstage.ai)__.
"""

LLM_BENCHMARKS_TEXT = f"""
# Context
While outstanding LLMs are being released competitively, most of them are centered on English and the English-speaking cultural sphere. We operate the Korean leaderboard, 🚀 Open Ko-LLM, to evaluate models that reflect the characteristics of the Korean language and Korean culture. Through this, we hope that users can conveniently use the leaderboard, participate, and contribute to the advancement of Korean LLM research.

## Icons
{ModelType.PT.to_str(" : ")} model
{ModelType.IFT.to_str(" : ")} model
{ModelType.RL.to_str(" : ")} model
If there is no icon, it indicates that there is insufficient information about the model.
Please provide information about the model through an issue! 🤩

πŸ΄β€β˜ οΈ : This icon indicates that the model has been selected as a subject of caution by the community, implying that users should exercise restraint when using it. Clicking on the icon will take you to a discussion about that model.
(Models that have used the evaluation set for training to achieve a high leaderboard ranking, among others, are selected as subjects of caution.)

## How it works

📈 We evaluate models using the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), a unified framework to test generative language models on a large number of different evaluation tasks.
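
For illustration, here is a minimal sketch of how a model might be scored through the harness's Python API (`lm_eval.evaluator.simple_evaluate`); the model id and task name below are placeholders, and the Korean evaluation sets used by this leaderboard are private rather than public harness tasks.

```python
# A minimal sketch of scoring a model with the evaluation harness (Python API).
# The model id and task name are placeholders, not the leaderboard's actual setup.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal",                            # Hugging Face causal LM backend
    model_args="pretrained=your-org/your-model",  # placeholder model id
    tasks=["hellaswag"],                          # placeholder public task
    num_fewshot=10,
)
print(results["results"])
```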

We have set up a benchmark using datasets translated into Korean, with variations applied by human experts, based on the four tasks (HellaSwag, MMLU, ARC, TruthfulQA) used by the Hugging Face Open LLM Leaderboard. We have also added a new dataset prepared from scratch.
- Ko-HellaSwag (provided by __[Upstage](https://www.upstage.ai/)__, machine translation)
- Ko-MMLU (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
- Ko-Arc (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
- Ko-Truthful QA (provided by __[Upstage](https://www.upstage.ai/)__, human translation and variation)
- Ko-CommonGen V2 (provided by __[Korea University NLP&AI Lab](http://nlp.korea.ac.kr/)__, created from scratch)

To provide an evaluation befitting the LLM era, we've selected benchmark datasets suitable for assessing these elements: expertise, inference, hallucination, and common sense. The final score is the average of the scores from the evaluation datasets.
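
As a small, purely illustrative sketch (the numbers below are made up), the final score is simply the mean of the per-dataset scores:

```python
# Hypothetical per-dataset scores (made-up values, for illustration only):
# Ko-HellaSwag, Ko-MMLU, Ko-Arc, Ko-Truthful QA, Ko-CommonGen V2
scores = [50.2, 41.3, 39.8, 45.1, 48.7]
final_score = sum(scores) / len(scores)  # simple average across the datasets
print(round(final_score, 2))             # 45.02
```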

GPUs are provided by __[KT](https://cloud.kt.com/)__ for the evaluations.

## Details and Logs
- Detailed numerical results in the `results` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/results
- Community queries and running status in the `requests` Upstage dataset: https://huggingface.co/datasets/open-ko-llm-leaderboard/requests

## More resources
If you still have questions, you can check our FAQ [here](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard/discussions/1)!
"""


FAQ_TEXT = """
"""


EVALUATION_QUEUE_TEXT = f"""
# Evaluation Queue for the 🚀 Open Ko-LLM Leaderboard
Models added here will be automatically evaluated on the KT GPU cluster.

## Some good practices before submitting a model

### 1️⃣ Make sure you can load your model and tokenizer using AutoClasses
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer

revision = "main"  # or the specific branch/commit you want evaluated
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```

If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

⚠️ Make sure your model is public!

⚠️ Make sure your model runs with the [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness).

⚠️ If your model needs `trust_remote_code=True`, we do not support this option yet, but we are working on adding it. Stay posted!

### 2️⃣ Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
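
As a rough sketch of one way to do this (assuming your model already loads with `AutoModel`; the repository and directory names below are placeholders), re-saving with `safe_serialization=True` writes safetensors weights:

```python
# A rough sketch of converting existing weights to safetensors.
# "your model name" and "your-local-dir" are placeholders.
from transformers import AutoModel

model = AutoModel.from_pretrained("your model name")
model.save_pretrained("your-local-dir", safe_serialization=True)  # writes model.safetensors
```

You can then upload the converted weights to your model repository on the Hub.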

### 3️⃣ Make sure your model has an open license!
This is a leaderboard for 🚀 Open Ko-LLMs, and we'd love for as many people as possible to know they can use your model!

### 4️⃣ Fill out your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped. Make sure you have followed the above steps first. If everything checks out, verify that you can run the EleutherAI Harness on your model locally (you can pass `--limit` to restrict the number of examples per task).
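
For a quick local smoke test before resubmitting, something along these lines can help surface loading or tokenization errors; the model id and task name are placeholders, and `limit` keeps the run short:

```python
# A short local check with the harness Python API (placeholders throughout).
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf-causal",
    model_args="pretrained=your-org/your-model",
    tasks=["hellaswag"],   # placeholder public task
    limit=10,              # only a few examples per task, for a fast check
)
print(results["results"])
```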
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results. Authors of open-ko-llm-leaderboard are ordered alphabetically."
CITATION_BUTTON_TEXT = r"""
@misc{open-ko-llm-leaderboard,
  author = {Chanjun Park and Hwalsuk Lee and Hyunbyung Park and Hyeonwoo Kim and Sanghoon Kim and Seonghwan Cho and Sunghun Kim and Sukyung Lee},
  title = {Open Ko-LLM Leaderboard},
  year = {2023},
  publisher = {Upstage, National Information Society Agency},
  howpublished = "\url{https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard}"
}
@software{eval-harness,
  author       = {Gao, Leo and
                  Tow, Jonathan and
                  Biderman, Stella and
                  Black, Sid and
                  DiPofi, Anthony and
                  Foster, Charles and
                  Golding, Laurence and
                  Hsu, Jeffrey and
                  McDonell, Kyle and
                  Muennighoff, Niklas and
                  Phang, Jason and
                  Reynolds, Laria and
                  Tang, Eric and
                  Thite, Anish and
                  Wang, Ben and
                  Wang, Kevin and
                  Zou, Andy},
  title        = {A framework for few-shot language model evaluation},
  month        = sep,
  year         = 2021,
  publisher    = {Zenodo},
  version      = {v0.0.1},
  doi          = {10.5281/zenodo.5371628},
  url          = {https://doi.org/10.5281/zenodo.5371628}
}
@misc{seo2023kocommongen,
      title={Korean Commonsense Reasoning Evaluation for Large Language Models},
      author={Jaehyung Seo and Chanjun Park and Hyeonseok Moon and Sugyeong Eo and Aram So and Heuiseok Lim},
      year={2023},
      affiliation={Korea University, NLP&AI},
      booktitle={Proceedings of the 35th Annual Conference on Human & Cognitive Language Technology}}
@misc{park2023koarc,
      title={Ko-ARC},
      original_title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
      author={Hyunbyung Park and Chanjun Park},
      original_author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
      year={2023}
}
@misc{park2023kohellaswag,
      title={Ko-HellaSwag},
      original_title={HellaSwag: Can a Machine Really Finish Your Sentence?},
      author={Hyunbyung Park and Chanjun Park},
      original_author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
      year={2023}
}
@misc{park2023kommlu,
      title={Ko-MMLU},
      original_title={Measuring Massive Multitask Language Understanding},
      author={Hyunbyung Park and Chanjun Park},
      original_author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
      year={2023}
}
@misc{park2023kotruthfulqa,
      title={Ko-TruthfulQA},
      original_title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
      author={Hyunbyung Park and Chanjun Park},
      original_author={Stephanie Lin and Jacob Hilton and Owain Evans},
      year={2023}
}
"""