description update
- src/about.py +24 -7
- src/display/utils.py +1 -1
- src/leaderboard/read_evals.py +5 -4
src/about.py  CHANGED

@@ -36,12 +36,18 @@ class Tasks(Enum):
     task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677)  # multiple_choice
     task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0)  # generate_until
     task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0)  # generate_until
-    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     task24 = Task("polish_poquad_open_book", "levenshtein,none", "poquad_open_book", "generate_until", 0.0)
     task25 = Task("polish_eq_bench_first_turn", "first_eqbench,none", "eq_bench_first_turn", "generate_until", 0.0)
     task26 = Task("polish_eq_bench", "average_eqbench,none", "eq_bench", "generate_until", 0.0)
+    task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
     # task27 = Task("polish_eq_bench", "revised_eqbench,none", "eq_bench_revised", "other", 0.0)

+
+g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+all_tasks = g_tasks + mc_tasks
+
 NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------

@@ -59,7 +65,7 @@ TITLE = """
 """

 # What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
+INTRODUCTION_TEXT = f"""
 The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
 For now, models are tested without theirs templates.

@@ -67,7 +73,14 @@ Almost every task has two versions: regex and multiple choice.
 * _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
 * _mc suffix means that a model is scored against every possible class (suitable also for base models)

-Average columns are normalized against scores by "Baseline (majority class)".
+Average columns are normalized against scores by "Baseline (majority class)". Tasks taken into account while calculating averages:
+* Average: {', '.join(all_tasks)}
+* Avg g: {', '.join(g_tasks)}
+* Avg mc: {', '.join(mc_tasks)}
+* Acg RAG: {', '.join(rag_tasks)}
+
+* `,chat` suffix means that a model is tested using chat templates
+* `,chat,multiturn` suffix means that a model is tested using chat templates and fewshot examples are treated as a multi-turn conversation

 We gratefully acknowledge Polish high-performance computing infrastructure PLGrid (HPC Centers: ACK Cyfronet AGH) for providing computer facilities and support within computational grant no. PLG/2024/016951.
 """

@@ -85,7 +98,6 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 * fix long model names
 * add inference time
 * add more tasks
-* use model templates
 * fix scrolling on Firefox

 ## Tasks

@@ -114,12 +126,15 @@ or join our [Discord SpeakLeash](https://discord.gg/FfYp4V6y3R)
 | polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
 | polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
 | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
+| polish_poquad_open_book | enelpol/poleval2018_task3_test_10k | levenshtein | generate_until |
+| polish_eq_bench_first_turn | speakleash/EQ-Bench-PL | eq_bench | generate_until |
+| polish_eq_bench | speakleash/EQ-Bench-PL | eq_bench | generate_until |

 ## Reproducibility
 To reproduce our results, you need to clone the repository:

 ```
-git clone https://github.com/speakleash/lm-evaluation-harness.git -b
+git clone https://github.com/speakleash/lm-evaluation-harness.git -b polish3
 cd lm-evaluation-harness
 pip install -e .
 ```

@@ -127,8 +142,10 @@ pip install -e .
 and run benchmark for 0-shot and 5-shot:

 ```
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
-lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 0 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_generate_few --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
+lm_eval --model hf --model_args pretrained=speakleash/Bielik-7B-Instruct-v0.1 --tasks polish_mc --num_fewshot 5 --device cuda:0 --batch_size 16 --verbosity DEBUG --output_path results/ --log_samples
 ```

 ## List of Polish models
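For context, the new module-level lists (`g_tasks`, `mc_tasks`, `all_tasks`) rely on the `Task` record and `Tasks` enum defined earlier in src/about.py, which are not part of this diff. Below is a minimal sketch of that structure, with field names inferred from the `task.value.benchmark` / `task.value.type` accesses above; treat it as an assumption, not the file's exact code.

```python
# Minimal sketch only -- the real Task/Tasks definitions live earlier in
# src/about.py; field names here are inferred from how the diff uses them.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str         # lm-evaluation-harness task name
    metric: str            # metric key, e.g. "levenshtein,none"
    col_name: str          # column label shown on the leaderboard
    type: str              # "generate_until", "multiple_choice" or "other"
    baseline: float = 0.0  # majority-class baseline used for normalization


class Tasks(Enum):
    task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0)
    task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677)


# With that shape, the lists added by this commit follow directly:
g_tasks = [t.value.benchmark for t in Tasks if t.value.type == "generate_until"]
mc_tasks = [t.value.benchmark for t in Tasks if t.value.type == "multiple_choice"]
all_tasks = g_tasks + mc_tasks
print(all_tasks)  # ['polish_polqa_open_book', 'polish_polqa_reranking_multiple_choice']
```

Defining the lists at module level (rather than inside `to_dict`) is what lets both the introduction text and src/leaderboard/read_evals.py import the same task groupings.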
src/display/utils.py  CHANGED

@@ -34,11 +34,11 @@ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
 auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
 auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
 auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
+auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)])

 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
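This hunk only moves the "Avg RAG" entry ahead of the per-task columns. In the stock Hugging Face leaderboard template that this Space appears to follow, `auto_eval_column_dict` is later converted into a frozen `AutoEvalColumn` dataclass, so the entry's position determines column order in the UI. The sketch below illustrates that pattern under those assumptions; the `ColumnContent` definition and the conversion call are not shown in this diff.

```python
# Hedged sketch of the usual leaderboard-template pattern; not code from this commit.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                        # header text shown in the UI
    type: str                        # "number", "str", "markdown", ...
    displayed_by_default: bool = True


auto_eval_column_dict = [
    ["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)],
    ["average_rag", ColumnContent, ColumnContent("Avg RAG", "number", True)],  # the repositioned column
]

# Each [name, type, default] triple becomes a field of a generated dataclass,
# so display code can refer to AutoEvalColumn.average_rag.name, .type, etc.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn().average_rag.name)  # -> "Avg RAG"
```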
src/leaderboard/read_evals.py  CHANGED

@@ -8,6 +8,7 @@ from dataclasses import dataclass
 import dateutil
 import numpy as np

+from src.about import all_tasks, g_tasks, mc_tasks, rag_tasks
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType, NShotType
 from src.submission.check_validity import is_model_on_hub

@@ -183,10 +184,10 @@ class EvalResult:

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
-        mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
-        rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
-        all_tasks = g_tasks + mc_tasks
+        # g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
+        # mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
+        # rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book']
+        # all_tasks = g_tasks + mc_tasks
         all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]

         baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
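To make the role of the shared task lists and the `baselines` dict concrete, here is an illustrative sketch of a baseline-normalized average in the spirit of the introduction text ("normalized against scores by Baseline (majority class)"). This is not the repository's exact `to_dict` code; the helper name, the shape of the results dict, and the example numbers are assumptions.

```python
# Hypothetical helper, not code from this commit: rescale each raw score (in percent)
# so the majority-class baseline maps to 0 and a perfect score stays at 100, then
# average over a task subset such as all_tasks, g_tasks, mc_tasks or rag_tasks.
import numpy as np


def normalized_average(results: dict[str, float], tasks: list[str], baselines: dict[str, float]) -> float:
    normalized = []
    for task in tasks:
        if task not in results:
            continue  # skip tasks the model was not evaluated on
        baseline = baselines.get(task, 0.0)
        normalized.append(100.0 * (results[task] - baseline) / (100.0 - baseline))
    return float(np.mean(normalized)) if normalized else 0.0


# Example with made-up numbers:
scores = {"polish_polqa_reranking_multiple_choice": 75.0, "polish_polqa_open_book": 60.0}
bases = {"polish_polqa_reranking_multiple_choice": 53.36, "polish_polqa_open_book": 0.0}
print(round(normalized_average(scores, list(scores), bases), 1))  # prints roughly 53.2
```

Importing the lists from src/about.py (rather than rebuilding them inside `to_dict`, as the now-commented lines did) keeps the leaderboard display and the introduction text in sync about which tasks feed each average.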