Add files using upload-large-folder tool
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/utils.py +32 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml +34 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/utils.py +32 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/_held_in_template_yaml +14 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml +352 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml +13 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/minerva_math.yaml +15 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/README.md +43 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml +21 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/openllm.yaml +18 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/pythia.yaml +12 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/t0_eval.yaml +127 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/README.md +65 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_default.yaml +19 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_default.yaml +21 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_star.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_star.yaml +3 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/README.md +328 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml +26 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml +14 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml +55 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml +84 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml +32 -0
- scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml +114 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: fra
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_fra
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: hau
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_hau
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: lin
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_lin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: lug
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_lug
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: orm
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_orm
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: sna
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_sna
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: sot
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_sot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: swa
+include: afrimmlu_common_yaml
+task: afrimmlu_direct_swa
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/utils.py
ADDED
@@ -0,0 +1,32 @@
+from lm_eval.utils import weighted_f1_score
+
+
+def doc_to_choice(doc):
+    choices = eval(doc["choices"])
+    return choices
+
+
+def doc_to_text(doc):
+    output = """You are a highly knowledgeable and intelligent artificial intelligence
+    model answers multiple-choice questions about {subject}
+
+    Question: {question}
+
+    Choices:
+    A: {choice1}
+    B: {choice2}
+    C: {choice3}
+    D: {choice4}
+
+    Answer: """
+
+    choices = eval(doc["choices"])
+    text = output.format(
+        subject=doc["subject"],
+        question=doc["question"],
+        choice1=choices[0],
+        choice2=choices[1],
+        choice3=choices[2],
+        choice4=choices[3],
+    )
+    return text
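For reference, a minimal standalone sketch (not part of the diff) of the prompt that `doc_to_text` above renders. The record below is a made-up example with the `subject`, `question`, and `choices` fields the function expects; `choices` is stored as a stringified list, which is why the file calls `eval` on it.

```python
# Illustration only: reproduce the doc_to_text formatting on a hypothetical record.
doc = {
    "subject": "elementary mathematics",   # hypothetical value
    "question": "What is 2 + 3?",          # hypothetical value
    "choices": "['4', '5', '6', '7']",     # stringified list, as in the dataset
}

choices = eval(doc["choices"])             # -> ['4', '5', '6', '7']
prompt = (
    "You are a highly knowledgeable and intelligent artificial intelligence\n"
    "    model answers multiple-choice questions about {subject}\n\n"
    "    Question: {question}\n\n"
    "    Choices:\n"
    "    A: {choice1}\n    B: {choice2}\n    C: {choice3}\n    D: {choice4}\n\n"
    "    Answer: "
).format(
    subject=doc["subject"],
    question=doc["question"],
    choice1=choices[0],
    choice2=choices[1],
    choice3=choices[2],
    choice4=choices[3],
)
print(prompt)
```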
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml
ADDED
@@ -0,0 +1,34 @@
+group:
+  - afrimmlu_translate
+task: null
+dataset_path: masakhane/afrimmlu-translate-test
+dataset_name: null
+output_type: multiple_choice
+test_split: test
+doc_to_text: !function utils.doc_to_text
+doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
+doc_to_choice: !function utils.doc_to_choice
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
+metric_list:
+  - metric: f1
+    aggregation: !function utils.weighted_f1_score
+    # aggregation: mean
+    average: weighted
+    hf_evaluate: true
+    higher_is_better: True
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+metadata:
+  version: 1.0
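The `f1` metric above is aggregated with `utils.weighted_f1_score`, which this 50-file view does not show. As a rough sketch of what such an aggregation function typically looks like in the harness (an assumption for illustration, not the file's actual contents), each collected item is a `(reference, prediction)` pair and the pairs are scored with scikit-learn's weighted F1:

```python
# Sketch only: the real utils.weighted_f1_score is not included in this diff view.
from sklearn.metrics import f1_score


def weighted_f1_score(items):
    # items: list of (reference, prediction) pairs collected per document
    golds, preds = zip(*items)
    return f1_score(golds, preds, average="weighted")
```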
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: amh
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_amh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: eng
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_eng
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: ewe
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_ewe
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: fra
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_fra
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: hau
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_hau
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: ibo
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_ibo
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: kin
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_kin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: lin
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_lin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: lug
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_lug
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: orm
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_orm
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: sna
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_sna
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: sot
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_sot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: swa
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_swa
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: twi
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_twi
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: wol
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_wol
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: xho
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_xho
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: yor
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_yor
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml
ADDED
@@ -0,0 +1,3 @@
+dataset_name: zul
+include: afrimmlu_common_translate_yaml
+task: afrimmlu_translate_zul
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/utils.py
ADDED
@@ -0,0 +1,32 @@
+from lm_eval.utils import weighted_f1_score
+
+
+def doc_to_choice(doc):
+    choices = eval(doc["choices"])
+    return choices
+
+
+def doc_to_text(doc):
+    output = """You are a highly knowledgeable and intelligent artificial intelligence
+    model answers multiple-choice questions about '{subject}'
+
+    Question: '''{question}'''
+
+    Choices:
+    A: ''{choice1}'''
+    B: ''{choice2}'''
+    C: ''{choice3}'''
+    D: ''{choice4}'''
+
+    Answer: """
+
+    choices = eval(doc["choices"])
+    text = output.format(
+        subject=doc["subject"],
+        question=doc["question"],
+        choice1=choices[0],
+        choice2=choices[1],
+        choice3=choices[2],
+        choice4=choices[3],
+    )
+    return text
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/_held_in_template_yaml
ADDED
@@ -0,0 +1,14 @@
+output_type: generate_until
+test_split: null
+doc_to_choice: null
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+generation_kwargs:
+  until:
+    - "</s>"
+  do_sample: false
+  temperature: 0.0
+metadata:
+  version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
ADDED
@@ -0,0 +1,352 @@
+group: flan_held_in
+group_alias: Flan (Held-In)
+task:
+  # ANLI R1
+  - group: anli_r1_flan
+    group_alias: ANLI R1
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: anli_r1_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-2
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-7
+        task_alias: prompt-7
+        include: _held_in_template_yaml
+        doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r1_prompt-8
+        task_alias: prompt-8
+        include: _held_in_template_yaml
+        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  # ANLI R2
+  - group: anli_r2_flan
+    group_alias: ANLI R2
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: anli_r2_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-2
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-7
+        task_alias: prompt-7
+        include: _held_in_template_yaml
+        doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r2_prompt-8
+        task_alias: prompt-8
+        include: _held_in_template_yaml
+        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  # ANLI R3
+  - group: anli_r3_flan
+    group_alias: ANLI R3
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: anli_r3_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-2
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-7
+        task_alias: prompt-7
+        include: _held_in_template_yaml
+        doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+      - task: anli_r3_prompt-8
+        task_alias: prompt-8
+        include: _held_in_template_yaml
+        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+  # Arc Easy
+  - group: arc_easy_flan
+    group_alias: Arc Easy
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: arc_easy_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_easy_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_easy_prompt-2
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_easy_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_easy_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_easy_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_easy_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  # Arc Challenge
+  - group: arc_challenge_flan
+    group_alias: Arc Challenge
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: arc_challenge_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_challenge_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_challenge_prompt-2
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_challenge_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_challenge_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_challenge_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+      - task: arc_challenge_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+  # BoolQ
+  - group: boolq_flan
+    group_alias: BoolQ
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: boolq_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-2
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-7
+        task_alias: prompt-7
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-8
+        task_alias: prompt-8
+        include: _held_in_template_yaml
+        doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+      - task: boolq_prompt-9
+        task_alias: prompt-9
+        include: _held_in_template_yaml
+        doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
+        doc_to_target: "{{['no', 'yes'][label]}}"
+  # RTE
+  - group: rte_flan
+    group_alias: RTE
+    aggregate_metric_list:
+      - metric: acc
+        weight_by_size: True
+    task:
+      - task: rte_prompt-0
+        task_alias: prompt-0
+        include: _held_in_template_yaml
+        doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-1
+        task_alias: prompt-1
+        include: _held_in_template_yaml
+        doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-1
+        task_alias: prompt-2
+        include: _held_in_template_yaml
+        doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-3
+        task_alias: prompt-3
+        include: _held_in_template_yaml
+        doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-4
+        task_alias: prompt-4
+        include: _held_in_template_yaml
+        doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-5
+        task_alias: prompt-5
+        include: _held_in_template_yaml
+        doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-6
+        task_alias: prompt-6
+        include: _held_in_template_yaml
+        doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-7
+        task_alias: prompt-7
+        include: _held_in_template_yaml
+        doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
+        doc_to_target: "{{['yes', 'no'][label]}}"
+      - task: rte_prompt-8
+        task_alias: prompt-8
+        include: _held_in_template_yaml
+        doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
+        doc_to_target: "{{['yes', 'no'][label]}}"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
ADDED
@@ -0,0 +1,13 @@
+group: flan_held_out
+task:
+  # BBH
+  - bbh_zeroshot
+  - bbh_fewshot
+  - bbh_cot_fewshot
+  - bbh_cot_zeroshot
+  # MMLU
+  - mmlu
+  - mmlu_flan_n_shot_generative
+  - mmlu_flan_n_shot_loglikelihood
+  - mmlu_flan_cot_zeroshot
+  - mmlu_flan_cot_fewshot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/minerva_math.yaml
ADDED
@@ -0,0 +1,15 @@
+group: minerva_math
+task:
+  - minerva_math_algebra
+  - minerva_math_counting_and_prob
+  - minerva_math_geometry
+  - minerva_math_intermediate_algebra
+  - minerva_math_num_theory
+  - minerva_math_prealgebra
+  - minerva_math_precalc
+aggregate_metric_list:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/README.md
ADDED
@@ -0,0 +1,43 @@
+# MultiMedQA (multiple-choice subset)
+
+### Paper
+
+Title: Large Language Models Encode Clinical Knowledge
+
+Abstract: https://arxiv.org/abs/2212.13138
+
+A benchmark combining four existing multiple-choice question answering datasets spanning professional medical exams and research queries.
+
+### Citation
+
+```
+@Article{Singhal2023,
+author={Singhal, Karan and Azizi, Shekoofeh and Tu, Tao and Mahdavi, S. Sara and Wei, Jason and Chung, Hyung Won and Scales, Nathan and Tanwani, Ajay and Cole-Lewis, Heather and Pfohl, Stephen and Payne, Perry and Seneviratne, Martin and Gamble, Paul and Kelly, Chris and Babiker, Abubakr and Sch{\"a}rli, Nathanael and Chowdhery, Aakanksha and Mansfield, Philip and Demner-Fushman, Dina and Ag{\"u}era y Arcas, Blaise and Webster, Dale and Corrado, Greg S. and Matias, Yossi and Chou, Katherine and Gottweis, Juraj and Tomasev, Nenad and Liu, Yun and Rajkomar, Alvin and Barral, Joelle and Semturs, Christopher and Karthikesalingam, Alan and Natarajan, Vivek},
+title={Large language models encode clinical knowledge},
+journal={Nature},
+year={2023},
+month={Aug},
+day={01},
+volume={620},
+number={7972},
+pages={172-180},
+issn={1476-4687},
+doi={10.1038/s41586-023-06291-2},
+url={https://doi.org/10.1038/s41586-023-06291-2}
+}
+```
+
+### Tasks
+
+* [PubMedQA](https://pubmedqa.github.io/) - 1,000 expert-labeled Q&A pairs, where a question and a corresponding PubMed abstract are given as context and a yes/maybe/no answer must be produced. Unlike the rest of the tasks in this suite, PubMedQA is a closed-domain Q&A task.
+* [MedQA](https://github.com/jind11/MedQA) - US Medical License Exam (USMLE) questions with 4 or 5 possible answers. Typically, only the 4-option questions are used.
+* [MedMCQA](https://medmcqa.github.io/) - 4-option multiple choice questions from Indian medical entrance examinations, >191k total questions.
+* [MMLU](https://arxiv.org/abs/2009.03300) - 4-option multiple choice exam questions from a variety of domains. The following 6 domains are utilized here:
+  * Anatomy
+  * Clinical Knowledge
+  * College Medicine
+  * Medical Genetics
+  * Professional Medicine
+  * College Biology
+
+Note that MultiMedQA also includes some short-form and long-form Q&A tasks (LiveQA, MedicationQA, HealthSearchQA). Evaluation on these tasks is usually done by experts rather than automatically, so they are not included here.
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml
ADDED
@@ -0,0 +1,21 @@
+group: multimedqa
+task:
+  - pubmedqa
+  - medmcqa
+  - medqa_4options
+  - task: mmlu_anatomy
+    task_alias: "anatomy (mmlu)"
+  - task: mmlu_clinical_knowledge
+    task_alias: "clinical_knowledge (mmlu)"
+  - task: mmlu_college_medicine
+    task_alias: "college_medicine (mmlu)"
+  - task: mmlu_medical_genetics
+    task_alias: "medical_genetics (mmlu)"
+  - task: mmlu_professional_medicine
+    task_alias: "professional_medicine (mmlu)"
+  - task: mmlu_college_biology
+    task_alias: "college_biology (mmlu)"
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: True
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/openllm.yaml
ADDED
@@ -0,0 +1,18 @@
+group: openllm
+group_alias: Open LLM Leaderboard
+task:
+  - task: arc_challenge
+    fewshot_split: validation
+    num_fewshot: 25
+  - task: hellaswag
+    fewshot_split: train
+    num_fewshot: 10
+  - task: truthfulqa
+    num_fewshot: 0
+  - task: mmlu
+    num_fewshot: 5
+  - task: winogrande
+    fewshot_split: train
+    num_fewshot: 5
+  - task: gsm8k
+    num_fewshot: 5
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/pythia.yaml
ADDED
@@ -0,0 +1,12 @@
+group: pythia
+task:
+  - lambada_openai
+  - logiqa
+  - piqa
+  - sciq
+  - wikitext
+  - winogrande
+  - wsc
+  - ai2_arc
+  - blimp
+  - mmlu
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/t0_eval.yaml
ADDED
@@ -0,0 +1,127 @@
+group: t0_eval
+task:
+  # Coreference Resolution
+  - dataset_path: super_glue
+    dataset_name: wsc.fixed
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  # Coreference Resolution
+  - dataset_path: winogrande
+    dataset_name: winogrande_xl
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  # Natural Language Inference
+  - dataset_path: super_glue
+    dataset_name: cb
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  - dataset_path: super_glue
+    dataset_name: rte
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  - task: anli_r1
+    dataset_path: anli
+    use_prompt: promptsource:*
+    training_split: train_r1
+    validation_split: dev_r1
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  - task: anli_r2
+    dataset_path: anli
+    use_prompt: promptsource:*
+    training_split: train_r2
+    validation_split: dev_r2
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  - task: anli_r3
+    dataset_path: anli
+    use_prompt: promptsource:*
+    training_split: train_r3
+    validation_split: dev_r3
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  # Sentence Completion
+  - dataset_path: super_glue
+    dataset_name: copa
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  # Natural Language Inference
+  - dataset_path: hellaswag
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
+  # Word Sense Disambiguation
+  - dataset_path: super_glue
+    dataset_name: wic
+    use_prompt: promptsource:*
+    training_split: train
+    validation_split: validation
+    output_type: generate_until
+    metric_list:
+      - metric: exact_match
+        aggregation: mean
+        higher_is_better: true
+        ignore_case: true
+        ignore_punctuation: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/README.md
ADDED
@@ -0,0 +1,65 @@
+# FLD
+
+### Paper
+
+Title: Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic
+
+Abstract: https://arxiv.org/abs/2308.07336
+
+**FLD** (**F**ormal **L**ogic **D**eduction) is a deductive reasoning benchmark.
+Given a set of facts and a hypothesis, an LLM is required to generate (i) proof steps to (dis-)prove the hypothesis, and (ii) an answer ("proved", "disproved", or "unknown").
+
+Unique features of FLD are:
+* It assesses the model's logical reasoning ability *isolated from knowledge*, as the facts are randomly constructed so that referring to existing knowledge never helps solve the task.
+* It assesses diverse reasoning patterns (i.e., deduction rules), as it is based on formal logic theory.
+* As a result, it is highly challenging. Indeed, even GPT-4 can solve only about half of the problems.
+
+Homepage: https://github.com/hitachi-nlp/FLD
+
+
+### Citation
+
+```
+@InProceedings{pmlr-v202-morishita23a,
+  title = {Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic},
+  author = {Morishita, Terufumi and Morio, Gaku and Yamaguchi, Atsuki and Sogawa, Yasuhiro},
+  booktitle = {Proceedings of the 40th International Conference on Machine Learning},
+  pages = {25254--25274},
+  year = {2023},
+  editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
+  volume = {202},
+  series = {Proceedings of Machine Learning Research},
+  month = {23--29 Jul},
+  publisher = {PMLR},
+  pdf = {https://proceedings.mlr.press/v202/morishita23a/morishita23a.pdf},
+  url = {https://proceedings.mlr.press/v202/morishita23a.html},
+}
+```
+
+### Groups and Tasks
+
+This release is the simplified version of FLD where a model is required to predict only an answer.
+This setting is described by "answer accuracy" in the original paper.
+
+#### Tasks in Group `fld`
+* `fld_default` is a basic task based on [FLD.v2](https://huggingface.co/datasets/hitachi-nlp/FLD.v2/viewer/star)
+* `fld_star` is a more challenging version based on [FLD.v2-star](https://huggingface.co/datasets/hitachi-nlp/FLD.v2/viewer/star)
+
+#### Tasks in Group `fld_logical_formula`
+Further, we have "logical formula" versions of the benchmarks, which evaluate LLMs' pure logical reasoning capabilities within the domain of logical formulas, rather than natural language:
+* `fld_logical_formula_default`
+* `fld_logical_formula_star`
+
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+* [x] Have you referenced the original paper that introduced the task?
+* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
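As a quick usage illustration (not part of the diff): the answer-accuracy task defined in `fld_default.yaml` below could be run from Python roughly as sketched here, assuming the harness's `simple_evaluate` entry point is available and with a placeholder checkpoint name.

```python
# Sketch only: run the FLD answer-accuracy task through the harness Python API.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",                                      # HuggingFace transformers backend
    model_args="pretrained=EleutherAI/pythia-1.4b",  # placeholder checkpoint (assumption)
    tasks=["fld_default"],
    num_fewshot=0,
)
# exact_match on the filtered output corresponds to the paper's "answer accuracy"
print(results["results"]["fld_default"])
```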
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_default.yaml
ADDED
@@ -0,0 +1,19 @@
+task: fld_default
+dataset_path: hitachi-nlp/FLD.v2
+dataset_name: default
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Based on the provided facts ($context$), either prove or disprove the hypothesis or state that it is unknown. {{prompt_serial}}"
+doc_to_target: world_assump_label
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
+metadata:
+  version: 2.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_default.yaml
ADDED
@@ -0,0 +1,21 @@
+group:
+  - fld_logical_formula
+task: fld_logical_formula_default
+dataset_path: hitachi-nlp/FLD.v2
+dataset_name: default
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: "Based on the provided facts ($context$), either prove or disprove the hypothesis or state that it is unknown. The facts and the hypothesis are written in logical formulas as follows: capital letters such as \"{A}\", \"{B}\", \"{AB}\" are predicates, small letters such as \"{a}\", \"{b}\", \"{ab}\" are constants, \"&\" is logical conjunction, \"v\" is logical disjunction, \"¬\" is negation, \"->\" is implication, \"(x)\" is \"for all x\", and \"(Ex)\" is \"for some x\". $hypothesis$ = {{hypothesis_formula}} ; $context$ = {{context_formula}} ; $proof$ = "
+doc_to_target: world_assump_label
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
+metadata:
+  version: 2.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_star.yaml
ADDED
@@ -0,0 +1,3 @@
+include: fld_logical_formula_default.yaml
+task: fld_logical_formula_star
+dataset_name: star
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_star.yaml
ADDED
@@ -0,0 +1,3 @@
+include: fld_default.yaml
+task: fld_star
+dataset_name: star
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/README.md
ADDED
@@ -0,0 +1,328 @@
# Leaderboard evaluations
Our goal with this group is to provide a version of these evaluations that stays
unchanged over time and that powers the Open LLM Leaderboard on HuggingFace.

As we want to evaluate models across capabilities, the list currently contains:
- BBH (3-shot, multichoice)
- GPQA (0-shot, multichoice)
- MMLU-Pro (5-shot, multichoice)
- MuSR (0-shot, multichoice)
- IFEval (0-shot, generative)
- MATH-lvl-5 (4-shot, generative, Minerva version)

Details on the choice of those evals can be found [here](https://huggingface.co/spaces/open-llm-leaderboard/blog)!
A minimal sketch of how to launch these task groups is given below.

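The sketch below shows one way these task groups might be launched from Python. It assumes the harness exposes a `simple_evaluate` entry point and an `hf` model backend; the model identifier is only a placeholder, and the exact argument set may differ between harness versions:

```python
import lm_eval

# Placeholder model; any Hugging Face causal LM identifier could be substituted.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=mistralai/Mistral-7B-v0.1,dtype=bfloat16",
    tasks=["leaderboard_bbh", "leaderboard_gpqa", "leaderboard_ifeval"],
    batch_size=8,
)

# Per-task metrics; the exact keys depend on each task's metric_list.
for task, metrics in results["results"].items():
    print(task, metrics)
```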
## BigBenchHard (BBH)

A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH).
These are the tasks for which prior language model evaluations did not
outperform the average human-rater.

### Paper

Title: Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them

BIG-Bench (Srivastava et al., 2022) is a diverse evaluation suite that focuses on tasks believed to be beyond the capabilities of current language models. Language models have already made good progress on this benchmark, with the best model in the BIG-Bench paper outperforming average reported human-rater results on 65% of the BIG-Bench tasks via few-shot prompting. But on what tasks do language models fall short of average human-rater performance, and are those tasks actually unsolvable by current language models?
In this work, we focus on a suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). These are the tasks for which prior language model evaluations did not outperform the average human-rater. We find that applying chain-of-thought (CoT) prompting to BBH tasks enables PaLM to surpass the average human-rater performance on 10 of the 23 tasks, and Codex (code-davinci-002) to surpass the average human-rater performance on 17 of the 23 tasks. Since many tasks in BBH require multi-step reasoning, few-shot prompting without CoT, as done in the BIG-Bench evaluations (Srivastava et al., 2022), substantially underestimates the best performance and capabilities of language models, which is better captured via CoT prompting. As further analysis, we explore the interaction between CoT and model scale on BBH, finding that CoT enables emergent task performance on several BBH tasks with otherwise flat scaling curves.

- Paper: https://huggingface.co/papers/2210.09261
- Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard

### Citation

```
@article{suzgun2022challenging,
  title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},
  author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},
  journal={arXiv preprint arXiv:2210.09261},
  year={2022}
}
```

### Groups

- `leaderboard_bbh`

### Tasks

- `leaderboard_bbh_boolean_expressions`
- `leaderboard_bbh_causal_judgement`
- `leaderboard_bbh_date_understanding`
- `leaderboard_bbh_disambiguation_qa`
- `leaderboard_bbh_dyck_languages`
- `leaderboard_bbh_formal_fallacies`
- `leaderboard_bbh_geometric_shapes`
- `leaderboard_bbh_hyperbaton`
- `leaderboard_bbh_logical_deduction_five_objects`
- `leaderboard_bbh_logical_deduction_seven_objects`
- `leaderboard_bbh_logical_deduction_three_objects`
- `leaderboard_bbh_movie_recommendation`
- `leaderboard_bbh_multistep_arithmetic_two`
- `leaderboard_bbh_navigate`
- `leaderboard_bbh_object_counting`
- `leaderboard_bbh_penguins_in_a_table`
- `leaderboard_bbh_reasoning_about_colored_objects`
- `leaderboard_bbh_ruin_names`
- `leaderboard_bbh_salient_translation_error_detection`
- `leaderboard_bbh_snarks`
- `leaderboard_bbh_sports_understanding`
- `leaderboard_bbh_temporal_sequences`
- `leaderboard_bbh_tracking_shuffled_objects_five_objects`
- `leaderboard_bbh_tracking_shuffled_objects_seven_objects`
- `leaderboard_bbh_tracking_shuffled_objects_three_objects`
- `leaderboard_bbh_web_of_lies`
- `leaderboard_bbh_word_sorting`

## GPQA

### Paper

Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark

We present GPQA, a challenging dataset of 448 multiple-choice questions written
by domain experts in biology, physics, and chemistry. We ensure that the
questions are high-quality and extremely difficult: experts who have or are
pursuing PhDs in the corresponding domains reach 65% accuracy (74% when
discounting clear mistakes the experts identified in retrospect), while highly
skilled non-expert validators only reach 34% accuracy, despite spending on
average over 30 minutes with unrestricted access to the web (i.e., the
questions are “Google-proof”). The questions are also difficult for
state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving
39% accuracy. If we are to use future AI systems to help us answer very hard
questions—for example, when developing new scientific knowledge—we need to
develop scalable oversight methods that enable humans to supervise their
outputs, which may be difficult even if the supervisors are themselves skilled
and knowledgeable. The difficulty of GPQA both for skilled non-experts and
frontier AI systems should enable realistic scalable oversight experiments,
which we hope can help devise ways for human experts to reliably get truthful
information from AI systems that surpass human capabilities.

- Paper: https://huggingface.co/papers/2311.12022
- Homepage: https://github.com/idavidrein/gpqa/tree/main

### Citation

```
@misc{rein2023gpqa,
  title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
  author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
  year={2023},
  eprint={2311.12022},
  archivePrefix={arXiv},
  primaryClass={cs.AI}
}
```

### Groups

- `leaderboard_gpqa`

### Tasks

- `leaderboard_gpqa_extended`
- `leaderboard_gpqa_diamond`
- `leaderboard_gpqa_main`

## IFEval

### Paper

Title: Instruction-Following Evaluation for Large Language Models

One core capability of Large Language Models (LLMs) is to follow natural
language instructions. However, the evaluation of such abilities is not
standardized: Human evaluations are expensive, slow, and not objectively
reproducible, while LLM-based auto-evaluation is potentially biased or limited
by the ability of the evaluator LLM. To overcome these issues, we introduce
Instruction-Following Eval (IFEval) for large language models. IFEval is a
straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set
of "verifiable instructions" such as "write in more than 400 words" and
"mention the keyword of AI at least 3 times". We identified 25 types of those
verifiable instructions and constructed around 500 prompts, with each prompt
containing one or more verifiable instructions. We show evaluation results of
two widely available LLMs on the market.

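To make the notion of a "verifiable instruction" concrete, here is a toy checker mirroring the two examples quoted above. It is an illustration only, not IFEval's actual implementation:

```python
def check_min_words(response: str, min_words: int = 400) -> bool:
    """Verifiable instruction: 'write in more than 400 words'."""
    return len(response.split()) > min_words

def check_keyword_count(response: str, keyword: str = "AI", min_count: int = 3) -> bool:
    """Verifiable instruction: 'mention the keyword of AI at least 3 times' (toy token match)."""
    return response.lower().split().count(keyword.lower()) >= min_count

response = "AI systems ..."  # model output to be judged
followed = check_min_words(response) and check_keyword_count(response)
```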
- Paper: https://huggingface.co/papers/2311.07911
- Homepage: https://github.com/google-research/google-research/tree/master/instruction_following_eval

### Citation

```
@article{zhou2023instructionfollowing,
  title={Instruction-Following Evaluation for Large Language Models},
  author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
  journal={arXiv preprint arXiv:2311.07911},
  year={2023},
}
```

### Tasks

- `leaderboard_ifeval`

## MATH-hard

This is the 4-shot variant of Minerva MATH, keeping only the level-5 questions.

### Paper

Title: Measuring Mathematical Problem Solving With the MATH Dataset

Many intellectual endeavors require mathematical problem solving, but this
skill remains beyond the capabilities of computers. To measure this ability in
machine learning models, we introduce MATH, a new dataset of 12,500 challenging
competition mathematics problems. Each problem in MATH has a full step-by-step
solution which can be used to teach models to generate answer derivations and
explanations.

NOTE: The few-shot setup and the generated-answer extraction are based on
[Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is
calculated using the `sympy` library. This requires additional dependencies,
which can be installed via the `lm-eval[math]` extra.

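The note above says answer equivalence is decided symbolically with `sympy` rather than by plain string comparison. A minimal sketch of that idea, assuming both answers have already been extracted from the generations; `is_equiv` is a hypothetical helper for illustration, not the harness's scorer:

```python
import sympy
from sympy.parsing.sympy_parser import parse_expr

def is_equiv(gold: str, pred: str) -> bool:
    """Treat two answer strings as equal if their difference simplifies to zero."""
    try:
        diff = parse_expr(gold) - parse_expr(pred)
        return sympy.simplify(diff) == 0
    except Exception:
        # Be permissive: fall back to plain string comparison if parsing fails.
        return gold.strip() == pred.strip()

print(is_equiv("1/2", "2/4"))      # True
print(is_equiv("2*x + x", "3*x"))  # True
```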
- Paper: https://huggingface.co/papers/2103.03874
- Homepage: https://github.com/hendrycks/math

### Citation

```
@article{hendrycksmath2021,
  title={Measuring Mathematical Problem Solving With the MATH Dataset},
  author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
  journal={NeurIPS},
  year={2021}
}
@misc{2206.14858,
  Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dye and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
  Title = {Solving Quantitative Reasoning Problems with Language Models},
  Year = {2022},
  Eprint = {arXiv:2206.14858},
}
```

### Groups

- `leaderboard_math_hard`

### Tasks

- `leaderboard_math_algebra_hard`
- `leaderboard_math_counting_and_prob_hard`
- `leaderboard_math_geometry_hard`
- `leaderboard_math_intermediate_algebra_hard`
- `leaderboard_math_num_theory_hard`
- `leaderboard_math_prealgebra_hard`
- `leaderboard_math_precalc_hard`

## MMLU-Pro

### Paper

Title: MMLU-Pro: A More Robust and Challenging Multi-Task Language
Understanding Benchmark

In the age of large-scale language models, benchmarks like the Massive
Multitask Language Understanding (MMLU) have been pivotal in pushing the
boundaries of what AI can achieve in language comprehension and reasoning
across diverse domains. However, as models continue to improve, their
performance on these benchmarks has begun to plateau, making it increasingly
difficult to discern differences in model capabilities. This paper introduces
MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven
MMLU benchmark by integrating more challenging, reasoning-focused questions and
expanding the choice set from four to ten options. Additionally, MMLU-Pro
eliminates the trivial and noisy questions in MMLU. Our experimental results
show that MMLU-Pro not only raises the challenge, causing a significant drop in
accuracy by 16% to 33% compared to MMLU, but also demonstrates greater stability
under varying prompts. With 24 different prompt styles tested, the sensitivity
of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in
MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT)
reasoning achieved better performance on MMLU-Pro compared to direct answering,
which is in stark contrast to the findings on the original MMLU, indicating
that MMLU-Pro includes more complex reasoning questions. Our assessments
confirm that MMLU-Pro is a more discriminative benchmark to better track
progress in the field.

- Paper: https://huggingface.co/papers/2406.01574
- Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro

### Citation

```
@misc{wang2024mmluprorobustchallengingmultitask,
  title={MMLU-Pro: A More Robust and Challenging Multi-Task Language
         Understanding Benchmark},
  author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
  year={2024},
  eprint={2406.01574},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2406.01574},
}
```

### Groups

- `leaderboard_mmlu_pro`

### Tasks

- `leaderboard_mmlu_pro`

## MuSR

### Paper

Title: MuSR: Testing the Limits of Chain-of-thought with Multistep Soft
Reasoning

While large language models (LLMs) equipped with techniques like
chain-of-thought prompting have demonstrated impressive capabilities, they
still fall short in their ability to reason robustly in complex settings.
However, evaluating LLM reasoning is challenging because system capabilities
continue to grow while benchmark datasets for tasks like logical deduction have
remained static. We introduce MuSR, a dataset for evaluating language models on
multistep soft reasoning tasks specified in a natural language narrative. This
dataset has two crucial features. First, it is created through a novel
neurosymbolic synthetic-to-natural generation algorithm, enabling the
construction of complex reasoning instances that challenge GPT-4 (e.g., murder
mysteries roughly 1000 words in length) and which can be scaled further as more
capable LLMs are released. Second, our dataset instances are free text
narratives corresponding to real-world domains of reasoning; this makes it
simultaneously much more challenging than other synthetically-crafted
benchmarks while remaining realistic and tractable for human annotators to
solve with high accuracy. We evaluate a range of LLMs and prompting techniques
on this dataset and characterize the gaps that remain for techniques like
chain-of-thought to perform robust reasoning.

- Paper: https://huggingface.co/papers/2310.16049
- Homepage: https://zayne-sprague.github.io/MuSR/

### Citation

```
@misc{sprague2024musrtestinglimitschainofthought,
  title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft
         Reasoning},
  author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett},
  year={2024},
  eprint={2310.16049},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2310.16049},
}
```

### Groups

- `leaderboard_musr`

### Tasks

- `leaderboard_musr_murder_mysteries`
- `leaderboard_musr_object_placements`
- `leaderboard_musr_team_allocation`
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml
ADDED
@@ -0,0 +1,26 @@
group: leaderboard_bbh
task:
  - leaderboard_bbh_boolean_expressions
  - leaderboard_bbh_causal_judgement
  - leaderboard_bbh_date_understanding
  - leaderboard_bbh_disambiguation_qa
  - leaderboard_bbh_formal_fallacies
  - leaderboard_bbh_geometric_shapes
  - leaderboard_bbh_hyperbaton
  - leaderboard_bbh_logical_deduction_five_objects
  - leaderboard_bbh_logical_deduction_seven_objects
  - leaderboard_bbh_logical_deduction_three_objects
  - leaderboard_bbh_movie_recommendation
  - leaderboard_bbh_navigate
  - leaderboard_bbh_object_counting
  - leaderboard_bbh_penguins_in_a_table
  - leaderboard_bbh_reasoning_about_colored_objects
  - leaderboard_bbh_ruin_names
  - leaderboard_bbh_salient_translation_error_detection
  - leaderboard_bbh_snarks
  - leaderboard_bbh_sports_understanding
  - leaderboard_bbh_temporal_sequences
  - leaderboard_bbh_tracking_shuffled_objects_five_objects
  - leaderboard_bbh_tracking_shuffled_objects_seven_objects
  - leaderboard_bbh_tracking_shuffled_objects_three_objects
  - leaderboard_bbh_web_of_lies
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml
ADDED
@@ -0,0 +1,14 @@
dataset_name: boolean_expressions
description: 'Evaluate the result of a random Boolean expression.'
doc_to_choice: ["False", "True"]
fewshot_config:
  sampler: first_n
  samples:
    - input: not ( ( not not True ) ) is
      target: 'False'
    - input: True and False and not True and True is
      target: 'False'
    - input: not not ( not ( False ) ) is
      target: 'True'
include: _fewshot_template_yaml
task: leaderboard_bbh_boolean_expressions
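These BBH configs share one pattern: `fewshot_config.samples` supplies the in-context examples, `doc_to_choice` lists the candidate answers, and the predicted answer is the choice whose continuation the model scores highest. The sketch below is schematic only; the real prompt layout lives in `_fewshot_template_yaml`, and `loglikelihood` is a stand-in for an actual model call:

```python
# Few-shot examples copied from boolean_expressions.yaml above.
FEWSHOT = [
    ("not ( ( not not True ) ) is", "False"),
    ("True and False and not True and True is", "False"),
    ("not not ( not ( False ) ) is", "True"),
]
CHOICES = ["False", "True"]  # doc_to_choice

def build_prompt(question: str) -> str:
    # Assumed Q/A layout; the real format is defined by _fewshot_template_yaml.
    shots = "\n\n".join(f"Q: {q}\nA: {a}" for q, a in FEWSHOT)
    return f"{shots}\n\nQ: {question}\nA:"

def loglikelihood(context: str, continuation: str) -> float:
    """Stand-in: a real harness would return log P(continuation | context) from the model."""
    raise NotImplementedError

def predict(question: str) -> str:
    prompt = build_prompt(question)
    scores = {c: loglikelihood(prompt, " " + c) for c in CHOICES}
    return max(scores, key=scores.get)
```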
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml
ADDED
@@ -0,0 +1,55 @@
dataset_name: formal_fallacies
description: 'Distinguish deductively valid arguments from formal fallacies.'
doc_to_choice: ["valid", "invalid"]
fewshot_config:
  sampler: first_n
  samples:
    - input: '"It is not always easy to see who is related to whom -- and in which ways.
        The following argument pertains to this question: To begin with, Lesley is a
        close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate
        of Lowell is sufficient for being a great-grandfather of Leroy. It follows that
        Lesley is a great-grandfather of Leroy."

        Is the argument, given the explicitly stated premises, deductively valid or
        invalid?

        Options:

        - valid

        - invalid'
      target: valid
    - input: '"It is not always easy to see who is related to whom -- and in which ways.
        The following argument pertains to this question: Whoever is not a great-grandfather
        of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient
        for not being a great-grandfather of Clyde. We may conclude: Everyone who is
        an ancestor of Dana is a stepbrother of Brian, too."

        Is the argument, given the explicitly stated premises, deductively valid or
        invalid?

        Options:

        - valid

        - invalid'
      target: valid
    - input: '"It is not always easy to grasp who is consuming which products. The following
        argument pertains to this question: Every infrequent user of Paul Mitchell shampoo
        is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap,
        or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo
        and, in the same time, a loyal buyer of Caress soap. It follows that whoever
        is an infrequent user of Paul Mitchell shampoo is not a regular consumer of
        Lush soap."

        Is the argument, given the explicitly stated premises, deductively valid or
        invalid?

        Options:

        - valid

        - invalid'
      target: invalid
include: _fewshot_template_yaml
task: leaderboard_bbh_formal_fallacies
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml
ADDED
@@ -0,0 +1,84 @@
dataset_name: geometric_shapes
description: 'Name geometric shapes from their SVG paths.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)"]
fewshot_config:
  sampler: first_n
  samples:
    - input: 'This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00
        L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/>
        draws a

        Options:

        (A) circle

        (B) heptagon

        (C) hexagon

        (D) kite

        (E) line

        (F) octagon

        (G) pentagon

        (H) rectangle

        (I) sector

        (J) triangle'
      target: (F)
    - input: 'This SVG path element <path d="M 14.19,26.04 L 51.43,39.21 L 58.44,36.69
        L 56.63,30.17 L 48.53,26.66 L 14.19,26.04"/> draws a

        Options:

        (A) circle

        (B) heptagon

        (C) hexagon

        (D) kite

        (E) line

        (F) octagon

        (G) pentagon

        (H) rectangle

        (I) sector

        (J) triangle'
      target: (G)
    - input: 'This SVG path element <path d="M 41.00,43.00 L 37.00,34.00 L 41.00,33.00
        L 45.00,34.00 L 41.00,43.00"/> draws a

        Options:

        (A) circle

        (B) heptagon

        (C) hexagon

        (D) kite

        (E) line

        (F) octagon

        (G) pentagon

        (H) rectangle

        (I) sector

        (J) triangle'
      target: (D)
include: _fewshot_template_yaml
task: leaderboard_bbh_geometric_shapes
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml
ADDED
@@ -0,0 +1,32 @@
dataset_name: hyperbaton
description: 'Order adjectives correctly in English sentences.'
doc_to_choice: ["(A)", "(B)"]
fewshot_config:
  sampler: first_n
  samples:
    - input: 'Which sentence has the correct adjective order:

        Options:

        (A) rubber terrible ship

        (B) terrible rubber ship'
      target: (B)
    - input: 'Which sentence has the correct adjective order:

        Options:

        (A) repulsive small Brazilian exercise ship

        (B) Brazilian repulsive exercise small ship'
      target: (A)
    - input: 'Which sentence has the correct adjective order:

        Options:

        (A) blue gold wonderful square shoe

        (B) wonderful square blue gold shoe'
      target: (B)
include: _fewshot_template_yaml
task: leaderboard_bbh_hyperbaton
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml
ADDED
@@ -0,0 +1,114 @@
dataset_name: reasoning_about_colored_objects
description: 'Answer extremely simple questions about the colors of objects on a surface.'
doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"]
fewshot_config:
  sampler: first_n
  samples:
    - input: 'On the nightstand, there is a red pencil, a purple mug, a burgundy keychain,
        a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the
        stress ball?

        Options:

        (A) red

        (B) orange

        (C) yellow

        (D) green

        (E) blue

        (F) brown

        (G) magenta

        (H) fuchsia

        (I) mauve

        (J) teal

        (K) turquoise

        (L) burgundy

        (M) silver

        (N) gold

        (O) black

        (P) grey

        (Q) purple

        (R) pink'
      target: (E)
    - input: 'On the table, you see a bunch of objects arranged in a row: a purple paperclip,
        a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve
        fidget spinner, and a burgundy pen. What is the color of the object directly
        to the right of the stress ball?

        Options:

        (A) red

        (B) orange

        (C) yellow

        (D) green

        (E) blue

        (F) brown

        (G) magenta

        (H) fuchsia

        (I) mauve

        (J) teal

        (K) turquoise

        (L) burgundy

        (M) silver

        (N) gold

        (O) black

        (P) grey

        (Q) purple

        (R) pink'
      target: (F)
    - input: 'On the nightstand, you see the following items arranged in a row: a teal
        plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug,
        a pink notebook, and a grey cup. How many non-orange items do you see to the
        left of the teal item?

        Options:

        (A) zero

        (B) one

        (C) two

        (D) three

        (E) four

        (F) five

        (G) six'
      target: (A)
include: _fewshot_template_yaml
task: leaderboard_bbh_reasoning_about_colored_objects