koichi12 committed (verified) · Commit 7d59045 · 1 Parent(s): 7c44d25

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.

Files changed (50)
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml +3 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml +3 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml +3 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml +3 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml +3 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml +3 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml +3 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml +3 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/utils.py +32 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml +34 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml +3 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml +3 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml +3 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml +3 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml +3 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml +3 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml +3 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml +3 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml +3 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml +3 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml +3 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml +3 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml +3 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml +3 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml +3 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml +3 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml +3 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml +3 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/utils.py +32 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/_held_in_template_yaml +14 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml +352 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml +13 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/minerva_math.yaml +15 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/README.md +43 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml +21 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/openllm.yaml +18 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/pythia.yaml +12 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/t0_eval.yaml +127 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/README.md +65 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_default.yaml +19 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_default.yaml +21 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_star.yaml +3 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_star.yaml +3 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/README.md +328 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml +26 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml +14 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml +55 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml +84 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml +32 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml +114 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_fra.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: fra
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_fra
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_hau.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: hau
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_hau
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lin.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: lin
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_lin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_lug.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: lug
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_lug
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_orm.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: orm
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_orm
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sna.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: sna
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_sna
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_sot.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: sot
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_sot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_swa.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: swa
+ include: afrimmlu_common_yaml
+ task: afrimmlu_direct_swa
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/utils.py ADDED
@@ -0,0 +1,32 @@
+ from lm_eval.utils import weighted_f1_score
+
+
+ def doc_to_choice(doc):
+     choices = eval(doc["choices"])
+     return choices
+
+
+ def doc_to_text(doc):
+     output = """You are a highly knowledgeable and intelligent artificial intelligence
+ model answers multiple-choice questions about {subject}
+
+ Question: {question}
+
+ Choices:
+ A: {choice1}
+ B: {choice2}
+ C: {choice3}
+ D: {choice4}
+
+ Answer: """
+
+     choices = eval(doc["choices"])
+     text = output.format(
+         subject=doc["subject"],
+         question=doc["question"],
+         choice1=choices[0],
+         choice2=choices[1],
+         choice3=choices[2],
+         choice4=choices[3],
+     )
+     return text
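Editorial sketch (not part of the commit): the two helpers above assume each AfriMMLU record stores its options as a stringified Python list in `choices`. The record below is hypothetical and only mirrors that schema; `ast.literal_eval` is shown as a safer way to parse the same field than the `eval` call used in the committed code.

```python
import ast

# Hypothetical AfriMMLU-style record; the field names ("subject", "question",
# stringified "choices") mirror what doc_to_text()/doc_to_choice() expect.
sample_doc = {
    "subject": "geography",
    "question": "Which ocean is the largest?",
    "choices": "['Atlantic', 'Pacific', 'Indian', 'Arctic']",
}

# The committed helpers call eval(); ast.literal_eval parses the same
# stringified list without executing arbitrary code.
choices = ast.literal_eval(sample_doc["choices"])
print(dict(zip("ABCD", choices)))
# {'A': 'Atlantic', 'B': 'Pacific', 'C': 'Indian', 'D': 'Arctic'}
```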
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_common_translate_yaml ADDED
@@ -0,0 +1,34 @@
+ group:
+   - afrimmlu_translate
+ task: null
+ dataset_path: masakhane/afrimmlu-translate-test
+ dataset_name: null
+ output_type: multiple_choice
+ test_split: test
+ doc_to_text: !function utils.doc_to_text
+ doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
+ doc_to_choice: !function utils.doc_to_choice
+ should_decontaminate: true
+ doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
+ metric_list:
+   - metric: f1
+     aggregation: !function utils.weighted_f1_score
+     # aggregation: mean
+     average: weighted
+     hf_evaluate: true
+     higher_is_better: True
+     ignore_case: true
+     ignore_punctuation: true
+     regexes_to_ignore:
+       - ","
+       - "\\$"
+   - metric: acc
+     aggregation: mean
+     higher_is_better: true
+     ignore_case: true
+     ignore_punctuation: true
+     regexes_to_ignore:
+       - ","
+       - "\\$"
+ metadata:
+   version: 1.0
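Editorial sketch (not part of the commit): each per-language file below only overrides `dataset_name` and `task` and inherits the rest of this template via `include`. Assuming this task tree is visible to the harness and that upstream lm-evaluation-harness's `simple_evaluate` entry point is available, a smoke-test run of one translated task could look like the following; the checkpoint name and `limit` are placeholders.

```python
# Sketch only; assumes upstream lm-evaluation-harness (v0.4+) with the
# lm_eval.simple_evaluate API and this afrimmlu/ folder on its task path.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                         # HuggingFace transformers backend
    model_args="pretrained=gpt2",       # placeholder checkpoint
    tasks=["afrimmlu_translate_swa"],   # one per-language task from this folder
    limit=8,                            # small subset for a quick sanity check
)
print(results["results"]["afrimmlu_translate_swa"])
```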
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_amh.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: amh
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_amh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_eng.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: eng
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_eng
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ewe.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: ewe
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_ewe
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_fra.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: fra
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_fra
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_hau.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: hau
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_hau
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_ibo.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: ibo
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_ibo
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_kin.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: kin
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_kin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lin.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: lin
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_lin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_lug.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: lug
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_lug
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_orm.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: orm
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_orm
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sna.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: sna
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_sna
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_sot.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: sot
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_sot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_swa.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: swa
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_swa
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_twi.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: twi
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_twi
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_wol.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: wol
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_wol
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_xho.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: xho
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_xho
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_yor.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: yor
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_yor
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/afrimmlu_translate_zul.yaml ADDED
@@ -0,0 +1,3 @@
+ dataset_name: zul
+ include: afrimmlu_common_translate_yaml
+ task: afrimmlu_translate_zul
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/translate/utils.py ADDED
@@ -0,0 +1,32 @@
+ from lm_eval.utils import weighted_f1_score
+
+
+ def doc_to_choice(doc):
+     choices = eval(doc["choices"])
+     return choices
+
+
+ def doc_to_text(doc):
+     output = """You are a highly knowledgeable and intelligent artificial intelligence
+ model answers multiple-choice questions about '{subject}'
+
+ Question: '''{question}'''
+
+ Choices:
+ A: ''{choice1}'''
+ B: ''{choice2}'''
+ C: ''{choice3}'''
+ D: ''{choice4}'''
+
+ Answer: """
+
+     choices = eval(doc["choices"])
+     text = output.format(
+         subject=doc["subject"],
+         question=doc["question"],
+         choice1=choices[0],
+         choice2=choices[1],
+         choice3=choices[2],
+         choice4=choices[3],
+     )
+     return text
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/_held_in_template_yaml ADDED
@@ -0,0 +1,14 @@
+ output_type: generate_until
+ test_split: null
+ doc_to_choice: null
+ metric_list:
+   - metric: exact_match
+     aggregation: mean
+     higher_is_better: true
+ generation_kwargs:
+   until:
+     - "</s>"
+   do_sample: false
+   temperature: 0.0
+ metadata:
+   version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml ADDED
@@ -0,0 +1,352 @@
+ group: flan_held_in
+ group_alias: Flan (Held-In)
+ task:
+   # ANLI R1
+   - group: anli_r1_flan
+     group_alias: ANLI R1
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: anli_r1_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-7
+         task_alias: prompt-7
+         include: _held_in_template_yaml
+         doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r1_prompt-8
+         task_alias: prompt-8
+         include: _held_in_template_yaml
+         doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+   # ANLI R2
+   - group: anli_r2_flan
+     group_alias: ANLI R2
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: anli_r2_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-7
+         task_alias: prompt-7
+         include: _held_in_template_yaml
+         doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r2_prompt-8
+         task_alias: prompt-8
+         include: _held_in_template_yaml
+         doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+   # ANLI R3
+   - group: anli_r3_flan
+     group_alias: ANLI R3
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: anli_r3_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-7
+         task_alias: prompt-7
+         include: _held_in_template_yaml
+         doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+       - task: anli_r3_prompt-8
+         task_alias: prompt-8
+         include: _held_in_template_yaml
+         doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
+         doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
+   # Arc Easy
+   - group: arc_easy_flan
+     group_alias: Arc Easy
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: arc_easy_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_easy_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+   # Arc Challenge
+   - group: arc_challenge_flan
+     group_alias: Arc Challenge
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: arc_challenge_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_challenge_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_challenge_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_challenge_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_challenge_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_challenge_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+       - task: arc_challenge_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
+         doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
+   # BoolQ
+   - group: boolq_flan
+     group_alias: BoolQ
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: boolq_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-2
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-7
+         task_alias: prompt-7
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-8
+         task_alias: prompt-8
+         include: _held_in_template_yaml
+         doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+       - task: boolq_prompt-9
+         task_alias: prompt-9
+         include: _held_in_template_yaml
+         doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
+         doc_to_target: "{{['no', 'yes'][label]}}"
+   # RTE
+   - group: rte_flan
+     group_alias: RTE
+     aggregate_metric_list:
+       - metric: acc
+         weight_by_size: True
+     task:
+       - task: rte_prompt-0
+         task_alias: prompt-0
+         include: _held_in_template_yaml
+         doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-1
+         task_alias: prompt-1
+         include: _held_in_template_yaml
+         doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-1
+         task_alias: prompt-2
+         include: _held_in_template_yaml
+         doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-3
+         task_alias: prompt-3
+         include: _held_in_template_yaml
+         doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-4
+         task_alias: prompt-4
+         include: _held_in_template_yaml
+         doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-5
+         task_alias: prompt-5
+         include: _held_in_template_yaml
+         doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-6
+         task_alias: prompt-6
+         include: _held_in_template_yaml
+         doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-7
+         task_alias: prompt-7
+         include: _held_in_template_yaml
+         doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
+         doc_to_target: "{{['yes', 'no'][label]}}"
+       - task: rte_prompt-8
+         task_alias: prompt-8
+         include: _held_in_template_yaml
+         doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
+         doc_to_target: "{{['yes', 'no'][label]}}"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml ADDED
@@ -0,0 +1,13 @@
+ group: flan_held_out
+ task:
+   # BBH
+   - bbh_zeroshot
+   - bbh_fewshot
+   - bbh_cot_fewshot
+   - bbh_cot_zeroshot
+   # MMLU
+   - mmlu
+   - mmlu_flan_n_shot_generative
+   - mmlu_flan_n_shot_loglikelihood
+   - mmlu_flan_cot_zeroshot
+   - mmlu_flan_cot_fewshot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/minerva_math.yaml ADDED
@@ -0,0 +1,15 @@
+ group: minerva_math
+ task:
+   - minerva_math_algebra
+   - minerva_math_counting_and_prob
+   - minerva_math_geometry
+   - minerva_math_intermediate_algebra
+   - minerva_math_num_theory
+   - minerva_math_prealgebra
+   - minerva_math_precalc
+ aggregate_metric_list:
+   - metric: exact_match
+     aggregation: mean
+     weight_by_size: true
+ metadata:
+   version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/README.md ADDED
@@ -0,0 +1,43 @@
+ # MultiMedQA (multiple-choice subset)
+
+ ### Paper
+
+ Title: Large Language Models Encode Clinical Knowledge
+
+ Abstract: https://arxiv.org/abs/2212.13138
+
+ A benchmark combining four existing multiple-choice question answering datasets spanning professional medical exams and research queries.
+
+ ### Citation
+
+ ```
+ @Article{Singhal2023,
+   author={Singhal, Karan and Azizi, Shekoofeh and Tu, Tao and Mahdavi, S. Sara and Wei, Jason and Chung, Hyung Won and Scales, Nathan and Tanwani, Ajay and Cole-Lewis, Heather and Pfohl, Stephen and Payne, Perry and Seneviratne, Martin and Gamble, Paul and Kelly, Chris and Babiker, Abubakr and Sch{\"a}rli, Nathanael and Chowdhery, Aakanksha and Mansfield, Philip and Demner-Fushman, Dina and Ag{\"u}era y Arcas, Blaise and Webster, Dale and Corrado, Greg S. and Matias, Yossi and Chou, Katherine and Gottweis, Juraj and Tomasev, Nenad and Liu, Yun and Rajkomar, Alvin and Barral, Joelle and Semturs, Christopher and Karthikesalingam, Alan and Natarajan, Vivek},
+   title={Large language models encode clinical knowledge},
+   journal={Nature},
+   year={2023},
+   month={Aug},
+   day={01},
+   volume={620},
+   number={7972},
+   pages={172-180},
+   issn={1476-4687},
+   doi={10.1038/s41586-023-06291-2},
+   url={https://doi.org/10.1038/s41586-023-06291-2}
+ }
+ ```
+
+ ### Tasks
+
+ * [PubMedQA](https://pubmedqa.github.io/) - 1,000 expert-labeled Q&A pairs where a question and corresponding PubMed abstract as context is given and the a yes/maybe/no answer must be produced. Unlike the rest of the tasks in this suite, PubMedQA is a closed-domain Q&A task.
+ * [MedQA](https://github.com/jind11/MedQA) - US Medical License Exam (USMLE) questions with 4 or 5 possible answers. Typically, only the 4-option questions are used.
+ * [MedMCQA](https://medmcqa.github.io/) - 4-option multiple choice questions from Indian medical entrance examinations, >191k total questions.
+ * [MMLU](https://arxiv.org/abs/2009.03300) - 4-option multiple choice exam questions from a variety of domains. The following 6 domains are utilized here:
+   * Anatomy
+   * Clinical Knowledge
+   * College Medicine
+   * Medical Genetics
+   * Professional Medicine
+   * College Biology
+
+ Note that MultiMedQA also includes some short-form and long-form Q&A tasks (LiveQA, MedicationQA, HealthSearchQA). Evaluation on these tasks is usually done by experts and is not typically performed automatically, and therefore is ignored here.
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml ADDED
@@ -0,0 +1,21 @@
+ group: multimedqa
+ task:
+   - pubmedqa
+   - medmcqa
+   - medqa_4options
+   - task: mmlu_anatomy
+     task_alias: "anatomy (mmlu)"
+   - task: mmlu_clinical_knowledge
+     task_alias: "clinical_knowledge (mmlu)"
+   - task: mmlu_college_medicine
+     task_alias: "college_medicine (mmlu)"
+   - task: mmlu_medical_genetics
+     task_alias: "medical_genetics (mmlu)"
+   - task: mmlu_professional_medicine
+     task_alias: "professional_medicine (mmlu)"
+   - task: mmlu_college_biology
+     task_alias: "college_biology (mmlu)"
+ aggregate_metric_list:
+   - metric: acc
+     aggregation: mean
+     weight_by_size: True
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/openllm.yaml ADDED
@@ -0,0 +1,18 @@
+ group: openllm
+ group_alias: Open LLM Leaderboard
+ task:
+   - task: arc_challenge
+     fewshot_split: validation
+     num_fewshot: 25
+   - task: hellaswag
+     fewshot_split: train
+     num_fewshot: 10
+   - task: truthfulqa
+     num_fewshot: 0
+   - task: mmlu
+     num_fewshot: 5
+   - task: winogrande
+     fewshot_split: train
+     num_fewshot: 5
+   - task: gsm8k
+     num_fewshot: 5
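Editorial note (not part of the commit): this group pins the Open LLM Leaderboard few-shot settings per task, so a caller does not need to pass a separate few-shot count. A minimal sketch of reading those settings back out of the file above (path as committed; plain PyYAML suffices here since the file uses no custom tags):

```python
import yaml

# Path as committed in this change set.
path = "scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/openllm.yaml"
with open(path) as f:
    cfg = yaml.safe_load(f)

for entry in cfg["task"]:
    # Every entry in this group is a mapping with an explicit num_fewshot.
    print(f"{entry['task']}: {entry.get('num_fewshot')} shots")
# arc_challenge: 25 shots, hellaswag: 10 shots, truthfulqa: 0 shots, ...
```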
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/pythia.yaml ADDED
@@ -0,0 +1,12 @@
+ group: pythia
+ task:
+   - lambada_openai
+   - logiqa
+   - piqa
+   - sciq
+   - wikitext
+   - winogrande
+   - wsc
+   - ai2_arc
+   - blimp
+   - mmlu
scripts/yans/lm-evaluation-harness/lm_eval/tasks/benchmarks/t0_eval.yaml ADDED
@@ -0,0 +1,127 @@
+ group: t0_eval
+ task:
+   # Coreference Resolution
+   - dataset_path: super_glue
+     dataset_name: wsc.fixed
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   # Coreference Resolution
+   - dataset_path: winogrande
+     dataset_name: winogrande_xl
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   # Natural Language Inference
+   - dataset_path: super_glue
+     dataset_name: cb
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   - dataset_path: super_glue
+     dataset_name: rte
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   - task: anli_r1
+     dataset_path: anli
+     use_prompt: promptsource:*
+     training_split: train_r1
+     validation_split: dev_r1
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   - task: anli_r2
+     dataset_path: anli
+     use_prompt: promptsource:*
+     training_split: train_r2
+     validation_split: dev_r2
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   - task: anli_r3
+     dataset_path: anli
+     use_prompt: promptsource:*
+     training_split: train_r3
+     validation_split: dev_r3
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   # Sentence Completion
+   - dataset_path: super_glue
+     dataset_name: copa
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   # Natural Language Inference
+   - dataset_path: hellaswag
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
+   # Word Sense Disambiguation
+   - dataset_path: super_glue
+     dataset_name: wic
+     use_prompt: promptsource:*
+     training_split: train
+     validation_split: validation
+     output_type: generate_until
+     metric_list:
+       - metric: exact_match
+         aggregation: mean
+         higher_is_better: true
+         ignore_case: true
+         ignore_punctuation: true
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/README.md ADDED
@@ -0,0 +1,65 @@
+ # FLD
+
+ ### Paper
+
+ Title: Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic
+
+ Abstract: https://arxiv.org/abs/2308.07336
+
+ **FLD** (**F**ormal **L**ogic **D**eduction) is a deductive reasoning benchmark.
+ Given a set of facts and a hypothesis, an LLM is required to generate (i) proof steps to (dis-)prove the hypothesis, and (ii) an answer ("proved", "disproved" or unknown").
+
+ Unique features of FLD are:
+ * It assesses the model's logical reasoning ability *isolated from knowledge*, as the facts are randomly constructed so that referring to existing knowledge never helps solve the task.
+ * It assesses diverse reasoning patterns (i.e., deduction rules), as it is based on formal logic theory.
+ * As a result, it is highly challenging. Indeed, even GPT-4 can solve only about half of the problems.
+
+ Homepage: https://github.com/hitachi-nlp/FLD
+
+
+ ### Citation
+
+ ```
+ @InProceedings{pmlr-v202-morishita23a,
+   title = {Learning Deductive Reasoning from Synthetic Corpus based on Formal Logic},
+   author = {Morishita, Terufumi and Morio, Gaku and Yamaguchi, Atsuki and Sogawa, Yasuhiro},
+   booktitle = {Proceedings of the 40th International Conference on Machine Learning},
+   pages = {25254--25274},
+   year = {2023},
+   editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
+   volume = {202},
+   series = {Proceedings of Machine Learning Research},
+   month = {23--29 Jul},
+   publisher = {PMLR},
+   pdf = {https://proceedings.mlr.press/v202/morishita23a/morishita23a.pdf},
+   url = {https://proceedings.mlr.press/v202/morishita23a.html},
+ }
+ ```
+
+ ### Groups and Tasks
+
+ This release is the simplified version of FLD where a model is required to predict only an answer.
+ This setting is described by "answer accuracy" in the original paper.
+
+ #### Tasks in Group `fld`
+ * `fld_default` is a basic task based on [FLD.v2](https://huggingface.co/datasets/hitachi-nlp/FLD.v2/viewer/star)
+ * `fld_star`: is a more challenging version based on [FLD.v2-star](https://huggingface.co/datasets/hitachi-nlp/FLD.v2/viewer/star)
+
+ #### Tasks in Group `fld_logical_formula`
+ Further, we have "logical formula" versions of the benchmarks, which evaluate LLMs' pure logical reasoning capabilities within the domain of logical formulas, rather than natural language:
+ * `fld_logical_formula_default`
+ * `fld_logical_formula_fld_star`
+
+
+ ### Checklist
+
+ For adding novel benchmarks/datasets to the library:
+ * [x] Is the task an existing benchmark in the literature?
+ * [x] Have you referenced the original paper that introduced the task?
+ * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+ If other tasks on this dataset are already supported:
+ * [ ] Is the "Main" variant of this task clearly denoted?
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_default.yaml ADDED
@@ -0,0 +1,19 @@
+ task: fld_default
+ dataset_path: hitachi-nlp/FLD.v2
+ dataset_name: default
+ training_split: train
+ validation_split: validation
+ test_split: test
+ doc_to_text: "Based on the provided facts ($context$), either prove or disprove the hypothesis or state that it is unknown. {{prompt_serial}}"
+ doc_to_target: world_assump_label
+ metric_list:
+   - metric: exact_match
+     aggregation: mean
+     higher_is_better: true
+ filter_list:
+   - name: remove_whitespace
+     filter:
+       - function: remove_whitespace
+       - function: take_first
+ metadata:
+   version: 2.0
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ group:
2
+ - fld_logical_formula
3
+ task: fld_logical_formula_default
4
+ dataset_path: hitachi-nlp/FLD.v2
5
+ dataset_name: default
6
+ training_split: train
7
+ validation_split: validation
8
+ test_split: test
9
+ doc_to_text: "Based on the provided facts ($context$), either prove or disprove the hypothesis or state that it is unknown. The facts and the hypothesis are written in logical formulas as follows: capital letters such as \"{A}\", \"{B}\", \"{AB}\" are predicates, small letters such as \"{a}\", \"{b}\", \"{ab}\" are constants, \"&\" is logical conjunction, \"v\" is logical disjunction, \"¬\" is negation, \"->\" is implication, \"(x)\" is \"for all x\", and \"(Ex)\" is \"for some x\". $hypothesis$ = {{hypothesis_formula}} ; $context$ = {{context_formula}} ; $proof$ = "
10
+ doc_to_target: world_assump_label
11
+ metric_list:
12
+ - metric: exact_match
13
+ aggregation: mean
14
+ higher_is_better: true
15
+ filter_list:
16
+ - name: remove_whitespace
17
+ filter:
18
+ - function: remove_whitespace
19
+ - function: take_first
20
+ metadata:
21
+ version: 2.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_logical_formula_star.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include: fld_logical_formula_default.yaml
2
+ task: fld_logical_formula_star
3
+ dataset_name: star
scripts/yans/lm-evaluation-harness/lm_eval/tasks/fld/fld_star.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ include: fld_default.yaml
2
+ task: fld_star
3
+ dataset_name: star
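Editorial sketch (not part of the commit): `fld_star.yaml` and `fld_logical_formula_star.yaml` rely on the harness's `include` mechanism, where the child file overrides keys of the included base config. The toy merge below illustrates that override semantics only; it is not the harness's actual loader, and the inline YAML is a trimmed stand-in for the committed files.

```python
import yaml

# Trimmed stand-ins for fld_default.yaml and fld_star.yaml above.
base = yaml.safe_load("""
task: fld_default
dataset_path: hitachi-nlp/FLD.v2
dataset_name: default
""")
override = yaml.safe_load("""
task: fld_star
dataset_name: star
""")

merged = {**base, **override}   # shallow merge: keys from the child win
print(merged)
# {'task': 'fld_star', 'dataset_path': 'hitachi-nlp/FLD.v2', 'dataset_name': 'star'}
```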
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/README.md ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Leaderboard evaluations
2
+ Our goal with this group is to create an unchanging through time version of
3
+ evaluations that will power the Open LLM Leaderboard on HuggingFace.
4
+
5
+ As we want to evaluate models across capabilities, the list currently contains:
6
+ - BBH (3-shots, multichoice)
7
+ - GPQA (0-shot, multichoice)
8
+ - mmlu-pro (5-shots, multichoice)
9
+ - Musr (0-shot, multichoice)
10
+ - ifeval (0-shot, generative)
11
+ - Math-lvl-5 (4-shots, generative, minerva version)
12
+
13
+
14
+ Details on the choice of those evals can be found [here](https://huggingface.co/spaces/open-llm-leaderboard/blog) !
15
+
16
+ ## BigBenchHard (BBH)
17
+
18
+ A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH).
19
+ These are the task for which prior language model evaluations did not
20
+ outperform the average human-rater.
21
+
22
+ ### Paper
23
+
24
+ Title: Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them
25
+
26
+ BIG-Bench (Srivastava et al., 2022) is a diverse evaluation suite that focuses on tasks believed to be beyond the capabilities of current language models. Language models have already made good progress on this benchmark, with the best model in the BIG-Bench paper outperforming average reported human-rater results on 65% of the BIG-Bench tasks via few-shot prompting. But on what tasks do language models fall short of average human-rater performance, and are those tasks actually unsolvable by current language models?
27
+ In this work, we focus on a suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). These are the task for which prior language model evaluations did not outperform the average human-rater. We find that applying chain-of-thought (CoT) prompting to BBH tasks enables PaLM to surpass the average human-rater performance on 10 of the 23 tasks, and Codex (code-davinci-002) to surpass the average human-rater performance on 17 of the 23 tasks. Since many tasks in BBH require multi-step reasoning, few-shot prompting without CoT, as done in the BIG-Bench evaluations (Srivastava et al., 2022), substantially underestimates the best performance and capabilities of language models, which is better captured via CoT prompting. As further analysis, we explore the interaction between CoT and model scale on BBH, finding that CoT enables emergent task performance on several BBH tasks with otherwise flat scaling curves.
28
+
29
+
30
+ - paper: https://huggingface.co/papers/2210.09261
31
+ - Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
32
+
33
+ ### Citation
34
+
35
+ ```
36
+ @article{suzgun2022challenging,
37
+ title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},
38
+ author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and and Wei, Jason},
39
+ journal={arXiv preprint arXiv:2210.09261},
40
+ year={2022}
41
+ }
42
+ ```
43
+
44
+ ### Groups
45
+
46
+ - `leaderboard_bbh`
47
+
48
+ ### Tasks
49
+
50
+ - `leaderboard_bbh_boolean_expressions`
51
+ - `leaderboard_bbh_causal_judgement`
52
+ - `leaderboard_bbh_date_understanding`
53
+ - `leaderboard_bbh_disambiguation_qa`
54
+ - `leaderboard_bbh_dyck_languages`
55
+ - `leaderboard_bbh_formal_fallacies`
56
+ - `leaderboard_bbh_geometric_shapes`
57
+ - `leaderboard_bbh_hyperbaton`
58
+ - `leaderboard_bbh_logical_deduction_five_objects`
59
+ - `leaderboard_bbh_logical_deduction_seven_objects`
60
+ - `leaderboard_bbh_logical_deduction_three_objects`
61
+ - `leaderboard_bbh_movie_recommendation`
62
+ - `leaderboard_bbh_multistep_arithmetic_two`
63
+ - `leaderboard_bbh_navigate`
64
+ - `leaderboard_bbh_object_counting`
65
+ - `leaderboard_bbh_penguins_in_a_table`
66
+ - `leaderboard_bbh_reasoning_about_colored_objects`
67
+ - `leaderboard_bbh_ruin_names`
68
+ - `leaderboard_bbh_salient_translation_error_detection`
69
+ - `leaderboard_bbh_snarks`
70
+ - `leaderboard_bbh_sports_understanding`
71
+ - `leaderboard_bbh_temporal_sequences`
72
+ - `leaderboard_bbh_tracking_shuffled_objects_five_objects`
73
+ - `leaderboard_bbh_tracking_shuffled_objects_seven_objects`
74
+ - `leaderboard_bbh_tracking_shuffled_objects_three_objects`
75
+ - `leaderboard_bbh_web_of_lies`
76
+ - `leaderboard_bbh_word_sorting`
77
+
78
+ ## GPQA
79
+
80
+ ### Paper
81
+
82
+ Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark
83
+
84
+ We present GPQA, a challenging dataset of 448 multiple-choice questions written
85
+ by domain experts in biology, physics, and chemistry. We ensure that the
86
+ questions are high-quality and extremely difficult: experts who have or are
87
+ pursuing PhDs in the corresponding domains reach 65% accuracy (74% when
88
+ discounting clear mistakes the experts identified in retrospect), while highly
89
+ skilled non-expert validators only reach 34% accuracy, despite spending on
90
+ average over 30 minutes with unrestricted access to the web (i.e., the
91
+ questions are “Google-proof”). The questions are also difficult for
92
+ state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving
93
+ 39% accuracy. If we are to use future AI systems to help us answer very hard
94
+ questions—for example, when developing new scientific knowledge—we need to
95
+ develop scalable oversight methods that enable humans to supervise their
96
+ outputs, which may be difficult even if the supervisors are themselves skilled
97
+ and knowledgeable. The difficulty of GPQA both for skilled non-experts and
98
+ frontier AI systems should enable realistic scalable oversight experiments,
99
+ which we hope can help devise ways for human experts to reliably get truthful
100
+ information from AI systems that surpass human capabilities.
101
+
102
+ - Paper: https://huggingface.co/papers/2311.12022
103
+ - Homepage: https://github.com/idavidrein/gpqa/tree/main
104
+
105
+ ### Citation
106
+
107
+ ```
108
+ @misc{rein2023gpqa,
109
+ title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark},
110
+ author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. Bowman},
111
+ year={2023},
112
+ eprint={2311.12022},
113
+ archivePrefix={arXiv},
114
+ primaryClass={cs.AI}
115
+ }
116
+ ```
117
+
118
+ ### Groups
119
+
120
+ - `leaderboard_gpqa`
121
+
122
+ ### Tasks
123
+
124
+ - `leaderboard_gpqa_extended`
125
+ - `leaderboard_gpqa_diamond`
126
+ - `leaderboard_gpqa_main`
127
+
128
+ ## IFEval
129
+
130
+ ### Paper
131
+
132
+ Title: Instruction-Following Evaluation for Large Language Models
133
+
134
+ One core capability of Large Language Models (LLMs) is to follow natural
135
+ language instructions. However, the evaluation of such abilities is not
136
+ standardized: Human evaluations are expensive, slow, and not objectively
137
+ reproducible, while LLM-based auto-evaluation is potentially biased or limited
138
+ by the ability of the evaluator LLM. To overcome these issues, we introduce
139
+ Instruction-Following Eval (IFEval) for large language models. IFEval is a
140
+ straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set
141
+ of "verifiable instructions" such as "write in more than 400 words" and
142
+ "mention the keyword of AI at least 3 times". We identified 25 types of those
143
+ verifiable instructions and constructed around 500 prompts, with each prompt
144
+ containing one or more verifiable instructions. We show evaluation results of
145
+ two widely available LLMs on the market.
146
+
147
+ - Paper: https://huggingface.co/papers/2311.07911
148
+ - Homepage: https://github.com/google-research/google-research/tree/master/instruction_following_eval
149
+
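+ As a rough illustration of what a "verifiable instruction" means (a simplified
+ sketch, not the checkers shipped with IFEval or wired into this task), the two
+ example instructions from the abstract can be checked programmatically:
+
+ ```python
+ # Simplified sketch of two "verifiable instructions" from the abstract.
+ # The real IFEval checkers (see the homepage above) are more thorough.
+
+ def more_than_n_words(response: str, n: int = 400) -> bool:
+     """'write in more than 400 words'"""
+     return len(response.split()) > n
+
+ def mentions_keyword(response: str, keyword: str = "AI", times: int = 3) -> bool:
+     """'mention the keyword of AI at least 3 times'"""
+     return response.count(keyword) >= times
+
+ reply = "AI is useful. AI is verifiable. AI is easy to count."
+ print(more_than_n_words(reply), mentions_keyword(reply))  # False True
+ ```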
150
+ ### Citation
151
+
152
+ ```
153
+ @article{zhou2023instructionfollowing,
154
+ title={Instruction-Following Evaluation for Large Language Models},
155
+ author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou},
156
+ journal={arXiv preprint arXiv:2311.07911},
157
+ year={2023},
158
+ }
159
+ ```
160
+
161
+ ### Tasks
162
+
163
+ - `leaderboard_ifeval`
164
+
165
+ ## MATH-hard
166
+
167
+ This is the 4-shot variant of Minerva MATH that keeps only the Level 5 questions.
168
+
169
+ ### Paper
170
+
171
+ Title: Measuring Mathematical Problem Solving With the MATH Dataset
172
+
173
+ Many intellectual endeavors require mathematical problem solving, but this
174
+ skill remains beyond the capabilities of computers. To measure this ability in
175
+ machine learning models, we introduce MATH, a new dataset of 12,500 challenging
176
+ competition mathematics problems. Each problem in MATH has a full step-by-step
177
+ solution which can be used to teach models to generate answer derivations and
178
+ explanations.
179
+
180
+ NOTE: The few-shot prompting and the generated-answer extraction follow the
181
+ [Minerva](https://arxiv.org/abs/2206.14858) setup, and exact-match equivalence is
182
+ calculated using the `sympy` library. This requires additional dependencies,
183
+ which can be installed via the `lm-eval[math]` extra.
184
+
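+ As a rough sketch of what `sympy`-based equivalence means (the harness's real
+ answer extraction and comparison is more involved), two answer strings are
+ compared symbolically rather than character by character:
+
+ ```python
+ # Rough sketch: "1/2", "2/4" and "0.5" should all count as the same answer.
+ from sympy import simplify, sympify
+
+ def symbolically_equal(a: str, b: str) -> bool:
+     try:
+         return simplify(sympify(a) - sympify(b)) == 0
+     except Exception:
+         return a.strip() == b.strip()  # fall back to a plain string match
+
+ print(symbolically_equal("1/2", "0.5"))   # True
+ print(symbolically_equal("2/4", "1/2"))   # True
+ print(symbolically_equal("1/2", "0.49"))  # False
+ ```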
185
+ - Paper: https://huggingface.co/papers/2103.03874
186
+ - Homepage: https://github.com/hendrycks/math
187
+
188
+
189
+ ### Citation
190
+
191
+ ```
192
+ @article{hendrycksmath2021,
193
+ title={Measuring Mathematical Problem Solving With the MATH Dataset},
194
+ author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
195
+ journal={NeurIPS},
196
+ year={2021}
197
+ }
198
+ @misc{2206.14858,
199
+ Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dye and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra},
200
+ Title = {Solving Quantitative Reasoning Problems with Language Models},
201
+ Year = {2022},
202
+ Eprint = {arXiv:2206.14858},
203
+ }
204
+ ```
205
+
206
+ ### Groups
207
+
208
+ - `leaderboard_math_hard`
209
+
210
+ ### Tasks
211
+
212
+ - `leaderboard_math_algebra_hard`
213
+ - `leaderboard_math_counting_and_prob_hard`
214
+ - `leaderboard_math_geometry_hard`
215
+ - `leaderboard_math_intermediate_algebra_hard`
216
+ - `leaderboard_math_num_theory_hard`
217
+ - `leaderboard_math_prealgebra_hard`
218
+ - `leaderboard_math_precalc_hard`
219
+
220
+
221
+ ## MMLU-Pro
222
+
223
+ ### Paper
224
+
225
+ Title: MMLU-Pro: A More Robust and Challenging Multi-Task Language
226
+ Understanding Benchmark
227
+
228
+ In the age of large-scale language models, benchmarks like the Massive
229
+ Multitask Language Understanding (MMLU) have been pivotal in pushing the
230
+ boundaries of what AI can achieve in language comprehension and reasoning
231
+ across diverse domains. However, as models continue to improve, their
232
+ performance on these benchmarks has begun to plateau, making it increasingly
233
+ difficult to discern differences in model capabilities. This paper introduces
234
+ MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven
235
+ MMLU benchmark by integrating more challenging, reasoning-focused questions and
236
+ expanding the choice set from four to ten options. Additionally, MMLU-Pro
237
+ eliminates the trivial and noisy questions in MMLU. Our experimental results
238
+ show that MMLU-Pro not only raises the challenge, causing a significant drop in
239
+ accuracy by 16% to 33% compared to MMLU but also demonstrates greater stability
240
+ under varying prompts. With 24 different prompt styles tested, the sensitivity
241
+ of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in
242
+ MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT)
243
+ reasoning achieved better performance on MMLU-Pro compared to direct answering,
244
+ which is in stark contrast to the findings on the original MMLU, indicating
245
+ that MMLU-Pro includes more complex reasoning questions. Our assessments
246
+ confirm that MMLU-Pro is a more discriminative benchmark to better track
247
+ progress in the field.
248
+
249
+ - Paper: https://huggingface.co/papers/2406.01574
250
+ - Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro
251
+
252
+ ### Citation
253
+
254
+ ```
255
+ @misc{wang2024mmluprorobustchallengingmultitask,
256
+ title={MMLU-Pro: A More Robust and Challenging Multi-Task Language
257
+ Understanding Benchmark},
258
+ author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen},
259
+ year={2024},
260
+ eprint={2406.01574},
261
+ archivePrefix={arXiv},
262
+ primaryClass={cs.CL},
263
+ url={https://arxiv.org/abs/2406.01574},
264
+ }
265
+ ```
266
+
267
+ ### Groups
268
+
269
+ - `leaderboard_mmlu_pro`
270
+
271
+ ### Tasks
272
+
273
+ - `leaderboard_mmlu_pro`
274
+
275
+
276
+ ## MuSR
277
+
278
+ ### Paper
279
+
280
+ Title: MuSR: Testing the Limits of Chain-of-thought with Multistep Soft
281
+ Reasoning
282
+
283
+ While large language models (LLMs) equipped with techniques like
284
+ chain-of-thought prompting have demonstrated impressive capabilities, they
285
+ still fall short in their ability to reason robustly in complex settings.
286
+ However, evaluating LLM reasoning is challenging because system capabilities
287
+ continue to grow while benchmark datasets for tasks like logical deduction have
288
+ remained static. We introduce MuSR, a dataset for evaluating language models on
289
+ multistep soft reasoning tasks specified in a natural language narrative. This
290
+ dataset has two crucial features. First, it is created through a novel
291
+ neurosymbolic synthetic-to-natural generation algorithm, enabling the
292
+ construction of complex reasoning instances that challenge GPT-4 (e.g., murder
293
+ mysteries roughly 1000 words in length) and which can be scaled further as more
294
+ capable LLMs are released. Second, our dataset instances are free text
295
+ narratives corresponding to real-world domains of reasoning; this makes it
296
+ simultaneously much more challenging than other synthetically-crafted
297
+ benchmarks while remaining realistic and tractable for human annotators to
298
+ solve with high accuracy. We evaluate a range of LLMs and prompting techniques
299
+ on this dataset and characterize the gaps that remain for techniques like
300
+ chain-of-thought to perform robust reasoning.
301
+
302
+ - Paper: https://huggingface.co/papers/2310.16049
303
+ - Homepage: https://zayne-sprague.github.io/MuSR/
304
+
305
+ ### Citation
306
+
307
+ ```
308
+ @misc{sprague2024musrtestinglimitschainofthought,
309
+ title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft
310
+ Reasoning},
311
+ author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett},
312
+ year={2024},
313
+ eprint={2310.16049},
314
+ archivePrefix={arXiv},
315
+ primaryClass={cs.CL},
316
+ url={https://arxiv.org/abs/2310.16049},
317
+ }
318
+ ```
319
+
320
+ ### Groups
321
+
322
+ - `leaderboard_musr`
323
+
324
+ ### Tasks
325
+
326
+ - `leaderboard_musr_murder_mysteries`
327
+ - `leaderboard_musr_object_placements`
328
+ - `leaderboard_musr_team_allocation`
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml ADDED
@@ -0,0 +1,26 @@
1
+ group: leaderboard_bbh
2
+ task:
3
+ - leaderboard_bbh_boolean_expressions
4
+ - leaderboard_bbh_causal_judgement
5
+ - leaderboard_bbh_date_understanding
6
+ - leaderboard_bbh_disambiguation_qa
7
+ - leaderboard_bbh_formal_fallacies
8
+ - leaderboard_bbh_geometric_shapes
9
+ - leaderboard_bbh_hyperbaton
10
+ - leaderboard_bbh_logical_deduction_five_objects
11
+ - leaderboard_bbh_logical_deduction_seven_objects
12
+ - leaderboard_bbh_logical_deduction_three_objects
13
+ - leaderboard_bbh_movie_recommendation
14
+ - leaderboard_bbh_navigate
15
+ - leaderboard_bbh_object_counting
16
+ - leaderboard_bbh_penguins_in_a_table
17
+ - leaderboard_bbh_reasoning_about_colored_objects
18
+ - leaderboard_bbh_ruin_names
19
+ - leaderboard_bbh_salient_translation_error_detection
20
+ - leaderboard_bbh_snarks
21
+ - leaderboard_bbh_sports_understanding
22
+ - leaderboard_bbh_temporal_sequences
23
+ - leaderboard_bbh_tracking_shuffled_objects_five_objects
24
+ - leaderboard_bbh_tracking_shuffled_objects_seven_objects
25
+ - leaderboard_bbh_tracking_shuffled_objects_three_objects
26
+ - leaderboard_bbh_web_of_lies
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml ADDED
@@ -0,0 +1,14 @@
1
+ dataset_name: boolean_expressions
2
+ description: 'Evaluate the result of a random Boolean expression.'
3
+ doc_to_choice: ["False", "True"]
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - input: not ( ( not not True ) ) is
8
+ target: 'False'
9
+ - input: True and False and not True and True is
10
+ target: 'False'
11
+ - input: not not ( not ( False ) ) is
12
+ target: 'True'
13
+ include: _fewshot_template_yaml
14
+ task: leaderboard_bbh_boolean_expressions
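The few-shot inputs above are themselves valid Python boolean expressions, so the
listed targets can be sanity-checked directly (a quick illustrative check, not part
of the task config):

```python
# Each sample input from the YAML above, evaluated with Python's own
# boolean semantics; the printed values match the listed targets.
samples = [
    "not ( ( not not True ) )",               # target: False
    "True and False and not True and True",   # target: False
    "not not ( not ( False ) )",              # target: True
]
for expr in samples:
    print(expr, "->", eval(expr))
```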
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml ADDED
@@ -0,0 +1,55 @@
1
+ dataset_name: formal_fallacies
2
+ description: 'Distinguish deductively valid arguments from formal fallacies.'
3
+ doc_to_choice: ["valid", "invalid"]
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - input: '"It is not always easy to see who is related to whom -- and in which ways.
8
+ The following argument pertains to this question: To begin with, Lesley is a
9
+ close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate
10
+ of Lowell is sufficient for being a great-grandfather of Leroy. It follows that
11
+ Lesley is a great-grandfather of Leroy."
12
+
13
+ Is the argument, given the explicitly stated premises, deductively valid or
14
+ invalid?
15
+
16
+ Options:
17
+
18
+ - valid
19
+
20
+ - invalid'
21
+ target: valid
22
+ - input: '"It is not always easy to see who is related to whom -- and in which ways.
23
+ The following argument pertains to this question: Whoever is not a great-grandfather
24
+ of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient
25
+ for not being a great-grandfather of Clyde. We may conclude: Everyone who is
26
+ an ancestor of Dana is a stepbrother of Brian, too."
27
+
28
+ Is the argument, given the explicitly stated premises, deductively valid or
29
+ invalid?
30
+
31
+ Options:
32
+
33
+ - valid
34
+
35
+ - invalid'
36
+ target: valid
37
+ - input: '"It is not always easy to grasp who is consuming which products. The following
38
+ argument pertains to this question: Every infrequent user of Paul Mitchell shampoo
39
+ is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap,
40
+ or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo
41
+ and, in the same time, a loyal buyer of Caress soap. It follows that whoever
42
+ is an infrequent user of Paul Mitchell shampoo is not a regular consumer of
43
+ Lush soap."
44
+
45
+ Is the argument, given the explicitly stated premises, deductively valid or
46
+ invalid?
47
+
48
+ Options:
49
+
50
+ - valid
51
+
52
+ - invalid'
53
+ target: invalid
54
+ include: _fewshot_template_yaml
55
+ task: leaderboard_bbh_formal_fallacies
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml ADDED
@@ -0,0 +1,84 @@
1
+ dataset_name: geometric_shapes
2
+ description: 'Name geometric shapes from their SVG paths.'
3
+ doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)"]
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - input: 'This SVG path element <path d="M 31.00,73.00 L 32.00,59.00 L 44.00,50.00
8
+ L 49.00,41.00 L 64.00,37.00 L 71.00,55.00 L 64.00,76.00 L 52.00,61.00 L 31.00,73.00"/>
9
+ draws a
10
+
11
+ Options:
12
+
13
+ (A) circle
14
+
15
+ (B) heptagon
16
+
17
+ (C) hexagon
18
+
19
+ (D) kite
20
+
21
+ (E) line
22
+
23
+ (F) octagon
24
+
25
+ (G) pentagon
26
+
27
+ (H) rectangle
28
+
29
+ (I) sector
30
+
31
+ (J) triangle'
32
+ target: (F)
33
+ - input: 'This SVG path element <path d="M 14.19,26.04 L 51.43,39.21 L 58.44,36.69
34
+ L 56.63,30.17 L 48.53,26.66 L 14.19,26.04"/> draws a
35
+
36
+ Options:
37
+
38
+ (A) circle
39
+
40
+ (B) heptagon
41
+
42
+ (C) hexagon
43
+
44
+ (D) kite
45
+
46
+ (E) line
47
+
48
+ (F) octagon
49
+
50
+ (G) pentagon
51
+
52
+ (H) rectangle
53
+
54
+ (I) sector
55
+
56
+ (J) triangle'
57
+ target: (G)
58
+ - input: 'This SVG path element <path d="M 41.00,43.00 L 37.00,34.00 L 41.00,33.00
59
+ L 45.00,34.00 L 41.00,43.00"/> draws a
60
+
61
+ Options:
62
+
63
+ (A) circle
64
+
65
+ (B) heptagon
66
+
67
+ (C) hexagon
68
+
69
+ (D) kite
70
+
71
+ (E) line
72
+
73
+ (F) octagon
74
+
75
+ (G) pentagon
76
+
77
+ (H) rectangle
78
+
79
+ (I) sector
80
+
81
+ (J) triangle'
82
+ target: (D)
83
+ include: _fewshot_template_yaml
84
+ task: leaderboard_bbh_geometric_shapes
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml ADDED
@@ -0,0 +1,32 @@
1
+ dataset_name: hyperbaton
2
+ description: 'Order adjectives correctly in English sentences.'
3
+ doc_to_choice: ["(A)", "(B)"]
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - input: 'Which sentence has the correct adjective order:
8
+
9
+ Options:
10
+
11
+ (A) rubber terrible ship
12
+
13
+ (B) terrible rubber ship'
14
+ target: (B)
15
+ - input: 'Which sentence has the correct adjective order:
16
+
17
+ Options:
18
+
19
+ (A) repulsive small Brazilian exercise ship
20
+
21
+ (B) Brazilian repulsive exercise small ship'
22
+ target: (A)
23
+ - input: 'Which sentence has the correct adjective order:
24
+
25
+ Options:
26
+
27
+ (A) blue gold wonderful square shoe
28
+
29
+ (B) wonderful square blue gold shoe'
30
+ target: (B)
31
+ include: _fewshot_template_yaml
32
+ task: leaderboard_bbh_hyperbaton
scripts/yans/lm-evaluation-harness/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml ADDED
@@ -0,0 +1,114 @@
1
+ dataset_name: reasoning_about_colored_objects
2
+ description: 'Answer extremely simple questions about the colors of objects on a surface.'
3
+ doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"]
4
+ fewshot_config:
5
+ sampler: first_n
6
+ samples:
7
+ - input: 'On the nightstand, there is a red pencil, a purple mug, a burgundy keychain,
8
+ a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the
9
+ stress ball?
10
+
11
+ Options:
12
+
13
+ (A) red
14
+
15
+ (B) orange
16
+
17
+ (C) yellow
18
+
19
+ (D) green
20
+
21
+ (E) blue
22
+
23
+ (F) brown
24
+
25
+ (G) magenta
26
+
27
+ (H) fuchsia
28
+
29
+ (I) mauve
30
+
31
+ (J) teal
32
+
33
+ (K) turquoise
34
+
35
+ (L) burgundy
36
+
37
+ (M) silver
38
+
39
+ (N) gold
40
+
41
+ (O) black
42
+
43
+ (P) grey
44
+
45
+ (Q) purple
46
+
47
+ (R) pink'
48
+ target: (E)
49
+ - input: 'On the table, you see a bunch of objects arranged in a row: a purple paperclip,
50
+ a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve
51
+ fidget spinner, and a burgundy pen. What is the color of the object directly
52
+ to the right of the stress ball?
53
+
54
+ Options:
55
+
56
+ (A) red
57
+
58
+ (B) orange
59
+
60
+ (C) yellow
61
+
62
+ (D) green
63
+
64
+ (E) blue
65
+
66
+ (F) brown
67
+
68
+ (G) magenta
69
+
70
+ (H) fuchsia
71
+
72
+ (I) mauve
73
+
74
+ (J) teal
75
+
76
+ (K) turquoise
77
+
78
+ (L) burgundy
79
+
80
+ (M) silver
81
+
82
+ (N) gold
83
+
84
+ (O) black
85
+
86
+ (P) grey
87
+
88
+ (Q) purple
89
+
90
+ (R) pink'
91
+ target: (F)
92
+ - input: 'On the nightstand, you see the following items arranged in a row: a teal
93
+ plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug,
94
+ a pink notebook, and a grey cup. How many non-orange items do you see to the
95
+ left of the teal item?
96
+
97
+ Options:
98
+
99
+ (A) zero
100
+
101
+ (B) one
102
+
103
+ (C) two
104
+
105
+ (D) three
106
+
107
+ (E) four
108
+
109
+ (F) five
110
+
111
+ (G) six'
112
+ target: (A)
113
+ include: _fewshot_template_yaml
114
+ task: leaderboard_bbh_reasoning_about_colored_objects