koichi12 committed
Commit 55e98b2 · verified · 1 Parent(s): 735d58a

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/README.md +53 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_aexams.yaml +16 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_default_template_yaml +18 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Biology.yaml +4 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_IslamicStudies.yaml +4 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Physics.yaml +4 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Science.yaml +4 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Social.yaml +4 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/_default_template_yaml +18 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Arab.yaml +4 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_azj_Latn.yaml +4 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Latn.yaml +4 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml +4 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ell_Grek.yaml +4 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eus_Latn.yaml +4 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fra_Latn.yaml +4 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hau_Latn.yaml +4 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Latn.yaml +4 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml +4 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hye_Armn.yaml +4 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_isl_Latn.yaml +4 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kan_Knda.yaml +4 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml +4 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml +4 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kor_Hang.yaml +4 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml +4 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml +4 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nob_Latn.yaml +4 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Deva.yaml +4 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ory_Orya.yaml +4 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml +4 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml +4 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sot_Latn.yaml +4 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml +4 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swh_Latn.yaml +4 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tam_Taml.yaml +4 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml +4 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tso_Latn.yaml +4 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml +4 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Latn.yaml +4 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_wol_Latn.yaml +4 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_yor_Latn.yaml +4 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hant.yaml +4 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zul_Latn.yaml +4 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/README.md +55 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml +17 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_some_results +39 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml +3 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml +3 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml +3 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/README.md ADDED
@@ -0,0 +1,53 @@
+ # Arabic EXAMS
+
+ ### Paper
+
+ EXAMS is a resource specialized in multilingual high school exam questions.
+ The original paper: [EXAMS](https://aclanthology.org/2020.emnlp-main.438/)
+
+ The Arabic EXAMS dataset includes five subjects:
+
+ - Islamic studies
+ - Biology
+ - Physics
+ - Science
+ - Social
+
+ The original dataset: [EXAMS-QA](https://github.com/mhardalov/exams-qa)
+
+ EXAMS is a benchmark dataset for cross-lingual and multilingual question answering on high school examinations.
+ It offers 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from the Natural Sciences and Social Sciences, among others.
+ EXAMS offers a unique, fine-grained evaluation framework across multiple languages and subjects.
+
+ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic)
+
+ ### Citation
+
+
+ ### Groups, Tags, and Tasks
+
+ #### Groups
+
+ - `aexams`: Arabic EXAMS dataset, including the IslamicStudies, Biology, Science, Physics, and Social subjects.
+
+ #### Tasks
+
+
+ The following tasks evaluate subjects in the Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring:
+ - `aexams_IslamicStudies`
+ - `aexams_Biology`
+ - `aexams_Science`
+ - `aexams_Physics`
+ - `aexams_Social`
+
+ ### Checklist
+
+ * [x] Is the task an existing benchmark in the literature?
+ * [x] Have you referenced the original paper that introduced the task?
+ * [x] If yes, does the original paper provide a reference implementation?
+ * [x] Yes, the original implementation was contributed by the author of the benchmark
+
+ If other tasks on this dataset are already supported:
+ * [x] Is the "Main" variant of this task clearly denoted?
+ * [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+ * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
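A minimal sketch of running the `aexams` group end to end with the harness's Python API (the checkpoint name is a placeholder and the few-shot/batch settings are illustrative; the `lm_eval` CLI accepts the same options):

```python
# Sketch: evaluate the aexams group with lm-evaluation-harness (0.4-style API).
# "my-org/arabic-llm" is a placeholder Hugging Face checkpoint, not a real model.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                                # transformers backend
    model_args="pretrained=my-org/arabic-llm",
    tasks=["aexams"],                          # expands to the five aexams_* subject tasks
    num_fewshot=5,                             # few-shot examples are drawn from the dev split
    batch_size=8,
)
print(results["results"])                      # per-task and aggregated acc / acc_norm
```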
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_aexams.yaml ADDED
@@ -0,0 +1,16 @@
+ group: aexams
+ task:
+   - aexams_Biology
+   - aexams_IslamicStudies
+   - aexams_Physics
+   - aexams_Science
+   - aexams_Social
+ aggregate_metric_list:
+   - metric: acc
+     aggregation: mean
+     weight_by_size: true
+   - metric: acc_norm
+     aggregation: mean
+     weight_by_size: true
+ metadata:
+   version: 1.0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_default_template_yaml ADDED
@@ -0,0 +1,18 @@
+ dataset_path: Hennara/aexams
+ test_split: test
+ fewshot_split: dev
+ fewshot_config:
+   sampler: first_n
+ output_type: multiple_choice
+ doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
+ doc_to_choice: ["A", "B", "C", "D"]
+ doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
+ metric_list:
+   - metric: acc
+     aggregation: mean
+     higher_is_better: true
+   - metric: acc_norm
+     aggregation: mean
+     higher_is_better: true
+ metadata:
+   version: 1.0
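The `doc_to_text` and `doc_to_target` fields above are Jinja templates rendered against each dataset row. A rough, self-contained sketch of what they produce for one made-up row (the field names mirror the schema the template assumes; the row itself is invented for illustration):

```python
# Sketch: render the aexams prompt and target templates for one hypothetical row.
from jinja2 import Template

doc_to_text = "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_target = "{{['A', 'B', 'C', 'D'].index(answer)}}"

row = {  # invented example row, not taken from the dataset
    "question": "ما هي عاصمة فرنسا؟",
    "A": "باريس", "B": "لندن", "C": "روما", "D": "مدريد",
    "answer": "A",
}

prompt = Template(doc_to_text).render(**row)
target_index = int(Template(doc_to_target).render(**row))  # -> 0, i.e. choice "A"
print(prompt)
print(target_index)
```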
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Biology.yaml ADDED
@@ -0,0 +1,4 @@
+ "dataset_name": "Biology"
+ "description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n"
+ "include": "_default_template_yaml"
+ "task": "aexams_Biology"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_IslamicStudies.yaml ADDED
@@ -0,0 +1,4 @@
+ "dataset_name": "IslamicStudies"
+ "description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n"
+ "include": "_default_template_yaml"
+ "task": "aexams_IslamicStudies"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Physics.yaml ADDED
@@ -0,0 +1,4 @@
+ "dataset_name": "Physics"
+ "description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n"
+ "include": "_default_template_yaml"
+ "task": "aexams_Physics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Science.yaml ADDED
@@ -0,0 +1,4 @@
+ "dataset_name": "Science"
+ "description": "قم بالإجابة على مايلي في مجال العلوم \n\n"
+ "include": "_default_template_yaml"
+ "task": "aexams_Science"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Social.yaml ADDED
@@ -0,0 +1,4 @@
+ "dataset_name": "Social"
+ "description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n"
+ "include": "_default_template_yaml"
+ "task": "aexams_Social"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/_default_template_yaml ADDED
@@ -0,0 +1,18 @@
+ dataset_path: facebook/belebele
+ fewshot_config:
+   sampler: first_n
+ output_type: multiple_choice
+ should_decontaminate: true
+ doc_to_decontamination_query: "{{question}}"
+ doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:"
+ doc_to_choice: ["A", "B", "C", "D"]
+ doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}"
+ metric_list:
+   - metric: acc
+     aggregation: mean
+     higher_is_better: true
+   - metric: acc_norm
+     aggregation: mean
+     higher_is_better: true
+ metadata:
+   version: 0.0
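Every per-language file that follows consists of this template plus three overrides (`task`, `test_split`, `fewshot_split`) pulled in through `include`. A simplified sketch of that composition, using an abbreviated template and one hypothetical override file (the real loader in the harness does more, e.g. resolving the include path):

```python
# Sketch: how a per-language belebele config overlays the shared template.
# The dict merge below approximates the harness's `include` mechanism; it is not the real loader.
import yaml

template = yaml.safe_load("""
dataset_path: facebook/belebele
output_type: multiple_choice
doc_to_choice: ["A", "B", "C", "D"]
""")

language_cfg = yaml.safe_load("""
include: _default_template_yaml
task: belebele_arb_Arab
test_split: arb_Arab
fewshot_split: arb_Arab
""")

# Keys from the including file extend or override the shared template.
task_cfg = {**template, **{k: v for k, v in language_cfg.items() if k != "include"}}
print(task_cfg["task"], task_cfg["test_split"], task_cfg["dataset_path"])
```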
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Arab.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "arb_Arab"
+ "include": "_default_template_yaml"
+ "task": "belebele_arb_Arab"
+ "test_split": "arb_Arab"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_azj_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "azj_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_azj_Latn"
+ "test_split": "azj_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "ben_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_ben_Latn"
+ "test_split": "ben_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "bod_Tibt"
+ "include": "_default_template_yaml"
+ "task": "belebele_bod_Tibt"
+ "test_split": "bod_Tibt"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ell_Grek.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "ell_Grek"
+ "include": "_default_template_yaml"
+ "task": "belebele_ell_Grek"
+ "test_split": "ell_Grek"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eus_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "eus_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_eus_Latn"
+ "test_split": "eus_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fra_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "fra_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_fra_Latn"
+ "test_split": "fra_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hau_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "hau_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_hau_Latn"
+ "test_split": "hau_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "hin_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_hin_Latn"
+ "test_split": "hin_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "hrv_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_hrv_Latn"
+ "test_split": "hrv_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hye_Armn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "hye_Armn"
+ "include": "_default_template_yaml"
+ "task": "belebele_hye_Armn"
+ "test_split": "hye_Armn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_isl_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "isl_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_isl_Latn"
+ "test_split": "isl_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kan_Knda.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "kan_Knda"
+ "include": "_default_template_yaml"
+ "task": "belebele_kan_Knda"
+ "test_split": "kan_Knda"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "kaz_Cyrl"
+ "include": "_default_template_yaml"
+ "task": "belebele_kaz_Cyrl"
+ "test_split": "kaz_Cyrl"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "khm_Khmr"
+ "include": "_default_template_yaml"
+ "task": "belebele_khm_Khmr"
+ "test_split": "khm_Khmr"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kor_Hang.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "kor_Hang"
+ "include": "_default_template_yaml"
+ "task": "belebele_kor_Hang"
+ "test_split": "kor_Hang"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "lao_Laoo"
+ "include": "_default_template_yaml"
+ "task": "belebele_lao_Laoo"
+ "test_split": "lao_Laoo"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "mkd_Cyrl"
+ "include": "_default_template_yaml"
+ "task": "belebele_mkd_Cyrl"
+ "test_split": "mkd_Cyrl"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nob_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "nob_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_nob_Latn"
+ "test_split": "nob_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Deva.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "npi_Deva"
+ "include": "_default_template_yaml"
+ "task": "belebele_npi_Deva"
+ "test_split": "npi_Deva"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ory_Orya.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "ory_Orya"
+ "include": "_default_template_yaml"
+ "task": "belebele_ory_Orya"
+ "test_split": "ory_Orya"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "pbt_Arab"
+ "include": "_default_template_yaml"
+ "task": "belebele_pbt_Arab"
+ "test_split": "pbt_Arab"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "sin_Sinh"
+ "include": "_default_template_yaml"
+ "task": "belebele_sin_Sinh"
+ "test_split": "sin_Sinh"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sot_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "sot_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_sot_Latn"
+ "test_split": "sot_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "ssw_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_ssw_Latn"
+ "test_split": "ssw_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swh_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "swh_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_swh_Latn"
+ "test_split": "swh_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tam_Taml.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "tam_Taml"
+ "include": "_default_template_yaml"
+ "task": "belebele_tam_Taml"
+ "test_split": "tam_Taml"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "tir_Ethi"
+ "include": "_default_template_yaml"
+ "task": "belebele_tir_Ethi"
+ "test_split": "tir_Ethi"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tso_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "tso_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_tso_Latn"
+ "test_split": "tso_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "ukr_Cyrl"
+ "include": "_default_template_yaml"
+ "task": "belebele_ukr_Cyrl"
+ "test_split": "ukr_Cyrl"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "urd_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_urd_Latn"
+ "test_split": "urd_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_wol_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "wol_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_wol_Latn"
+ "test_split": "wol_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_yor_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "yor_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_yor_Latn"
+ "test_split": "yor_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hant.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "zho_Hant"
+ "include": "_default_template_yaml"
+ "task": "belebele_zho_Hant"
+ "test_split": "zho_Hant"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zul_Latn.yaml ADDED
@@ -0,0 +1,4 @@
+ "fewshot_split": "zul_Latn"
+ "include": "_default_template_yaml"
+ "task": "belebele_zul_Latn"
+ "test_split": "zul_Latn"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/README.md ADDED
@@ -0,0 +1,55 @@
+ # inverse_scaling
+
+ ### Paper
+
+ Title: `Inverse Scaling: When Bigger Isn't Better`
+
+ Abstract: `Work on scaling laws has found that large language models (LMs) show predictable improvements to overall loss with increased scale (model size, training data, and compute). Here, we present evidence for the claim that LMs may show inverse scaling, or worse task performance with increased scale, e.g., due to flaws in the training objective and data. We present empirical evidence of inverse scaling on 11 datasets collected by running a public contest, the Inverse Scaling Prize, with a substantial prize pool. Through analysis of the datasets, along with other examples found in the literature, we identify four potential causes of inverse scaling: (i) preference to repeat memorized sequences over following in-context instructions, (ii) imitation of undesirable patterns in the training data, (iii) tasks containing an easy distractor task which LMs could focus on, rather than the harder real task, and (iv) correct but misleading few-shot demonstrations of the task. We release the winning datasets at this https URL to allow for further investigation of inverse scaling. Our tasks have helped drive the discovery of U-shaped and inverted-U scaling trends, where an initial trend reverses, suggesting that scaling trends are less reliable at predicting the behavior of larger-scale models than previously understood. Overall, our results suggest that there are tasks for which increased model scale alone may not lead to progress, and that more careful thought needs to go into the data and objectives for training language models.`
+
+ Note: This is not an official implementation of the Inverse Scaling Prize; it was implemented by h-albert-lee with permission from the authors of the paper.
+
+ Homepage: https://github.com/inverse-scaling/prize
+
+ ### Citation
+
+ @article{mckenzie2023inverse,
+   title={Inverse Scaling: When Bigger Isn't Better},
+   author={Ian R. McKenzie and Alexander Lyzhov and Michael Pieler and Alicia Parrish and Aaron Mueller and Ameya Prabhu and Euan McLean and Aaron Kirtland and Alexis Ross and Alisa Liu and Andrew Gritsevskiy and Daniel Wurgaft and Derik Kauffman and Gabriel Recchia and Jiacheng Liu and Joe Cavanagh and Max Weiss and Sicong Huang and The Floating Droid and Tom Tseng and Tomasz Korbak and Xudong Shen and Yuhui Zhang and Zhengping Zhou and Najoung Kim and Samuel R. Bowman and Ethan Perez},
+   journal={arXiv preprint arXiv:2306.09479},
+   year={2023}
+ }
+
+ ### Groups and Tasks
+
+ #### Groups
+
+ * `inverse_scaling_mc`: all tasks of the Inverse Scaling Prize (currently aside from Prompt Injection), matching their implementations on OPT for multiple-choice classification tasks. **These match the published dataset versions from the prize, which may differ slightly from the numbers in the paper, but they have been tested for equivalence to the OPT numbers reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval for multiple model sizes.**
+
+
+ #### Tasks
+
+ - `inverse_scaling_hindsight_neglect_10shot`
+ - `inverse_scaling_redefine_math`
+ - `inverse_scaling_quote_repetition`
+ - `inverse_scaling_neqa`
+ - `inverse_scaling_winobias_antistereotype`: not an official Inverse Scaling Prize winner, but eval results for it are reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval.
+ - `inverse_scaling_into_the_unknown`
+ - `inverse_scaling_memo_trap`
+ - `inverse_scaling_modus_tollens`
+ - `inverse_scaling_pattern_matching_suppression`
+ - `inverse_scaling_repetitive_algebra`
+ - `inverse_scaling_sig_figs`
+
+
+ ### Checklist
+
+ For adding novel benchmarks/datasets to the library:
+ * [x] Is the task an existing benchmark in the literature?
+ * [x] Have you referenced the original paper that introduced the task?
+ * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+ If other tasks on this dataset are already supported:
+ * [ ] Is the "Main" variant of this task clearly denoted?
+ * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+ * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
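A minimal sketch, under the same assumptions as the aexams example above, of how one might look for inverse scaling with the harness's Python API: evaluate a smaller and a larger checkpoint from the same family on `inverse_scaling_mc` and compare the reported metrics (a drop at the larger size is the signature described in the paper):

```python
# Sketch: compare two model sizes on the inverse_scaling_mc group.
# Checkpoint names are placeholders; any pair of sizes from one model family works.
import lm_eval

def evaluate(pretrained: str) -> dict:
    out = lm_eval.simple_evaluate(
        model="hf",
        model_args=f"pretrained={pretrained}",
        tasks=["inverse_scaling_mc"],
        batch_size=8,
    )
    return out["results"]

small = evaluate("facebook/opt-125m")
large = evaluate("facebook/opt-1.3b")
for task in sorted(small):
    # Lower accuracy at the larger size suggests inverse scaling on that task.
    print(task, small[task], "->", large[task])
```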
scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml ADDED
@@ -0,0 +1,17 @@
+ tag:
+   - inverse_scaling_mc
+ output_type: multiple_choice
+ test_split: train
+ doc_to_text: prompt
+ doc_to_choice: classes
+ doc_to_target: answer_index
+ target_delimiter: ""
+ metric_list:
+   - metric: acc
+     aggregation: mean
+     higher_is_better: true
+   - metric: acc_norm
+     aggregation: mean
+     higher_is_better: true
+ metadata:
+   version: 0
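In this shared config, `doc_to_text: prompt` and `doc_to_choice: classes` point at dataset columns, and `target_delimiter: ""` means each candidate answer is appended to the prompt with no separator. A rough sketch of the loglikelihood-based multiple-choice scoring this implies (`loglikelihood` below stands in for a real model call; the byte-length normalization for `acc_norm` follows the harness's usual convention):

```python
# Sketch of loglikelihood multiple-choice scoring, mirroring the config above.
# `loglikelihood` is a placeholder for a real model scoring a continuation given a context.
from typing import Callable, Sequence

def score_doc(
    prompt: str,
    classes: Sequence[str],
    answer_index: int,
    loglikelihood: Callable[[str, str], float],
) -> tuple[bool, bool]:
    # target_delimiter is "", so each class string is appended directly to the prompt.
    lls = [loglikelihood(prompt, c) for c in classes]
    # acc: argmax over raw log-likelihoods.
    pred = max(range(len(classes)), key=lambda i: lls[i])
    # acc_norm: argmax over log-likelihoods normalized by the continuation's byte length.
    pred_norm = max(range(len(classes)), key=lambda i: lls[i] / len(classes[i].encode("utf-8")))
    return pred == answer_index, pred_norm == answer_index
```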
scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_some_results ADDED
@@ -0,0 +1,39 @@
+ # | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
+ # |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
+ # | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281|
+ # | | |none | 0|acc_norm|0.4476|± |0.0281|
+ # |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6273|± |0.0096|
+ # | | |none | 0|acc |0.6210|± |0.0095|
+ # | - inverse_scaling_neqa | 0|none | 0|acc |0.5300|± |0.0289|
+ # | | |none | 0|acc_norm|0.5300|± |0.0289|
+ # | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9367|± |0.0141|
+ # | | |none | 0|acc_norm|0.9367|± |0.0141|
+ # | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7178|± |0.0150|
+ # | | |none | 0|acc_norm|0.7178|± |0.0150|
+ # | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3786|± |0.0239|
+ # | | |none | 0|acc_norm|0.4126|± |0.0243|
+
+ # | Groups |Version|Filter|n-shot| Metric |Value | |Stderr|
+ # |------------------|-------|------|-----:|--------|-----:|---|-----:|
+ # |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6273|± |0.0096|
+ # | | |none | 0|acc |0.6210|± |0.0095|
+ # hf (pretrained=facebook/opt-2.7b,add_bos_token=True,dtype=float32), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (32)
+ # | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr|
+ # |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:|
+ # | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281|
+ # | | |none | 0|acc_norm|0.4476|± |0.0281|
+ # |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6291|± |0.0095|
+ # | | |none | 0|acc |0.6219|± |0.0095|
+ # | - inverse_scaling_neqa | 0|none | 0|acc |0.5267|± |0.0289|
+ # | | |none | 0|acc_norm|0.5267|± |0.0289|
+ # | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9433|± |0.0134|
+ # | | |none | 0|acc_norm|0.9433|± |0.0134|
+ # | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7200|± |0.0150|
+ # | | |none | 0|acc_norm|0.7200|± |0.0150|
+ # | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3762|± |0.0239|
+ # | | |none | 0|acc_norm|0.4150|± |0.0243|
+
+ # | Groups |Version|Filter|n-shot| Metric |Value | |Stderr|
+ # |------------------|-------|------|-----:|--------|-----:|---|-----:|
+ # |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6291|± |0.0095|
+ # | | |none | 0|acc |0.6219|± |0.0095|
scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml ADDED
@@ -0,0 +1,3 @@
+ include: _inverse_scaling_mc_yaml
+ task: inverse_scaling_hindsight_neglect_10shot
+ dataset_path: inverse-scaling/hindsight-neglect-10shot
scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml ADDED
@@ -0,0 +1,3 @@
+ include: _inverse_scaling_mc_yaml
+ task: inverse_scaling_into_the_unknown
+ dataset_path: Albertmade/into-the-unknown
scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml ADDED
@@ -0,0 +1,3 @@
+ include: _inverse_scaling_mc_yaml
+ task: inverse_scaling_memo_trap
+ dataset_path: Albertmade/memo-trap