diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c0461920c1cd51b6c3a4deb2af68843558116e1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/README.md @@ -0,0 +1,53 @@ +# Arabic EXAMS + +### Paper + +EXAMS is a resource specialized in multilingual high school exam questions. +Original paper: [EXAMS](https://aclanthology.org/2020.emnlp-main.438/) + +The Arabic EXAMS dataset includes five subjects: + + - Islamic studies + - Biology + - Physics + - Science + - Social + +Original dataset: [EXAMS-QA](https://github.com/mhardalov/exams-qa) + +EXAMS is a benchmark dataset for cross-lingual and multilingual question answering for high school examinations. +With 24,000 high-quality high school exam questions in 16 languages, it covers 8 language families and 24 school subjects from the Natural Sciences and Social Sciences, among others. +EXAMS offers a unique, fine-grained evaluation framework across multiple languages and subjects. + +Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic) + +### Citation + + +### Groups, Tags, and Tasks + +#### Groups + +- `aexams`: Arabic EXAMS dataset, covering the IslamicStudies, Biology, Science, Physics, and Social subjects. + +#### Tasks + + +The following tasks evaluate subjects in the Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring: +- `aexams_IslamicStudies` +- `aexams_Biology` +- `aexams_Science` +- `aexams_Physics` +- `aexams_Social` + +### Checklist + +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? + * [x] Yes, the original implementation was contributed by the author of the benchmark + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_aexams.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_aexams.yaml new file mode 100644 index 0000000000000000000000000000000000000000..59099b9c38c11e5391e031a2e07808a83d645938 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_aexams.yaml @@ -0,0 +1,16 @@ +group: aexams +task: + - aexams_Biology + - aexams_IslamicStudies + - aexams_Physics + - aexams_Science + - aexams_Social +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..3f7100ad70190a67bd86675ce7a15d88a5a5976a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/_default_template_yaml @@ -0,0 +1,18 @@ +dataset_path: Hennara/aexams +test_split: test +fewshot_split: dev +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. 
{{B}}\nC. {{C}}\nD. {{D}}\nالجواب:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Biology.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Biology.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ee2e33b5844ef438da4ac51bfd916af04cb53e6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Biology.yaml @@ -0,0 +1,4 @@ +"dataset_name": "Biology" +"description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n" +"include": "_default_template_yaml" +"task": "aexams_Biology" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_IslamicStudies.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_IslamicStudies.yaml new file mode 100644 index 0000000000000000000000000000000000000000..831afc376ec25fdeddbb18bf5e4063d2e3c17ebf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_IslamicStudies.yaml @@ -0,0 +1,4 @@ +"dataset_name": "IslamicStudies" +"description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n" +"include": "_default_template_yaml" +"task": "aexams_IslamicStudies" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Physics.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Physics.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f2764a06ef2680a1c81ccca0e76dcbcf1ba52672 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Physics.yaml @@ -0,0 +1,4 @@ +"dataset_name": "Physics" +"description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n" +"include": "_default_template_yaml" +"task": "aexams_Physics" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Science.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Science.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c89dc8c8ca6d32b922483f48ee8da427e027a92b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Science.yaml @@ -0,0 +1,4 @@ +"dataset_name": "Science" +"description": "قم بالإجابة على مايلي في مجال العلوم \n\n" +"include": "_default_template_yaml" +"task": "aexams_Science" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Social.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Social.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3042a419e6e3902ddd0090028fc4b875a148a213 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/aexams/aexams_Social.yaml @@ -0,0 +1,4 @@ +"dataset_name": "Social" +"description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n" +"include": "_default_template_yaml" +"task": "aexams_Social" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/_default_template_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/_default_template_yaml new file mode 100644 index 0000000000000000000000000000000000000000..c32b310ba86091f627c7ec5717fd7cb4f69a46f2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/_default_template_yaml @@ -0,0 +1,18 @@ +dataset_path: facebook/belebele +fewshot_config: + sampler: first_n +output_type: 
multiple_choice +should_decontaminate: true +doc_to_decontamination_query: "{{question}}" +doc_to_text: "P: {{flores_passage}}\nQ: {{question.strip()}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nAnswer:" +doc_to_choice: ["A", "B", "C", "D"] +doc_to_target: "{{['1', '2', '3', '4'].index(correct_answer_num)}}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Arab.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6242fd38b2666f496751b8ed03639f712f350e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Arab.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "arb_Arab" +"include": "_default_template_yaml" +"task": "belebele_arb_Arab" +"test_split": "arb_Arab" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_azj_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_azj_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8a9c7f2a8cb428cde2bfcbdf8c2485150c9c1db0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_azj_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "azj_Latn" +"include": "_default_template_yaml" +"task": "belebele_azj_Latn" +"test_split": "azj_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8736c5242e5b8ff3f717c317292a42fd718db5b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "ben_Latn" +"include": "_default_template_yaml" +"task": "belebele_ben_Latn" +"test_split": "ben_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml new file mode 100644 index 0000000000000000000000000000000000000000..53d40c4d574edd84946d7ee9e626e368af705ff8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "bod_Tibt" +"include": "_default_template_yaml" +"task": "belebele_bod_Tibt" +"test_split": "bod_Tibt" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ell_Grek.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ell_Grek.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57226e976bc5f22fce37e2778e1f9596660ceceb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ell_Grek.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "ell_Grek" +"include": "_default_template_yaml" +"task": "belebele_ell_Grek" +"test_split": "ell_Grek" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eus_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eus_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2ad09b8169ccf946d1cd5ec0cc4f02d9f2b3b21 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eus_Latn.yaml @@ -0,0 +1,4 @@ 
+"fewshot_split": "eus_Latn" +"include": "_default_template_yaml" +"task": "belebele_eus_Latn" +"test_split": "eus_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fra_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fra_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c60fa9a7a90db3fc7f5451b39705a487acf18b29 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fra_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "fra_Latn" +"include": "_default_template_yaml" +"task": "belebele_fra_Latn" +"test_split": "fra_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hau_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hau_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3626f36ad94245c596c3135b4e1950a47f71fb9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hau_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "hau_Latn" +"include": "_default_template_yaml" +"task": "belebele_hau_Latn" +"test_split": "hau_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f44f4f55f458197f0453be18fb5e389939da27b1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "hin_Latn" +"include": "_default_template_yaml" +"task": "belebele_hin_Latn" +"test_split": "hin_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69b100c44bcf9218e541a7f3c41020dedafbec88 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "hrv_Latn" +"include": "_default_template_yaml" +"task": "belebele_hrv_Latn" +"test_split": "hrv_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hye_Armn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hye_Armn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a57fa86451f834e5c4d8bea7d2961c2ff220b9d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hye_Armn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "hye_Armn" +"include": "_default_template_yaml" +"task": "belebele_hye_Armn" +"test_split": "hye_Armn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_isl_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_isl_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..69f9bb4e372ce1a39057ce4b70a7e48d23d199e2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_isl_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "isl_Latn" +"include": "_default_template_yaml" +"task": "belebele_isl_Latn" +"test_split": "isl_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kan_Knda.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kan_Knda.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..b4592afca58272c1b58d163c7f6626f3a2135d27 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kan_Knda.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "kan_Knda" +"include": "_default_template_yaml" +"task": "belebele_kan_Knda" +"test_split": "kan_Knda" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8af667d1ad67568d17f6c833979d8b04f9b8cffe --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "kaz_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_kaz_Cyrl" +"test_split": "kaz_Cyrl" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..39641c836b4ccf13ff16a730ef0ddd0ed6cc0962 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "khm_Khmr" +"include": "_default_template_yaml" +"task": "belebele_khm_Khmr" +"test_split": "khm_Khmr" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kor_Hang.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kor_Hang.yaml new file mode 100644 index 0000000000000000000000000000000000000000..088d2e9359feaaf6504504e18390c0979988c6f9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kor_Hang.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "kor_Hang" +"include": "_default_template_yaml" +"task": "belebele_kor_Hang" +"test_split": "kor_Hang" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bdee22168b7536d2063e1d1602cb9032d97cb357 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "lao_Laoo" +"include": "_default_template_yaml" +"task": "belebele_lao_Laoo" +"test_split": "lao_Laoo" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c9887108a949eae7be2e17e28a7ed9f81559f303 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "mkd_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_mkd_Cyrl" +"test_split": "mkd_Cyrl" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nob_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nob_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf824f3b9de3ce40db37060d2348c4a7b60a4c00 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nob_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "nob_Latn" +"include": "_default_template_yaml" +"task": "belebele_nob_Latn" +"test_split": "nob_Latn" diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Deva.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Deva.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8126174671cdcc412958b90d1bc3051a8d4386a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Deva.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "npi_Deva" +"include": "_default_template_yaml" +"task": "belebele_npi_Deva" +"test_split": "npi_Deva" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ory_Orya.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ory_Orya.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5590560aaac88be6ec8dc90353d308d23c759323 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ory_Orya.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "ory_Orya" +"include": "_default_template_yaml" +"task": "belebele_ory_Orya" +"test_split": "ory_Orya" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7630b0fc3dc83d4a3587913a689fa05f2eaf432b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "pbt_Arab" +"include": "_default_template_yaml" +"task": "belebele_pbt_Arab" +"test_split": "pbt_Arab" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1e42d9ac72601ffe27ddcc5994166a87a76bb31 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "sin_Sinh" +"include": "_default_template_yaml" +"task": "belebele_sin_Sinh" +"test_split": "sin_Sinh" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sot_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sot_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f98de9a3dab12ca1af6f570904abc976d11525df --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sot_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "sot_Latn" +"include": "_default_template_yaml" +"task": "belebele_sot_Latn" +"test_split": "sot_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..788d6959976320f5fb962e442aa8fa9c2ed9cca8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "ssw_Latn" +"include": "_default_template_yaml" +"task": "belebele_ssw_Latn" +"test_split": "ssw_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swh_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swh_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..55845837a155e9fd1525830a263019910c793864 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swh_Latn.yaml @@ -0,0 +1,4 @@ 
+"fewshot_split": "swh_Latn" +"include": "_default_template_yaml" +"task": "belebele_swh_Latn" +"test_split": "swh_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tam_Taml.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tam_Taml.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c81da2f8a7ab0c60628cd834eecb1f70031173b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tam_Taml.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "tam_Taml" +"include": "_default_template_yaml" +"task": "belebele_tam_Taml" +"test_split": "tam_Taml" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca902d2a391ea872a2c3a75eded5eadfd3b8a1a6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "tir_Ethi" +"include": "_default_template_yaml" +"task": "belebele_tir_Ethi" +"test_split": "tir_Ethi" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tso_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tso_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1dae599eb73aade0b753216421eac391afe89985 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tso_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "tso_Latn" +"include": "_default_template_yaml" +"task": "belebele_tso_Latn" +"test_split": "tso_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c24156d846cb64db04877d9c36d394b54f56aa3e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "ukr_Cyrl" +"include": "_default_template_yaml" +"task": "belebele_ukr_Cyrl" +"test_split": "ukr_Cyrl" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8ea63063b6f99815ae5c51faeec53352bc28721d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "urd_Latn" +"include": "_default_template_yaml" +"task": "belebele_urd_Latn" +"test_split": "urd_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_wol_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_wol_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7683e3d2206e9bfb04ec2a2cf2d068c2be9570c3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_wol_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "wol_Latn" +"include": "_default_template_yaml" +"task": "belebele_wol_Latn" +"test_split": "wol_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_yor_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_yor_Latn.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..9eb295cc0471885c83a4114b0551d134611d8c56 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_yor_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "yor_Latn" +"include": "_default_template_yaml" +"task": "belebele_yor_Latn" +"test_split": "yor_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hant.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8604e58b6b747cd32a0621fe2a1858a3102da36 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hant.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "zho_Hant" +"include": "_default_template_yaml" +"task": "belebele_zho_Hant" +"test_split": "zho_Hant" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zul_Latn.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zul_Latn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1e7fede97ca234d40a87acc7a0e21aaf659a2faf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zul_Latn.yaml @@ -0,0 +1,4 @@ +"fewshot_split": "zul_Latn" +"include": "_default_template_yaml" +"task": "belebele_zul_Latn" +"test_split": "zul_Latn" diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/README.md new file mode 100644 index 0000000000000000000000000000000000000000..10364d8ea8edf4972eb1ec991e8bc29137b87c0b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/README.md @@ -0,0 +1,55 @@ +# inverse_scaling + +### Paper + +Title: `Inverse Scaling: When Bigger Isn't Better` + +Abstract: `Work on scaling laws has found that large language models (LMs) show predictable improvements to overall loss with increased scale (model size, training data, and compute). Here, we present evidence for the claim that LMs may show inverse scaling, or worse task performance with increased scale, e.g., due to flaws in the training objective and data. We present empirical evidence of inverse scaling on 11 datasets collected by running a public contest, the Inverse Scaling Prize, with a substantial prize pool. Through analysis of the datasets, along with other examples found in the literature, we identify four potential causes of inverse scaling: (i) preference to repeat memorized sequences over following in-context instructions, (ii) imitation of undesirable patterns in the training data, (iii) tasks containing an easy distractor task which LMs could focus on, rather than the harder real task, and (iv) correct but misleading few-shot demonstrations of the task. We release the winning datasets at this https URL to allow for further investigation of inverse scaling. Our tasks have helped drive the discovery of U-shaped and inverted-U scaling trends, where an initial trend reverses, suggesting that scaling trends are less reliable at predicting the behavior of larger-scale models than previously understood. Overall, our results suggest that there are tasks for which increased model scale alone may not lead to progress, and that more careful thought needs to go into the data and objectives for training language models.` + +Note: This is not an official implementation of the Inverse Scaling Prize. It was implemented by h-albert-lee with permission from the authors of the paper. 
+ +Homepage: https://github.com/inverse-scaling/prize + +### Citation + +@article{mckenzie2023inverse, + title={Inverse Scaling: When Bigger Isn't Better}, + author={Ian R. McKenzie and Alexander Lyzhov and Michael Pieler and Alicia Parrish and Aaron Mueller and Ameya Prabhu and Euan McLean and Aaron Kirtland and Alexis Ross and Alisa Liu and Andrew Gritsevskiy and Daniel Wurgaft and Derik Kauffman and Gabriel Recchia and Jiacheng Liu and Joe Cavanagh and Max Weiss and Sicong Huang and The Floating Droid and Tom Tseng and Tomasz Korbak and Xudong Shen and Yuhui Zhang and Zhengping Zhou and Najoung Kim and Samuel R. Bowman and Ethan Perez}, + journal={arXiv preprint arXiv:2306.09479}, + year={2023} +} + +### Groups and Tasks + +#### Groups + +* `inverse_scaling_mc`: all tasks of the Inverse Scaling Prize (currently aside from Prompt Injection), matching their implementations on OPT for multiple-choice classification tasks. **These match the published dataset versions from the prize, which may differ slightly from the numbers in the paper, but have been tested for equivalence to the OPT numbers reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval for multiple model sizes.** + + +#### Tasks + +- `inverse_scaling_hindsight_neglect_10shot` +- `inverse_scaling_redefine_math` +- `inverse_scaling_quote_repetition` +- `inverse_scaling_neqa` +- `inverse_scaling_winobias_antistereotype`: not an official Inverse Scaling Prize winner, but eval results for it are reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval. +- `inverse_scaling_into_the_unknown` +- `inverse_scaling_memo_trap` +- `inverse_scaling_modus_tollens` +- `inverse_scaling_pattern_matching_suppression` +- `inverse_scaling_repetitive_algebra` +- `inverse_scaling_sig_figs` + + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
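+
+### Example usage
+
+A minimal sketch (not part of the original prize materials) of running the `inverse_scaling_mc` group through the harness's Python API; the model checkpoint and settings below are illustrative placeholders:
+
+```python
+# Sketch: evaluate the inverse_scaling_mc group with the lm-evaluation-harness
+# Python entry point. The checkpoint is a placeholder; any HF causal LM works.
+import lm_eval
+
+results = lm_eval.simple_evaluate(
+    model="hf",                                 # Hugging Face backend
+    model_args="pretrained=facebook/opt-1.3b",  # placeholder checkpoint
+    tasks=["inverse_scaling_mc"],               # the group defined in this folder
+    num_fewshot=0,
+)
+print(results["results"])  # per-task and aggregated acc / acc_norm
+```
+
+The equivalent command-line invocation would be `lm_eval --model hf --model_args pretrained=facebook/opt-1.3b --tasks inverse_scaling_mc`.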
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml new file mode 100644 index 0000000000000000000000000000000000000000..c504eb2cd8833cfef70b3d60748a1769829ac11a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml @@ -0,0 +1,17 @@ +tag: + - inverse_scaling_mc +output_type: multiple_choice +test_split: train +doc_to_text: prompt +doc_to_choice: classes +doc_to_target: answer_index +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_some_results b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_some_results new file mode 100644 index 0000000000000000000000000000000000000000..9afe58d8e02cdc0cee5a756c8e24aaeb6f2e87cf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/_some_results @@ -0,0 +1,39 @@ +# | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr| +# |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:| +# | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281| +# | | |none | 0|acc_norm|0.4476|± |0.0281| +# |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6273|± |0.0096| +# | | |none | 0|acc |0.6210|± |0.0095| +# | - inverse_scaling_neqa | 0|none | 0|acc |0.5300|± |0.0289| +# | | |none | 0|acc_norm|0.5300|± |0.0289| +# | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9367|± |0.0141| +# | | |none | 0|acc_norm|0.9367|± |0.0141| +# | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7178|± |0.0150| +# | | |none | 0|acc_norm|0.7178|± |0.0150| +# | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3786|± |0.0239| +# | | |none | 0|acc_norm|0.4126|± |0.0243| + +# | Groups |Version|Filter|n-shot| Metric |Value | |Stderr| +# |------------------|-------|------|-----:|--------|-----:|---|-----:| +# |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6273|± |0.0096| +# | | |none | 0|acc |0.6210|± |0.0095| +# hf (pretrained=facebook/opt-2.7b,add_bos_token=True,dtype=float32), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: auto (32) +# | Tasks |Version|Filter|n-shot| Metric |Value | |Stderr| +# |-------------------------------------------|-------|------|-----:|--------|-----:|---|-----:| +# | - inverse_scaling_hindsight_neglect_10shot| 0|none | 0|acc |0.4476|± |0.0281| +# | | |none | 0|acc_norm|0.4476|± |0.0281| +# |inverse_scaling_mc |N/A |none | 0|acc_norm|0.6291|± |0.0095| +# | | |none | 0|acc |0.6219|± |0.0095| +# | - inverse_scaling_neqa | 0|none | 0|acc |0.5267|± |0.0289| +# | | |none | 0|acc_norm|0.5267|± |0.0289| +# | - inverse_scaling_quote_repetition | 0|none | 0|acc |0.9433|± |0.0134| +# | | |none | 0|acc_norm|0.9433|± |0.0134| +# | - inverse_scaling_redefine_math | 0|none | 0|acc |0.7200|± |0.0150| +# | | |none | 0|acc_norm|0.7200|± |0.0150| +# | - inverse_scaling_winobias_antistereotype | 0|none | 0|acc |0.3762|± |0.0239| +# | | |none | 0|acc_norm|0.4150|± |0.0243| + +# | Groups |Version|Filter|n-shot| Metric |Value | |Stderr| +# |------------------|-------|------|-----:|--------|-----:|---|-----:| +# |inverse_scaling_mc|N/A |none | 0|acc_norm|0.6291|± |0.0095| +# | | |none | 0|acc |0.6219|± |0.0095| diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b21baee22a42077ecfe0b53678b92e5daa2b1e62 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_hindsight_neglect_10shot +dataset_path: inverse-scaling/hindsight-neglect-10shot diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0c7ccd3bb5cbd3d0dfe924d5fa7b22ad466ea198 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_into_the_unknown +dataset_path: Albertmade/into-the-unknown diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c77ffc25077a3c9c4e287dd96cdcc24a29fa483 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_memo_trap +dataset_path: Albertmade/memo-trap diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89e1ebbc55ecb483b4e486077373b9a0ee7cd12b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_modus_tollens +dataset_path: Albertmade/modus-tollens diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11a057d8ed756f0d2e697c459be49f22b57dde94 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_neqa +dataset_path: inverse-scaling/NeQA diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b415bba1327741396b8d9c0d158b9fba95fa7a52 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_pattern_matching_suppression +dataset_path: Albertmade/pattern-matching-suppression diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6750953cd7f7de45518345d4c7e6ff2861e5926b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_quote_repetition +dataset_path: inverse-scaling/quote-repetition diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f14ba28bddc5bb1586379b179c956a21a6dcc68 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_redefine_math +dataset_path: inverse-scaling/redefine-math diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..048ea271edf7252b0dc0aa120032b04102685e9c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_repetitive_algebra +dataset_path: Albertmade/repetitive-algebra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25d86e33a8c839bc750ad8bb83be908c1e19eb19 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_sig_figs +dataset_path: Albertmade/sig-figs diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c5ff300aae7ee84e103dea2be053ec88fe9f771 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml @@ -0,0 +1,20 @@ +group: + - inverse_scaling_mc +task: inverse_scaling_winobias_antistereotype +dataset_path: mathemakitten/winobias_antistereotype_test_v5 +output_type: multiple_choice +test_split: test +doc_to_text: text +doc_to_choice: classes +doc_to_target: target +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +dataset_kwargs: + trust_remote_code: true +metadata: + version: 0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2d273b23d7ec4761ad90e3992d7f7d2da30d1226 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/README.md @@ -0,0 +1,65 @@ +# MATH +ℹ️ This is the 4-shot variant! +## Paper +Measuring Mathematical Problem Solving With the MATH Dataset +https://arxiv.org/abs/2103.03874 + +Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations. + +NOTE: The few-shot prompts and the generated-answer extraction are based on [Minerva](https://arxiv.org/abs/2206.14858), and exact-match equivalence is calculated using the `sympy` library (see the sketch below the checklist). This requires additional dependencies, which can be installed via the `lm-eval[math]` extra. + +Homepage: https://github.com/hendrycks/math + + +## Citation +``` +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the MATH Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} + +@misc{2206.14858, +Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dyer and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra}, +Title = {Solving Quantitative Reasoning Problems with Language Models}, +Year = {2022}, +Eprint = {arXiv:2206.14858}, +} +``` + +### Groups and Tasks + +#### Groups + +- `minerva_math` + +#### Tasks + +- `minerva_math_algebra` +- `minerva_math_counting_and_prob` +- `minerva_math_geometry` +- `minerva_math_intermediate_algebra` +- `minerva_math_num_theory` +- `minerva_math_prealgebra` +- `minerva_math_precalc` + +### Checklist + +The checklist is the following: + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + * The implementation in the original paper is one where the model is first fine-tuned on the data. They do have a few-shot evaluation for GPT-3; however, the few-shot context used here is sourced from [Lewkowycz et al.](https://arxiv.org/abs/2206.14858). The achieved accuracy on Llama-2 models is comparable to that provided in the paper, though not identical. + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
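+
+### Answer equivalence sketch
+
+The NOTE above says exact-match equivalence is computed with `sympy`. The snippet below is an illustrative sketch of that idea, mirroring the spirit of `is_equiv` in `utils.py` without its answer normalization and timeout handling (`parse_latex` additionally needs the ANTLR runtime pulled in by the `lm-eval[math]` extra):
+
+```python
+from sympy import simplify
+from sympy.parsing.latex import parse_latex
+
+# Two LaTeX answers that differ as strings but agree symbolically.
+gold = parse_latex(r"\frac{1}{2}")
+pred = parse_latex(r"\frac{2}{4}")
+
+# Judged equivalent when the simplified difference is zero.
+print(simplify(gold - pred) == 0)  # True
+```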
+ +### Variant Wishlist + +- [ ] zero-shot variant diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac06b7ac6e00580ad7918d2000e82bff486733a5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml @@ -0,0 +1,29 @@ +group: + - math_word_problems +task: minerva_math_algebra +dataset_path: EleutherAI/hendrycks_math +process_docs: !function utils.process_docs +dataset_name: algebra +output_type: generate_until +training_split: train +test_split: test +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: "{{answer if few_shot is undefined else solution}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +num_fewshot: 4 +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true +fewshot_config: + sampler: first_n + samples: !function utils.list_fewshot_samples diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_counting_and_prob.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_counting_and_prob.yaml new file mode 100644 index 0000000000000000000000000000000000000000..688cd711c50d005d5d78ca55116ad333d96161ce --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_counting_and_prob.yaml @@ -0,0 +1,3 @@ +include: minerva_math_algebra.yaml +dataset_name: counting_and_probability +task: minerva_math_counting_and_prob diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_geometry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_geometry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..079ee70e9ed8997f351d1732c0c88dad1e4896de --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_geometry.yaml @@ -0,0 +1,3 @@ +include: minerva_math_algebra.yaml +dataset_name: geometry +task: minerva_math_geometry diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_intermediate_algebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_intermediate_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b3f063c36e10063dd06be93c290820a787ddd1d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_intermediate_algebra.yaml @@ -0,0 +1,3 @@ +include: minerva_math_algebra.yaml +dataset_name: intermediate_algebra +task: minerva_math_intermediate_algebra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_num_theory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_num_theory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44f587bce4cce5e4ab80d24b938b88488553d6da --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_num_theory.yaml @@ -0,0 +1,3 @@ +include: minerva_math_algebra.yaml +dataset_name: number_theory +task: minerva_math_num_theory diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_prealgebra.yaml 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_prealgebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..865e2f2c6e5397a07fb473a89f4d8eaf47d3eb52 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_prealgebra.yaml @@ -0,0 +1,3 @@ +include: minerva_math_algebra.yaml +dataset_name: prealgebra +task: minerva_math_prealgebra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_precalc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_precalc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..06e63abc7c206b43759217b38cd5db2395e554a9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_precalc.yaml @@ -0,0 +1,3 @@ +include: minerva_math_algebra.yaml +dataset_name: precalculus +task: minerva_math_precalc diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e3ebcf991bac2a3727098a437586faf0d2ce3a62 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/minerva_math/utils.py @@ -0,0 +1,294 @@ +import re +import signal +from typing import Dict, List, Optional + +import datasets + +from lm_eval.utils import eval_logger + + +try: + import sympy + from sympy.parsing.latex import parse_latex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sympy` is required for generating translation task prompt templates. \ +please install sympy via pip install lm-eval[math] or pip install -e .[math]", + ) + + +# taken from +# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py +def doc_to_text(doc: dict) -> str: + return "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:" + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["problem"], + "solution": doc["solution"], + "answer": normalize_final_answer( + remove_boxed(last_boxed_only_string(doc["solution"])) + ), + } + if getattr(doc, "few_shot", None) is not None: + out_doc["few_shot"] = True + return out_doc + + return dataset.map(_process_doc) + + +def list_fewshot_samples() -> list[dict]: + return [ + { + "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}", + "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$", + "solution": "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?", + "solution": "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. 
If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:\n\\begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,\nfind $\\frac{a}{b},$ assuming $b$ is nonzero.", + "solution": "If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.", + "few_shot": "1", + }, + ] + + +def process_results(doc: dict, results: List[str]) -> Dict[str, int]: + candidates = results[0] + + unnormalized_answer = get_unnormalized_answer(candidates) + answer = normalize_final_answer(unnormalized_answer) + + if is_equiv(answer, doc["answer"]): + retval = 1 + else: + retval = 0 + + results = { + "exact_match": retval, + } + return results + + +def last_boxed_only_string(string: str) -> Optional[str]: + idx = string.rfind("\\boxed") + if "\\boxed " in string: + return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def remove_boxed(s: str) -> str: + if "\\boxed " in s: + left = "\\boxed " + assert s[: len(left)] == left + return s[len(left) :] + + left = "\\boxed{" + + assert s[: len(left)] == left + assert s[-1] == "}" + + return s[len(left) : -1] + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + +def is_equiv(x1: str, x2: str) -> bool: + """ + x1 and x2 are normalized latex string + """ + try: + with timeout(seconds=5): + try: + parsed_x1 = parse_latex(x1) + parsed_x2 = parse_latex(x2) + except ( + sympy.parsing.latex.errors.LaTeXParsingError, + sympy.SympifyError, + TypeError, + ): + eval_logger.debug(f"couldn't parse one of {x1} or {x2}") + return False + + try: + diff = parsed_x1 - parsed_x2 + except TypeError: + eval_logger.debug(f"couldn't subtract {x1} and {x2}") + return False + + try: + if sympy.simplify(diff) == 0: + return True + else: + return False + except ValueError: + eval_logger.debug( + f"Had some trouble simplifying when comparing {x1} and {x2}" + ) + except TimeoutError: + eval_logger.debug(f"Timed out comparing {x1} and {x2}") + return False + except ImportError as e: + eval_logger.error(e) + raise + except Exception as e: + eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}") + return False + + +def get_unnormalized_answer(text: str) -> str: + INVALID_ANSWER = 
"[invalidanswer]" + end_seq = "I hope it is correct." + text += end_seq + match = re.search( + r"Final Answer: The final answer is(.*?). I hope it is correct.", + text, + ) + if match: + return match.group(1).strip() + else: + return INVALID_ANSWER + + +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def normalize_final_answer(final_answer: str) -> str: + """ + Normalize a final answer to a quantitative reasoning question. + + Copied character for character from appendix D of Lewkowycz et al. (2022) + """ + final_answer = final_answer.split("=")[-1] + + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. + final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + + return final_answer diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1c92db9b349429c558fd328d23784bf27a67b732 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/README.md @@ -0,0 +1,55 @@ +# StoryCloze + +### Paper + +Title: `A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories` +Abstract: `https://arxiv.org/abs/1604.01696` + +Homepage: https://cs.rochester.edu/nlp/rocstories/ + +'Story Cloze Test' is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. 
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1c92db9b349429c558fd328d23784bf27a67b732 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/README.md @@ -0,0 +1,55 @@
+# StoryCloze
+
+### Paper
+
+Title: `A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories`
+Abstract: `https://arxiv.org/abs/1604.01696`
+
+Homepage: https://cs.rochester.edu/nlp/rocstories/
+
+The 'Story Cloze Test' is a commonsense reasoning framework for evaluating story understanding, story generation, and script learning. It requires a system to choose the correct ending to a four-sentence story.
+
+
+### Citation
+
+```
+@misc{mostafazadeh2016corpus,
+    title={A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories},
+    author={Nasrin Mostafazadeh and
+            Nathanael Chambers and
+            Xiaodong He and
+            Devi Parikh and
+            Dhruv Batra and
+            Lucy Vanderwende and
+            Pushmeet Kohli and
+            James Allen},
+    year={2016},
+    eprint={1604.01696},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+### Groups, Tags, and Tasks
+
+#### Tags
+
+* `storycloze`: runs both StoryCloze variants.
+
+#### Tasks
+
+* `storycloze_2016`: Story Cloze Test with the Spring 2016 validation/test sets.
+* `storycloze_2018`: Story Cloze Test with the Winter 2018 validation/test sets.
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [ ] Is the task an existing benchmark in the literature?
+  * [ ] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/storycloze_2016.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/storycloze_2016.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d0f9222046412fb27d4f8d9f72ed78e97b5a048 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/storycloze_2016.yaml @@ -0,0 +1,18 @@
+tag: storycloze
+task: storycloze_2016
+dataset_path: story_cloze
+dataset_name: "2016"
+output_type: multiple_choice
+validation_split: validation
+test_split: test
+doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
+doc_to_target: "{{answer_right_ending-1}}"
+doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}"
+should_decontaminate: true
+doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/storycloze_2018.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/storycloze_2018.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ac23830dd4c985f8a8a57c18dc15348b7e200f8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/storycloze/storycloze_2018.yaml @@ -0,0 +1,18 @@
+tag: storycloze
+task: storycloze_2018
+dataset_path: story_cloze
+dataset_name: "2018"
+output_type: multiple_choice
+validation_split: validation
+test_split: test
+doc_to_text: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
+doc_to_target: "{{answer_right_ending-1}}"
+doc_to_choice: "{{[sentence_quiz1, sentence_quiz2]}}"
+should_decontaminate: true
+doc_to_decontamination_query: "{{[input_sentence_1, input_sentence_2, input_sentence_3, input_sentence_4]|join(' ')}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 1.0
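In both configs above, `doc_to_target` is `{{answer_right_ending-1}}` because `answer_right_ending` is 1-indexed in the source data, while the choice list built by `doc_to_choice` is 0-indexed. A quick sanity check of that mapping (a minimal sketch; the field values are invented, only the field names follow the `story_cloze` schema):

```python
# Invented values; field names follow the story_cloze schema.
doc = {
    "input_sentence_1": "Anna wanted to bake a cake.",
    "input_sentence_2": "She mixed the batter carefully.",
    "input_sentence_3": "Then she put it in the oven.",
    "input_sentence_4": "An hour later, a sweet smell filled the kitchen.",
    "sentence_quiz1": "Anna threw the cake away.",
    "sentence_quiz2": "Anna happily served the cake.",
    "answer_right_ending": 2,  # 1-indexed in the raw data
}

# doc_to_text joins the four context sentences with spaces.
context = " ".join(doc[f"input_sentence_{i}"] for i in range(1, 5))

choices = [doc["sentence_quiz1"], doc["sentence_quiz2"]]
target = doc["answer_right_ending"] - 1  # what "{{answer_right_ending-1}}" renders to

assert choices[target] == "Anna happily served the cake."
```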
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/anagrams1.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/anagrams1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee044474f5f44e1b63f065a919e4604dfc242756 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/anagrams1.yaml @@ -0,0 +1,22 @@
+tag:
+  - unscramble
+task: anagrams1
+dataset_path: EleutherAI/unscramble
+dataset_name: mid_word_1_anagrams
+output_type: generate_until
+test_split: validation
+doc_to_text: "{{context}}"
+doc_to_target: "{{completion}}"
+generation_kwargs:
+  until:
+    - "\n"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: false
+    ignore_punctuation: false
+metadata:
+  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/anagrams2.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/anagrams2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cb5e91dec2c0bb000441f83c52c7871cdc93b382 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/anagrams2.yaml @@ -0,0 +1,22 @@
+tag:
+  - unscramble
+task: anagrams2
+dataset_path: EleutherAI/unscramble
+dataset_name: mid_word_2_anagrams
+output_type: generate_until
+test_split: validation
+doc_to_text: "{{context}}"
+doc_to_target: "{{completion}}"
+generation_kwargs:
+  until:
+    - "\n"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: false
+    ignore_punctuation: false
+metadata:
+  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/cycle_letters.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/cycle_letters.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b433b16737958065b3d6a6c224610eecea6634c3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/cycle_letters.yaml @@ -0,0 +1,22 @@
+tag:
+  - unscramble
+task: cycle_letters
+dataset_path: EleutherAI/unscramble
+dataset_name: cycle_letters_in_word
+output_type: generate_until
+test_split: validation
+doc_to_text: "{{context}}"
+doc_to_target: "{{completion}}"
+generation_kwargs:
+  until:
+    - "\n"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: false
+    ignore_punctuation: false
+metadata:
+  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/random_insertion.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/random_insertion.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f537620d8b8c234ef43658779d7e43b207e1e1a3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/random_insertion.yaml @@ -0,0 +1,22 @@
+tag:
+  - unscramble
+task: random_insertion
+dataset_path: EleutherAI/unscramble
+dataset_name: random_insertion_in_word
+output_type: generate_until
+test_split: validation
+doc_to_text: "{{context}}"
+doc_to_target: "{{completion}}"
+generation_kwargs:
+  until:
+    - "\n"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: false
+    ignore_punctuation: false
+metadata:
+  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/reversed_words.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/reversed_words.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b074de6eaaac0433c739451a2f0eec8fbc393a0f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/unscramble/reversed_words.yaml @@ -0,0 +1,22 @@
+tag:
+  - unscramble
+task: reversed_words
+dataset_path: EleutherAI/unscramble
+dataset_name: reversed_words
+output_type: generate_until
+test_split: validation
+doc_to_text: "{{context}}"
+doc_to_target: "{{completion}}"
+generation_kwargs:
+  until:
+    - "\n"
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: false
+    ignore_punctuation: false
+metadata:
+  version: 2.0
+dataset_kwargs:
+  trust_remote_code: true
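All five unscramble configs share the same recipe: prompt the model with `{{context}}`, stop generation at the first newline, and score the output with `exact_match` against `{{completion}}`, with case and punctuation both significant. A minimal sketch of those semantics (the example document is invented; real instances come from the `EleutherAI/unscramble` dataset and may be phrased differently):

```python
# Invented example document -- real instances come from the
# EleutherAI/unscramble dataset.
doc = {
    "context": "Please unscramble the letters into a word: elppa =",
    "completion": " apple",
}

generation = " apple\nanything after the newline is ignored"

# generation_kwargs.until: ["\n"] -> truncate at the first newline.
prediction = generation.split("\n")[0]

# exact_match with ignore_case/ignore_punctuation disabled:
# the prediction must match the target character for character.
score = 1.0 if prediction == doc["completion"] else 0.0
assert score == 1.0
```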