koichi12 committed
Commit 7c44d25 · verified · 1 parent: 181156e

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.

Files changed (50)
  1. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/README.md +50 -0
  2. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml +37 -0
  3. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml +3 -0
  4. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml +3 -0
  5. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml +3 -0
  6. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml +3 -0
  7. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml +3 -0
  8. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml +3 -0
  9. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml +3 -0
  10. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml +3 -0
  11. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml +3 -0
  12. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml +3 -0
  13. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/fewshot.sh +8 -0
  14. scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/utils.py +32 -0
  15. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/README.md +127 -0
  16. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_ceval-valid.yaml +63 -0
  17. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_accountant.yaml +4 -0
  18. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml +4 -0
  19. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml +4 -0
  20. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml +4 -0
  21. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml +4 -0
  22. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml +4 -0
  23. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml +4 -0
  24. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml +4 -0
  25. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml +4 -0
  26. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml +4 -0
  27. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml +4 -0
  28. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_network.yaml +4 -0
  29. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml +4 -0
  30. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_education_science.yaml +4 -0
  31. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml +4 -0
  32. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml +4 -0
  33. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml +4 -0
  34. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml +4 -0
  35. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml +4 -0
  36. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml +4 -0
  37. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml +4 -0
  38. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_law.yaml +4 -0
  39. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml +4 -0
  40. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_logic.yaml +4 -0
  41. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml +4 -0
  42. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_marxism.yaml +4 -0
  43. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml +4 -0
  44. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml +4 -0
  45. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml +4 -0
  46. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml +4 -0
  47. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml +4 -0
  48. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml +4 -0
  49. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml +4 -0
  50. scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_operating_system.yaml +4 -0
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/README.md ADDED
@@ -0,0 +1,50 @@
# AfriMMLU

### Paper

IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models
https://arxiv.org/pdf/2406.03368

IrokoBench is a human-translated benchmark dataset for 16 typologically diverse
low-resource African languages covering three tasks: natural language inference (AfriXNLI),
mathematical reasoning (AfriMGSM), and multi-choice knowledge-based QA (AfriMMLU).


### Citation

```
@misc{adelani2024irokobenchnewbenchmarkafrican,
      title={IrokoBench: A New Benchmark for African Languages in the Age of Large Language Models},
      author={David Ifeoluwa Adelani and Jessica Ojo and Israel Abebe Azime and Jian Yun Zhuang and Jesujoba O. Alabi and Xuanli He and Millicent Ochieng and Sara Hooker and Andiswa Bukula and En-Shiun Annie Lee and Chiamaka Chukwuneke and Happy Buzaaba and Blessing Sibanda and Godson Kalipe and Jonathan Mukiibi and Salomon Kabongo and Foutse Yuehgoh and Mmasibidi Setaka and Lolwethu Ndolela and Nkiruka Odu and Rooweither Mabuya and Shamsuddeen Hassan Muhammad and Salomey Osei and Sokhar Samb and Tadesse Kebede Guge and Pontus Stenetorp},
      year={2024},
      eprint={2406.03368},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2406.03368},
}
```

### Groups and Tasks

#### Groups

* `afrimmlu`: all AfriMMLU tasks
* `afrimmlu_direct`: evaluates model performance on the curated dataset in each language
* `afrimmlu_translate`: evaluates models in the translate-test setting

#### Tasks
* `afrimmlu_direct_{language_code}`: one task per language, evaluated in that language directly
* `afrimmlu_translate_{language_code}`: one task per language, evaluated in the translate-test setting
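As a quick sanity check, a direct task can also be invoked from Python through the harness's `simple_evaluate` entry point. A minimal sketch; the checkpoint is the one used in `fewshot.sh`, and any Hugging Face causal LM can be substituted:

```python
import lm_eval

# Minimal sketch: run the English direct task zero-shot.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=masakhane/African-ultrachat-alpaca",
    tasks=["afrimmlu_direct_eng"],
    num_fewshot=0,
)
print(results["results"]["afrimmlu_direct_eng"])  # f1 and acc for the task
```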

### Checklist

For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?

If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
* [x] Checked for equivalence with v0.3.0 LM Evaluation Harness
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_common_yaml ADDED
@@ -0,0 +1,37 @@
group:
  - afrimmlu
  - afrimmlu_direct
task: null
dataset_path: masakhane/afrimmlu
dataset_name: null
output_type: multiple_choice
validation_split: validation
test_split: test
fewshot_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: f1
    aggregation: !function utils.weighted_f1_score
    # aggregation: mean
    average: weighted
    hf_evaluate: true
    higher_is_better: True
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - ","
      - "\\$"
  - metric: acc
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: true
    regexes_to_ignore:
      - ","
      - "\\$"
metadata:
  version: 1.0
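The `f1` entry collects per-example (gold, prediction) pairs and aggregates them with `utils.weighted_f1_score`, which the task's `utils.py` re-exports from `lm_eval.utils`. A sketch of what that aggregation computes, assuming a scikit-learn backing (an assumption for illustration, not a claim about the harness internals):

```python
from sklearn.metrics import f1_score


def weighted_f1_score(items):
    # items: per-example (gold, prediction) pairs collected by the metric.
    golds, preds = zip(*items)
    # Class-weighted F1, matching `average: weighted` in the config above.
    return f1_score(golds, preds, average="weighted")
```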
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_amh.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: amh
include: afrimmlu_common_yaml
task: afrimmlu_direct_amh
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_eng.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: eng
include: afrimmlu_common_yaml
task: afrimmlu_direct_eng
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ewe.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: ewe
include: afrimmlu_common_yaml
task: afrimmlu_direct_ewe
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_ibo.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: ibo
include: afrimmlu_common_yaml
task: afrimmlu_direct_ibo
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_kin.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: kin
include: afrimmlu_common_yaml
task: afrimmlu_direct_kin
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_twi.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: twi
include: afrimmlu_common_yaml
task: afrimmlu_direct_twi
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_wol.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: wol
include: afrimmlu_common_yaml
task: afrimmlu_direct_wol
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_xho.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: xho
include: afrimmlu_common_yaml
task: afrimmlu_direct_xho
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_yor.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: yor
include: afrimmlu_common_yaml
task: afrimmlu_direct_yor
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/direct/afrimmlu_direct_zul.yaml ADDED
@@ -0,0 +1,3 @@
dataset_name: zul
include: afrimmlu_common_yaml
task: afrimmlu_direct_zul
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/fewshot.sh ADDED
@@ -0,0 +1,8 @@
lm_eval --model hf \
    --model_args pretrained=masakhane/African-ultrachat-alpaca \
    --tasks afrimmlu_direct_amh,afrimmlu_direct_eng,afrimmlu_direct_ewe,afrimmlu_direct_fra,afrimmlu_direct_hau,afrimmlu_direct_ibo,afrimmlu_direct_kin,afrimmlu_direct_lin,afrimmlu_direct_lug,afrimmlu_direct_orm,afrimmlu_direct_sna,afrimmlu_direct_sot,afrimmlu_direct_twi,afrimmlu_direct_wol,afrimmlu_direct_xho,afrimmlu_direct_yor,afrimmlu_direct_zul \
    --device cuda:0 \
    --batch_size 1 \
    --num_fewshot 0 \
    --verbosity DEBUG \
    --wandb_args project=afrimmlu
scripts/yans/lm-evaluation-harness/lm_eval/tasks/afrimmlu/utils.py ADDED
@@ -0,0 +1,32 @@
import ast

from lm_eval.utils import weighted_f1_score  # noqa: F401  (referenced by the YAML config)


def doc_to_choice(doc):
    # "choices" is stored as a stringified Python list; parse it with
    # ast.literal_eval rather than calling eval() on dataset content.
    return ast.literal_eval(doc["choices"])


def doc_to_text(doc):
    output = """You are a highly knowledgeable and intelligent artificial intelligence
    model that answers multiple-choice questions about '{subject}'.

    Question: '''{question}'''

    Choices:
    A: '''{choice1}'''
    B: '''{choice2}'''
    C: '''{choice3}'''
    D: '''{choice4}'''

    Answer: """

    choices = ast.literal_eval(doc["choices"])
    return output.format(
        subject=doc["subject"],
        question=doc["question"],
        choice1=choices[0],
        choice2=choices[1],
        choice3=choices[2],
        choice4=choices[3],
    )
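As a quick illustration of the two hooks above, here is what they produce on a hypothetical document following the `masakhane/afrimmlu` schema (field values invented for illustration):

```python
# Hypothetical document; field names follow doc_to_text/doc_to_choice above.
doc = {
    "subject": "elementary mathematics",
    "question": "What is 2 + 3?",
    "choices": "['4', '5', '6', '7']",  # stringified list, as in the dataset
    "answer": "B",
}

print(doc_to_choice(doc))  # ['4', '5', '6', '7']
print(doc_to_text(doc))    # formatted multiple-choice prompt ending in "Answer: "
# The YAML's doc_to_target resolves to ['A', 'B', 'C', 'D'].index('B') == 1.
```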
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/README.md ADDED
@@ -0,0 +1,127 @@
# C-Eval (Validation)

### Paper
C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models
https://arxiv.org/pdf/2305.08322.pdf

C-Eval is a comprehensive Chinese evaluation suite for foundation models.
It consists of 13,948 multiple-choice questions spanning 52 diverse disciplines
and four difficulty levels.

Homepage: https://cevalbenchmark.com/

### Citation

```bibtex
@article{huang2023ceval,
    title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
    author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
    journal={arXiv preprint arXiv:2305.08322},
    year={2023}
}
```

The subject configs map each English task name to the official Chinese subject name:

```python
SUBJECTS = {
    "computer_network": "计算机网络",
    "operating_system": "操作系统",
    "computer_architecture": "计算机组成",
    "college_programming": "大学编程",
    "college_physics": "大学物理",
    "college_chemistry": "大学化学",
    "advanced_mathematics": "高等数学",
    "probability_and_statistics": "概率统计",
    "discrete_mathematics": "离散数学",
    "electrical_engineer": "注册电气工程师",
    "metrology_engineer": "注册计量师",
    "high_school_mathematics": "高中数学",
    "high_school_physics": "高中物理",
    "high_school_chemistry": "高中化学",
    "high_school_biology": "高中生物",
    "middle_school_mathematics": "初中数学",
    "middle_school_biology": "初中生物",
    "middle_school_physics": "初中物理",
    "middle_school_chemistry": "初中化学",
    "veterinary_medicine": "兽医学",
    "college_economics": "大学经济学",
    "business_administration": "工商管理",
    "marxism": "马克思主义基本原理",
    "mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
    "education_science": "教育学",
    "teacher_qualification": "教师资格",
    "high_school_politics": "高中政治",
    "high_school_geography": "高中地理",
    "middle_school_politics": "初中政治",
    "middle_school_geography": "初中地理",
    "modern_chinese_history": "近代史纲要",
    "ideological_and_moral_cultivation": "思想道德修养与法律基础",
    "logic": "逻辑学",
    "law": "法学",
    "chinese_language_and_literature": "中国语言文学",
    "art_studies": "艺术学",
    "professional_tour_guide": "导游资格",
    "legal_professional": "法律职业资格",
    "high_school_chinese": "高中语文",
    "high_school_history": "高中历史",
    "middle_school_history": "初中历史",
    "civil_servant": "公务员",
    "sports_science": "体育学",
    "plant_protection": "植物保护",
    "basic_medicine": "基础医学",
    "clinical_medicine": "临床医学",
    "urban_and_rural_planner": "注册城乡规划师",
    "accountant": "注册会计师",
    "fire_engineer": "注册消防工程师",
    "environmental_impact_assessment_engineer": "环境影响评价工程师",
    "tax_accountant": "税务师",
    "physician": "医师资格"
}
```

# CMMLU

### Paper

CMMLU: Measuring massive multitask language understanding in Chinese
https://arxiv.org/abs/2306.09212

CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of the Chinese language and culture.
CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels.

Homepage: https://github.com/haonan-li/CMMLU

### Citation

```bibtex
@misc{li2023cmmlu,
    title={CMMLU: Measuring massive multitask language understanding in Chinese},
    author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
    year={2023},
    eprint={2306.09212},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

### Groups and Tasks

#### Groups

- `ceval-valid`: All 52 subjects of the C-Eval dataset, evaluated following the methodology in MMLU's original implementation. This implementation consists solely of the validation set of C-Eval, as the test set requires submission of model predictions to an external site.

#### Tasks

The following tasks evaluate subjects in the C-Eval dataset using loglikelihood-based multiple-choice scoring (see the sketch below the task list):
- `ceval-valid_{subject_english}`
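Roughly, loglikelihood-based scoring asks the model for the log-probability of each choice as a continuation of the prompt and takes the argmax; `acc_norm` additionally normalizes by continuation byte length. A sketch for one document, where `score(context, continuation)` is a hypothetical stand-in for the model call:

```python
def score_doc(doc, score):
    # Log-probability of each answer choice given the question.
    lls = [score(doc["question"], f" {choice}") for choice in doc["choices"]]
    pred = max(range(len(lls)), key=lls.__getitem__)  # raw argmax -> acc
    # Byte-length normalization, as used for acc_norm.
    norm = [ll / len(c.encode("utf-8")) for ll, c in zip(lls, doc["choices"])]
    pred_norm = max(range(len(norm)), key=norm.__getitem__)
    return float(pred == doc["gold"]), float(pred_norm == doc["gold"])
```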

### Checklist

* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?

If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/_ceval-valid.yaml ADDED
@@ -0,0 +1,63 @@
aggregate_metric_list:
  - aggregation: mean
    metric: acc
    weight_by_size: true
  - aggregation: mean
    metric: acc_norm
    weight_by_size: true
group: ceval-valid
metadata:
  version: 2.0
task:
  - ceval-valid_computer_network
  - ceval-valid_operating_system
  - ceval-valid_computer_architecture
  - ceval-valid_college_programming
  - ceval-valid_college_physics
  - ceval-valid_college_chemistry
  - ceval-valid_advanced_mathematics
  - ceval-valid_probability_and_statistics
  - ceval-valid_discrete_mathematics
  - ceval-valid_electrical_engineer
  - ceval-valid_metrology_engineer
  - ceval-valid_high_school_mathematics
  - ceval-valid_high_school_physics
  - ceval-valid_high_school_chemistry
  - ceval-valid_high_school_biology
  - ceval-valid_middle_school_mathematics
  - ceval-valid_middle_school_biology
  - ceval-valid_middle_school_physics
  - ceval-valid_middle_school_chemistry
  - ceval-valid_veterinary_medicine
  - ceval-valid_college_economics
  - ceval-valid_business_administration
  - ceval-valid_marxism
  - ceval-valid_mao_zedong_thought
  - ceval-valid_education_science
  - ceval-valid_teacher_qualification
  - ceval-valid_high_school_politics
  - ceval-valid_high_school_geography
  - ceval-valid_middle_school_politics
  - ceval-valid_middle_school_geography
  - ceval-valid_modern_chinese_history
  - ceval-valid_ideological_and_moral_cultivation
  - ceval-valid_logic
  - ceval-valid_law
  - ceval-valid_chinese_language_and_literature
  - ceval-valid_art_studies
  - ceval-valid_professional_tour_guide
  - ceval-valid_legal_professional
  - ceval-valid_high_school_chinese
  - ceval-valid_high_school_history
  - ceval-valid_middle_school_history
  - ceval-valid_civil_servant
  - ceval-valid_sports_science
  - ceval-valid_plant_protection
  - ceval-valid_basic_medicine
  - ceval-valid_clinical_medicine
  - ceval-valid_urban_and_rural_planner
  - ceval-valid_accountant
  - ceval-valid_fire_engineer
  - ceval-valid_environmental_impact_assessment_engineer
  - ceval-valid_tax_accountant
  - ceval-valid_physician
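With `weight_by_size: true`, the group score is a size-weighted mean over subtasks rather than a plain average of per-subtask scores. A minimal sketch of that aggregation (function and data shapes are illustrative, not the harness's internal API):

```python
def size_weighted_mean(per_task):
    # per_task: (metric_value, n_examples) pairs, one per subtask.
    total = sum(n for _, n in per_task)
    return sum(value * n for value, n in per_task) / total


# Two subtasks: 0.50 accuracy on 100 docs and 0.80 on 300 docs -> 0.725.
print(size_weighted_mean([(0.50, 100), (0.80, 300)]))
```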
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_accountant.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "accountant"
"description": "以下是中国关于注册会计师的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_accountant"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_advanced_mathematics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "advanced_mathematics"
"description": "以下是中国关于高等数学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_advanced_mathematics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_art_studies.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "art_studies"
"description": "以下是中国关于艺术学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_art_studies"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_business_administration.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "business_administration"
"description": "以下是中国关于工商管理的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_business_administration"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_chinese_language_and_literature.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "chinese_language_and_literature"
"description": "以下是中国关于中国语言文学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_chinese_language_and_literature"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_civil_servant.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "civil_servant"
"description": "以下是中国关于公务员的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_civil_servant"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_clinical_medicine.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "clinical_medicine"
"description": "以下是中国关于临床医学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_clinical_medicine"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_chemistry.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "college_chemistry"
"description": "以下是中国关于大学化学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_college_chemistry"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_economics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "college_economics"
"description": "以下是中国关于大学经济学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_college_economics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_college_programming.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "college_programming"
"description": "以下是中国关于大学编程的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_college_programming"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_architecture.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "computer_architecture"
"description": "以下是中国关于计算机组成的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_computer_architecture"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_computer_network.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "computer_network"
"description": "以下是中国关于计算机网络的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_computer_network"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_discrete_mathematics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "discrete_mathematics"
"description": "以下是中国关于离散数学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_discrete_mathematics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_education_science.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "education_science"
"description": "以下是中国关于教育学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_education_science"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_electrical_engineer.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "electrical_engineer"
"description": "以下是中国关于注册电气工程师的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_electrical_engineer"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_environmental_impact_assessment_engineer.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "environmental_impact_assessment_engineer"
"description": "以下是中国关于环境影响评价工程师的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_environmental_impact_assessment_engineer"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_biology.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "high_school_biology"
"description": "以下是中国关于高中生物的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_high_school_biology"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_geography.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "high_school_geography"
"description": "以下是中国关于高中地理的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_high_school_geography"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_history.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "high_school_history"
"description": "以下是中国关于高中历史的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_high_school_history"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_high_school_physics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "high_school_physics"
"description": "以下是中国关于高中物理的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_high_school_physics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_ideological_and_moral_cultivation.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "ideological_and_moral_cultivation"
"description": "以下是中国关于思想道德修养与法律基础的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_ideological_and_moral_cultivation"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_law.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "law"
"description": "以下是中国关于法学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_law"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_legal_professional.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "legal_professional"
"description": "以下是中国关于法律职业资格的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_legal_professional"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_logic.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "logic"
"description": "以下是中国关于逻辑学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_logic"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_mao_zedong_thought.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "mao_zedong_thought"
"description": "以下是中国关于毛泽东思想和中国特色社会主义理论体系概论的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_mao_zedong_thought"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_marxism.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "marxism"
"description": "以下是中国关于马克思主义基本原理的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_marxism"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_metrology_engineer.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "metrology_engineer"
"description": "以下是中国关于注册计量师的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_metrology_engineer"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_geography.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "middle_school_geography"
"description": "以下是中国关于初中地理的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_middle_school_geography"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_history.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "middle_school_history"
"description": "以下是中国关于初中历史的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_middle_school_history"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_mathematics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "middle_school_mathematics"
"description": "以下是中国关于初中数学的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_middle_school_mathematics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_physics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "middle_school_physics"
"description": "以下是中国关于初中物理的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_middle_school_physics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_middle_school_politics.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "middle_school_politics"
"description": "以下是中国关于初中政治的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_middle_school_politics"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_modern_chinese_history.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "modern_chinese_history"
"description": "以下是中国关于近代史纲要的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_modern_chinese_history"
scripts/yans/lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_operating_system.yaml ADDED
@@ -0,0 +1,4 @@
"dataset_name": "operating_system"
"description": "以下是中国关于操作系统的单项选择题,请选出其中的正确答案。\n\n"
"include": "_default_ceval_yaml"
"task": "ceval-valid_operating_system"