diff --git "a/Results/CMMLU/results.json" "b/Results/CMMLU/results.json" new file mode 100644--- /dev/null +++ "b/Results/CMMLU/results.json" @@ -0,0 +1,3466 @@ +{ + "results": { + "cmmlu": { + "acc_norm,none": 0.5373855983422552, + "acc_norm_stderr,none": 0.004487253022655707, + "acc,none": 0.5373855983422552, + "acc_stderr,none": 0.004487253022655707, + "alias": "cmmlu" + }, + "cmmlu_agronomy": { + "acc,none": 0.46153846153846156, + "acc_stderr,none": 0.03846153846153845, + "acc_norm,none": 0.46153846153846156, + "acc_norm_stderr,none": 0.03846153846153845, + "alias": " - cmmlu_agronomy" + }, + "cmmlu_anatomy": { + "acc,none": 0.47297297297297297, + "acc_stderr,none": 0.04117901352838131, + "acc_norm,none": 0.47297297297297297, + "acc_norm_stderr,none": 0.04117901352838131, + "alias": " - cmmlu_anatomy" + }, + "cmmlu_ancient_chinese": { + "acc,none": 0.25609756097560976, + "acc_stderr,none": 0.03418746588364997, + "acc_norm,none": 0.25609756097560976, + "acc_norm_stderr,none": 0.03418746588364997, + "alias": " - cmmlu_ancient_chinese" + }, + "cmmlu_arts": { + "acc,none": 0.65625, + "acc_stderr,none": 0.03766668927755763, + "acc_norm,none": 0.65625, + "acc_norm_stderr,none": 0.03766668927755763, + "alias": " - cmmlu_arts" + }, + "cmmlu_astronomy": { + "acc,none": 0.32727272727272727, + "acc_stderr,none": 0.03663974994391244, + "acc_norm,none": 0.32727272727272727, + "acc_norm_stderr,none": 0.03663974994391244, + "alias": " - cmmlu_astronomy" + }, + "cmmlu_business_ethics": { + "acc,none": 0.583732057416268, + "acc_stderr,none": 0.034179175970231256, + "acc_norm,none": 0.583732057416268, + "acc_norm_stderr,none": 0.034179175970231256, + "alias": " - cmmlu_business_ethics" + }, + "cmmlu_chinese_civil_service_exam": { + "acc,none": 0.46875, + "acc_stderr,none": 0.039575057062617526, + "acc_norm,none": 0.46875, + "acc_norm_stderr,none": 0.039575057062617526, + "alias": " - cmmlu_chinese_civil_service_exam" + }, + "cmmlu_chinese_driving_rule": { + "acc,none": 
0.8549618320610687, + "acc_stderr,none": 0.03088466108951538, + "acc_norm,none": 0.8549618320610687, + "acc_norm_stderr,none": 0.03088466108951538, + "alias": " - cmmlu_chinese_driving_rule" + }, + "cmmlu_chinese_food_culture": { + "acc,none": 0.47058823529411764, + "acc_stderr,none": 0.04295863196118948, + "acc_norm,none": 0.47058823529411764, + "acc_norm_stderr,none": 0.04295863196118948, + "alias": " - cmmlu_chinese_food_culture" + }, + "cmmlu_chinese_foreign_policy": { + "acc,none": 0.5233644859813084, + "acc_stderr,none": 0.048511241723296745, + "acc_norm,none": 0.5233644859813084, + "acc_norm_stderr,none": 0.048511241723296745, + "alias": " - cmmlu_chinese_foreign_policy" + }, + "cmmlu_chinese_history": { + "acc,none": 0.6130030959752322, + "acc_stderr,none": 0.02714295604836581, + "acc_norm,none": 0.6130030959752322, + "acc_norm_stderr,none": 0.02714295604836581, + "alias": " - cmmlu_chinese_history" + }, + "cmmlu_chinese_literature": { + "acc,none": 0.39215686274509803, + "acc_stderr,none": 0.03426712349247271, + "acc_norm,none": 0.39215686274509803, + "acc_norm_stderr,none": 0.03426712349247271, + "alias": " - cmmlu_chinese_literature" + }, + "cmmlu_chinese_teacher_qualification": { + "acc,none": 0.7206703910614525, + "acc_stderr,none": 0.0336292223871436, + "acc_norm,none": 0.7206703910614525, + "acc_norm_stderr,none": 0.0336292223871436, + "alias": " - cmmlu_chinese_teacher_qualification" + }, + "cmmlu_clinical_knowledge": { + "acc,none": 0.5021097046413502, + "acc_stderr,none": 0.032546938018020076, + "acc_norm,none": 0.5021097046413502, + "acc_norm_stderr,none": 0.032546938018020076, + "alias": " - cmmlu_clinical_knowledge" + }, + "cmmlu_college_actuarial_science": { + "acc,none": 0.29245283018867924, + "acc_stderr,none": 0.04439263906199628, + "acc_norm,none": 0.29245283018867924, + "acc_norm_stderr,none": 0.04439263906199628, + "alias": " - cmmlu_college_actuarial_science" + }, + "cmmlu_college_education": { + "acc,none": 0.6355140186915887, + 
"acc_stderr,none": 0.046746602211107775, + "acc_norm,none": 0.6355140186915887, + "acc_norm_stderr,none": 0.046746602211107775, + "alias": " - cmmlu_college_education" + }, + "cmmlu_college_engineering_hydrology": { + "acc,none": 0.39622641509433965, + "acc_stderr,none": 0.047732492983673595, + "acc_norm,none": 0.39622641509433965, + "acc_norm_stderr,none": 0.047732492983673595, + "alias": " - cmmlu_college_engineering_hydrology" + }, + "cmmlu_college_law": { + "acc,none": 0.5555555555555556, + "acc_stderr,none": 0.04803752235190193, + "acc_norm,none": 0.5555555555555556, + "acc_norm_stderr,none": 0.04803752235190193, + "alias": " - cmmlu_college_law" + }, + "cmmlu_college_mathematics": { + "acc,none": 0.24761904761904763, + "acc_stderr,none": 0.04232473532055043, + "acc_norm,none": 0.24761904761904763, + "acc_norm_stderr,none": 0.04232473532055043, + "alias": " - cmmlu_college_mathematics" + }, + "cmmlu_college_medical_statistics": { + "acc,none": 0.5094339622641509, + "acc_stderr,none": 0.04878631739837742, + "acc_norm,none": 0.5094339622641509, + "acc_norm_stderr,none": 0.04878631739837742, + "alias": " - cmmlu_college_medical_statistics" + }, + "cmmlu_college_medicine": { + "acc,none": 0.5531135531135531, + "acc_stderr,none": 0.030145416591160445, + "acc_norm,none": 0.5531135531135531, + "acc_norm_stderr,none": 0.030145416591160445, + "alias": " - cmmlu_college_medicine" + }, + "cmmlu_computer_science": { + "acc,none": 0.5882352941176471, + "acc_stderr,none": 0.0345423658538061, + "acc_norm,none": 0.5882352941176471, + "acc_norm_stderr,none": 0.0345423658538061, + "alias": " - cmmlu_computer_science" + }, + "cmmlu_computer_security": { + "acc,none": 0.6549707602339181, + "acc_stderr,none": 0.036459813773888065, + "acc_norm,none": 0.6549707602339181, + "acc_norm_stderr,none": 0.036459813773888065, + "alias": " - cmmlu_computer_security" + }, + "cmmlu_conceptual_physics": { + "acc,none": 0.6122448979591837, + "acc_stderr,none": 0.040324121989960035, + 
"acc_norm,none": 0.6122448979591837, + "acc_norm_stderr,none": 0.040324121989960035, + "alias": " - cmmlu_conceptual_physics" + }, + "cmmlu_construction_project_management": { + "acc,none": 0.4676258992805755, + "acc_stderr,none": 0.04247351413431025, + "acc_norm,none": 0.4676258992805755, + "acc_norm_stderr,none": 0.04247351413431025, + "alias": " - cmmlu_construction_project_management" + }, + "cmmlu_economics": { + "acc,none": 0.5534591194968553, + "acc_stderr,none": 0.03954985017675704, + "acc_norm,none": 0.5534591194968553, + "acc_norm_stderr,none": 0.03954985017675704, + "alias": " - cmmlu_economics" + }, + "cmmlu_education": { + "acc,none": 0.6319018404907976, + "acc_stderr,none": 0.03789213935838396, + "acc_norm,none": 0.6319018404907976, + "acc_norm_stderr,none": 0.03789213935838396, + "alias": " - cmmlu_education" + }, + "cmmlu_electrical_engineering": { + "acc,none": 0.6627906976744186, + "acc_stderr,none": 0.03615263198871634, + "acc_norm,none": 0.6627906976744186, + "acc_norm_stderr,none": 0.03615263198871634, + "alias": " - cmmlu_electrical_engineering" + }, + "cmmlu_elementary_chinese": { + "acc,none": 0.5436507936507936, + "acc_stderr,none": 0.03143922285806298, + "acc_norm,none": 0.5436507936507936, + "acc_norm_stderr,none": 0.03143922285806298, + "alias": " - cmmlu_elementary_chinese" + }, + "cmmlu_elementary_commonsense": { + "acc,none": 0.5909090909090909, + "acc_stderr,none": 0.03502975799413007, + "acc_norm,none": 0.5909090909090909, + "acc_norm_stderr,none": 0.03502975799413007, + "alias": " - cmmlu_elementary_commonsense" + }, + "cmmlu_elementary_information_and_technology": { + "acc,none": 0.7605042016806722, + "acc_stderr,none": 0.027722065493361255, + "acc_norm,none": 0.7605042016806722, + "acc_norm_stderr,none": 0.027722065493361255, + "alias": " - cmmlu_elementary_information_and_technology" + }, + "cmmlu_elementary_mathematics": { + "acc,none": 0.3391304347826087, + "acc_stderr,none": 0.03128408938822598, + "acc_norm,none": 
0.3391304347826087, + "acc_norm_stderr,none": 0.03128408938822598, + "alias": " - cmmlu_elementary_mathematics" + }, + "cmmlu_ethnology": { + "acc,none": 0.48148148148148145, + "acc_stderr,none": 0.043163785995113245, + "acc_norm,none": 0.48148148148148145, + "acc_norm_stderr,none": 0.043163785995113245, + "alias": " - cmmlu_ethnology" + }, + "cmmlu_food_science": { + "acc,none": 0.5104895104895105, + "acc_stderr,none": 0.041949833400405354, + "acc_norm,none": 0.5104895104895105, + "acc_norm_stderr,none": 0.041949833400405354, + "alias": " - cmmlu_food_science" + }, + "cmmlu_genetics": { + "acc,none": 0.4431818181818182, + "acc_stderr,none": 0.03755161736785979, + "acc_norm,none": 0.4431818181818182, + "acc_norm_stderr,none": 0.03755161736785979, + "alias": " - cmmlu_genetics" + }, + "cmmlu_global_facts": { + "acc,none": 0.5234899328859061, + "acc_stderr,none": 0.04105436598675506, + "acc_norm,none": 0.5234899328859061, + "acc_norm_stderr,none": 0.04105436598675506, + "alias": " - cmmlu_global_facts" + }, + "cmmlu_high_school_biology": { + "acc,none": 0.5266272189349113, + "acc_stderr,none": 0.03852109743620031, + "acc_norm,none": 0.5266272189349113, + "acc_norm_stderr,none": 0.03852109743620031, + "alias": " - cmmlu_high_school_biology" + }, + "cmmlu_high_school_chemistry": { + "acc,none": 0.5227272727272727, + "acc_stderr,none": 0.04364005015655943, + "acc_norm,none": 0.5227272727272727, + "acc_norm_stderr,none": 0.04364005015655943, + "alias": " - cmmlu_high_school_chemistry" + }, + "cmmlu_high_school_geography": { + "acc,none": 0.4830508474576271, + "acc_stderr,none": 0.04619845024855636, + "acc_norm,none": 0.4830508474576271, + "acc_norm_stderr,none": 0.04619845024855636, + "alias": " - cmmlu_high_school_geography" + }, + "cmmlu_high_school_mathematics": { + "acc,none": 0.2865853658536585, + "acc_stderr,none": 0.03541638332993505, + "acc_norm,none": 0.2865853658536585, + "acc_norm_stderr,none": 0.03541638332993505, + "alias": " - cmmlu_high_school_mathematics" 
+ }, + "cmmlu_high_school_physics": { + "acc,none": 0.4636363636363636, + "acc_stderr,none": 0.04776449162396197, + "acc_norm,none": 0.4636363636363636, + "acc_norm_stderr,none": 0.04776449162396197, + "alias": " - cmmlu_high_school_physics" + }, + "cmmlu_high_school_politics": { + "acc,none": 0.5454545454545454, + "acc_stderr,none": 0.04178532361608382, + "acc_norm,none": 0.5454545454545454, + "acc_norm_stderr,none": 0.04178532361608382, + "alias": " - cmmlu_high_school_politics" + }, + "cmmlu_human_sexuality": { + "acc,none": 0.47619047619047616, + "acc_stderr,none": 0.04467062628403273, + "acc_norm,none": 0.47619047619047616, + "acc_norm_stderr,none": 0.04467062628403273, + "alias": " - cmmlu_human_sexuality" + }, + "cmmlu_international_law": { + "acc,none": 0.44324324324324327, + "acc_stderr,none": 0.03662223951330472, + "acc_norm,none": 0.44324324324324327, + "acc_norm_stderr,none": 0.03662223951330472, + "alias": " - cmmlu_international_law" + }, + "cmmlu_journalism": { + "acc,none": 0.5232558139534884, + "acc_stderr,none": 0.03819457472859222, + "acc_norm,none": 0.5232558139534884, + "acc_norm_stderr,none": 0.03819457472859222, + "alias": " - cmmlu_journalism" + }, + "cmmlu_jurisprudence": { + "acc,none": 0.5717761557177615, + "acc_stderr,none": 0.02443748537372801, + "acc_norm,none": 0.5717761557177615, + "acc_norm_stderr,none": 0.02443748537372801, + "alias": " - cmmlu_jurisprudence" + }, + "cmmlu_legal_and_moral_basis": { + "acc,none": 0.9018691588785047, + "acc_stderr,none": 0.02038378016035643, + "acc_norm,none": 0.9018691588785047, + "acc_norm_stderr,none": 0.02038378016035643, + "alias": " - cmmlu_legal_and_moral_basis" + }, + "cmmlu_logical": { + "acc,none": 0.4796747967479675, + "acc_stderr,none": 0.04523045598338889, + "acc_norm,none": 0.4796747967479675, + "acc_norm_stderr,none": 0.04523045598338889, + "alias": " - cmmlu_logical" + }, + "cmmlu_machine_learning": { + "acc,none": 0.39344262295081966, + "acc_stderr,none": 0.04441032615723205, + 
"acc_norm,none": 0.39344262295081966, + "acc_norm_stderr,none": 0.04441032615723205, + "alias": " - cmmlu_machine_learning" + }, + "cmmlu_management": { + "acc,none": 0.638095238095238, + "acc_stderr,none": 0.03324043951593504, + "acc_norm,none": 0.638095238095238, + "acc_norm_stderr,none": 0.03324043951593504, + "alias": " - cmmlu_management" + }, + "cmmlu_marketing": { + "acc,none": 0.4888888888888889, + "acc_stderr,none": 0.037362525904368636, + "acc_norm,none": 0.4888888888888889, + "acc_norm_stderr,none": 0.037362525904368636, + "alias": " - cmmlu_marketing" + }, + "cmmlu_marxist_theory": { + "acc,none": 0.9047619047619048, + "acc_stderr,none": 0.02140886181268027, + "acc_norm,none": 0.9047619047619048, + "acc_norm_stderr,none": 0.02140886181268027, + "alias": " - cmmlu_marxist_theory" + }, + "cmmlu_modern_chinese": { + "acc,none": 0.4051724137931034, + "acc_stderr,none": 0.04577902774948876, + "acc_norm,none": 0.4051724137931034, + "acc_norm_stderr,none": 0.04577902774948876, + "alias": " - cmmlu_modern_chinese" + }, + "cmmlu_nutrition": { + "acc,none": 0.496551724137931, + "acc_stderr,none": 0.04166567577101579, + "acc_norm,none": 0.496551724137931, + "acc_norm_stderr,none": 0.04166567577101579, + "alias": " - cmmlu_nutrition" + }, + "cmmlu_philosophy": { + "acc,none": 0.5904761904761905, + "acc_stderr,none": 0.0482196555595126, + "acc_norm,none": 0.5904761904761905, + "acc_norm_stderr,none": 0.0482196555595126, + "alias": " - cmmlu_philosophy" + }, + "cmmlu_professional_accounting": { + "acc,none": 0.6571428571428571, + "acc_stderr,none": 0.035984201709851615, + "acc_norm,none": 0.6571428571428571, + "acc_norm_stderr,none": 0.035984201709851615, + "alias": " - cmmlu_professional_accounting" + }, + "cmmlu_professional_law": { + "acc,none": 0.5023696682464455, + "acc_stderr,none": 0.03450289047052379, + "acc_norm,none": 0.5023696682464455, + "acc_norm_stderr,none": 0.03450289047052379, + "alias": " - cmmlu_professional_law" + }, + 
"cmmlu_professional_medicine": { + "acc,none": 0.4095744680851064, + "acc_stderr,none": 0.02539413177240804, + "acc_norm,none": 0.4095744680851064, + "acc_norm_stderr,none": 0.02539413177240804, + "alias": " - cmmlu_professional_medicine" + }, + "cmmlu_professional_psychology": { + "acc,none": 0.603448275862069, + "acc_stderr,none": 0.03218577394203013, + "acc_norm,none": 0.603448275862069, + "acc_norm_stderr,none": 0.03218577394203013, + "alias": " - cmmlu_professional_psychology" + }, + "cmmlu_public_relations": { + "acc,none": 0.5172413793103449, + "acc_stderr,none": 0.03799168868945867, + "acc_norm,none": 0.5172413793103449, + "acc_norm_stderr,none": 0.03799168868945867, + "alias": " - cmmlu_public_relations" + }, + "cmmlu_security_study": { + "acc,none": 0.7111111111111111, + "acc_stderr,none": 0.0391545063041425, + "acc_norm,none": 0.7111111111111111, + "acc_norm_stderr,none": 0.0391545063041425, + "alias": " - cmmlu_security_study" + }, + "cmmlu_sociology": { + "acc,none": 0.5088495575221239, + "acc_stderr,none": 0.03332811194650094, + "acc_norm,none": 0.5088495575221239, + "acc_norm_stderr,none": 0.03332811194650094, + "alias": " - cmmlu_sociology" + }, + "cmmlu_sports_science": { + "acc,none": 0.5333333333333333, + "acc_stderr,none": 0.03895658065271846, + "acc_norm,none": 0.5333333333333333, + "acc_norm_stderr,none": 0.03895658065271846, + "alias": " - cmmlu_sports_science" + }, + "cmmlu_traditional_chinese_medicine": { + "acc,none": 0.4918918918918919, + "acc_stderr,none": 0.036855642198496893, + "acc_norm,none": 0.4918918918918919, + "acc_norm_stderr,none": 0.036855642198496893, + "alias": " - cmmlu_traditional_chinese_medicine" + }, + "cmmlu_virology": { + "acc,none": 0.5621301775147929, + "acc_stderr,none": 0.038276861175393674, + "acc_norm,none": 0.5621301775147929, + "acc_norm_stderr,none": 0.038276861175393674, + "alias": " - cmmlu_virology" + }, + "cmmlu_world_history": { + "acc,none": 0.5714285714285714, + "acc_stderr,none": 0.0391230398217976, + 
"acc_norm,none": 0.5714285714285714, + "acc_norm_stderr,none": 0.0391230398217976, + "alias": " - cmmlu_world_history" + }, + "cmmlu_world_religions": { + "acc,none": 0.46875, + "acc_stderr,none": 0.039575057062617526, + "acc_norm,none": 0.46875, + "acc_norm_stderr,none": 0.039575057062617526, + "alias": " - cmmlu_world_religions" + } + }, + "groups": { + "cmmlu": { + "acc_norm,none": 0.5373855983422552, + "acc_norm_stderr,none": 0.004487253022655707, + "acc,none": 0.5373855983422552, + "acc_stderr,none": 0.004487253022655707, + "alias": "cmmlu" + } + }, + "group_subtasks": { + "cmmlu": [ + "cmmlu_sports_science", + "cmmlu_professional_accounting", + "cmmlu_machine_learning", + "cmmlu_human_sexuality", + "cmmlu_high_school_chemistry", + "cmmlu_college_mathematics", + "cmmlu_college_actuarial_science", + "cmmlu_arts", + "cmmlu_world_religions", + "cmmlu_virology", + "cmmlu_traditional_chinese_medicine", + "cmmlu_construction_project_management", + "cmmlu_conceptual_physics", + "cmmlu_college_education", + "cmmlu_world_history", + "cmmlu_security_study", + "cmmlu_global_facts", + "cmmlu_education", + "cmmlu_computer_security", + "cmmlu_college_medicine", + "cmmlu_clinical_knowledge", + "cmmlu_astronomy", + "cmmlu_sociology", + "cmmlu_marketing", + "cmmlu_journalism", + "cmmlu_high_school_physics", + "cmmlu_genetics", + "cmmlu_food_science", + "cmmlu_elementary_mathematics", + "cmmlu_elementary_commonsense", + "cmmlu_elementary_chinese", + "cmmlu_economics", + "cmmlu_business_ethics", + "cmmlu_ancient_chinese", + "cmmlu_anatomy", + "cmmlu_logical", + "cmmlu_jurisprudence", + "cmmlu_international_law", + "cmmlu_high_school_mathematics", + "cmmlu_high_school_geography", + "cmmlu_ethnology", + "cmmlu_chinese_teacher_qualification", + "cmmlu_chinese_food_culture", + "cmmlu_public_relations", + "cmmlu_philosophy", + "cmmlu_management", + "cmmlu_legal_and_moral_basis", + "cmmlu_high_school_biology", + "cmmlu_electrical_engineering", + "cmmlu_computer_science", + 
"cmmlu_chinese_literature", + "cmmlu_chinese_driving_rule", + "cmmlu_professional_medicine", + "cmmlu_nutrition", + "cmmlu_modern_chinese", + "cmmlu_elementary_information_and_technology", + "cmmlu_college_medical_statistics", + "cmmlu_chinese_civil_service_exam", + "cmmlu_professional_psychology", + "cmmlu_professional_law", + "cmmlu_marxist_theory", + "cmmlu_high_school_politics", + "cmmlu_college_law", + "cmmlu_college_engineering_hydrology", + "cmmlu_chinese_history", + "cmmlu_chinese_foreign_policy", + "cmmlu_agronomy" + ] + }, + "configs": { + "cmmlu_agronomy": { + "task": "cmmlu_agronomy", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "agronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于农学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_anatomy": { + "task": "cmmlu_anatomy", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "anatomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于解剖学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_ancient_chinese": { + "task": "cmmlu_ancient_chinese", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "ancient_chinese", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于古汉语的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_arts": { + "task": "cmmlu_arts", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "arts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_astronomy": { + "task": "cmmlu_astronomy", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "astronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于天文学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_business_ethics": { + "task": "cmmlu_business_ethics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "business_ethics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于商业伦理的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_civil_service_exam": { + "task": "cmmlu_chinese_civil_service_exam", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_civil_service_exam", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国公务员考试的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_driving_rule": { + "task": "cmmlu_chinese_driving_rule", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_driving_rule", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国驾驶规则的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_food_culture": { + "task": "cmmlu_chinese_food_culture", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_food_culture", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国饮食文化的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_foreign_policy": { + "task": "cmmlu_chinese_foreign_policy", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_foreign_policy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国外交政策的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_history": { + "task": "cmmlu_chinese_history", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国历史的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_literature": { + "task": "cmmlu_chinese_literature", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_literature", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国文学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_chinese_teacher_qualification": { + "task": "cmmlu_chinese_teacher_qualification", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "chinese_teacher_qualification", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中国教师资格的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_clinical_knowledge": { + "task": "cmmlu_clinical_knowledge", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "clinical_knowledge", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于临床知识的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_actuarial_science": { + "task": "cmmlu_college_actuarial_science", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_actuarial_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学精算学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_education": { + "task": "cmmlu_college_education", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_education", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学教育学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_engineering_hydrology": { + "task": "cmmlu_college_engineering_hydrology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_engineering_hydrology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学工程水文学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_law": { + "task": "cmmlu_college_law", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学法律的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_mathematics": { + "task": "cmmlu_college_mathematics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学数学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_medical_statistics": { + "task": "cmmlu_college_medical_statistics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_medical_statistics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学医学统计的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_college_medicine": { + "task": "cmmlu_college_medicine", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "college_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于大学医学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_computer_science": { + "task": "cmmlu_computer_science", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于计算机科学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_computer_security": { + "task": "cmmlu_computer_security", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "computer_security", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于计算机安全的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_conceptual_physics": { + "task": "cmmlu_conceptual_physics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "conceptual_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于概念物理学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_construction_project_management": { + "task": "cmmlu_construction_project_management", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "construction_project_management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于建设工程管理的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_economics": { + "task": "cmmlu_economics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "economics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于经济学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_education": { + "task": "cmmlu_education", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "education", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于教育学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_electrical_engineering": { + "task": "cmmlu_electrical_engineering", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "electrical_engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于电气工程的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_elementary_chinese": { + "task": "cmmlu_elementary_chinese", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "elementary_chinese", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于小学语文的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_elementary_commonsense": { + "task": "cmmlu_elementary_commonsense", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "elementary_commonsense", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于小学常识的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_elementary_information_and_technology": { + "task": "cmmlu_elementary_information_and_technology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "elementary_information_and_technology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于小学信息技术的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_elementary_mathematics": { + "task": "cmmlu_elementary_mathematics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "elementary_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于初等数学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_ethnology": { + "task": "cmmlu_ethnology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "ethnology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于民族学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_food_science": { + "task": "cmmlu_food_science", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "food_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于食品科学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_genetics": { + "task": "cmmlu_genetics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "genetics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于遗传学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_global_facts": { + "task": "cmmlu_global_facts", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "global_facts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于全球事实的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_high_school_biology": { + "task": "cmmlu_high_school_biology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "high_school_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于高中生物的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_high_school_chemistry": { + "task": "cmmlu_high_school_chemistry", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "high_school_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于高中化学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_high_school_geography": { + "task": "cmmlu_high_school_geography", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "high_school_geography", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于高中地理的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_high_school_mathematics": { + "task": "cmmlu_high_school_mathematics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "high_school_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于高中数学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_high_school_physics": { + "task": "cmmlu_high_school_physics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "high_school_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于高中物理学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_high_school_politics": { + "task": "cmmlu_high_school_politics", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "high_school_politics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于高中政治的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_human_sexuality": { + "task": "cmmlu_human_sexuality", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "human_sexuality", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于人类性行为的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_international_law": { + "task": "cmmlu_international_law", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "international_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于国际法学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_journalism": { + "task": "cmmlu_journalism", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "journalism", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于新闻学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_jurisprudence": { + "task": "cmmlu_jurisprudence", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "jurisprudence", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于法理学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_legal_and_moral_basis": { + "task": "cmmlu_legal_and_moral_basis", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "legal_and_moral_basis", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于法律与道德基础的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_logical": { + "task": "cmmlu_logical", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "logical", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于逻辑学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_machine_learning": { + "task": "cmmlu_machine_learning", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "machine_learning", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于机器学习的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_management": { + "task": "cmmlu_management", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于管理学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_marketing": { + "task": "cmmlu_marketing", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于市场营销的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_marxist_theory": { + "task": "cmmlu_marxist_theory", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "marxist_theory", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于马克思主义理论的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_modern_chinese": { + "task": "cmmlu_modern_chinese", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "modern_chinese", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于现代汉语的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_nutrition": { + "task": "cmmlu_nutrition", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "nutrition", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于营养学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_philosophy": { + "task": "cmmlu_philosophy", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "philosophy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于哲学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_professional_accounting": { + "task": "cmmlu_professional_accounting", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "professional_accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于专业会计的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_professional_law": { + "task": "cmmlu_professional_law", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "professional_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于专业法学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_professional_medicine": { + "task": "cmmlu_professional_medicine", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "professional_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于专业医学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_professional_psychology": { + "task": "cmmlu_professional_psychology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "professional_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于专业心理学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_public_relations": { + "task": "cmmlu_public_relations", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "public_relations", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于公共关系的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_security_study": { + "task": "cmmlu_security_study", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "security_study", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于安全研究的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_sociology": { + "task": "cmmlu_sociology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于社会学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_sports_science": { + "task": "cmmlu_sports_science", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "sports_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于体育学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_traditional_chinese_medicine": { + "task": "cmmlu_traditional_chinese_medicine", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "traditional_chinese_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于中医中药的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_virology": { + "task": "cmmlu_virology", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "virology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于病毒学的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_world_history": { + "task": "cmmlu_world_history", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "world_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于世界历史的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + }, + "cmmlu_world_religions": { + "task": "cmmlu_world_religions", + "group": "cmmlu", + "dataset_path": "haonan-li/cmmlu", + "dataset_name": "world_religions", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:", + "doc_to_target": "{{['A', 'B', 'C', 'D'].index(Answer)}}", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "以下是关于世界宗教的单项选择题,请直接给出正确答案的选项。\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0.0 + } + } + }, + "versions": { + "cmmlu_agronomy": 0.0, + "cmmlu_anatomy": 0.0, + "cmmlu_ancient_chinese": 0.0, + "cmmlu_arts": 0.0, + "cmmlu_astronomy": 0.0, + "cmmlu_business_ethics": 0.0, + "cmmlu_chinese_civil_service_exam": 0.0, + "cmmlu_chinese_driving_rule": 0.0, + "cmmlu_chinese_food_culture": 0.0, + "cmmlu_chinese_foreign_policy": 0.0, + "cmmlu_chinese_history": 0.0, + "cmmlu_chinese_literature": 0.0, + "cmmlu_chinese_teacher_qualification": 0.0, + 
"cmmlu_clinical_knowledge": 0.0, + "cmmlu_college_actuarial_science": 0.0, + "cmmlu_college_education": 0.0, + "cmmlu_college_engineering_hydrology": 0.0, + "cmmlu_college_law": 0.0, + "cmmlu_college_mathematics": 0.0, + "cmmlu_college_medical_statistics": 0.0, + "cmmlu_college_medicine": 0.0, + "cmmlu_computer_science": 0.0, + "cmmlu_computer_security": 0.0, + "cmmlu_conceptual_physics": 0.0, + "cmmlu_construction_project_management": 0.0, + "cmmlu_economics": 0.0, + "cmmlu_education": 0.0, + "cmmlu_electrical_engineering": 0.0, + "cmmlu_elementary_chinese": 0.0, + "cmmlu_elementary_commonsense": 0.0, + "cmmlu_elementary_information_and_technology": 0.0, + "cmmlu_elementary_mathematics": 0.0, + "cmmlu_ethnology": 0.0, + "cmmlu_food_science": 0.0, + "cmmlu_genetics": 0.0, + "cmmlu_global_facts": 0.0, + "cmmlu_high_school_biology": 0.0, + "cmmlu_high_school_chemistry": 0.0, + "cmmlu_high_school_geography": 0.0, + "cmmlu_high_school_mathematics": 0.0, + "cmmlu_high_school_physics": 0.0, + "cmmlu_high_school_politics": 0.0, + "cmmlu_human_sexuality": 0.0, + "cmmlu_international_law": 0.0, + "cmmlu_journalism": 0.0, + "cmmlu_jurisprudence": 0.0, + "cmmlu_legal_and_moral_basis": 0.0, + "cmmlu_logical": 0.0, + "cmmlu_machine_learning": 0.0, + "cmmlu_management": 0.0, + "cmmlu_marketing": 0.0, + "cmmlu_marxist_theory": 0.0, + "cmmlu_modern_chinese": 0.0, + "cmmlu_nutrition": 0.0, + "cmmlu_philosophy": 0.0, + "cmmlu_professional_accounting": 0.0, + "cmmlu_professional_law": 0.0, + "cmmlu_professional_medicine": 0.0, + "cmmlu_professional_psychology": 0.0, + "cmmlu_public_relations": 0.0, + "cmmlu_security_study": 0.0, + "cmmlu_sociology": 0.0, + "cmmlu_sports_science": 0.0, + "cmmlu_traditional_chinese_medicine": 0.0, + "cmmlu_virology": 0.0, + "cmmlu_world_history": 0.0, + "cmmlu_world_religions": 0.0 + }, + "n-shot": { + "cmmlu": 0, + "cmmlu_agronomy": 0, + "cmmlu_anatomy": 0, + "cmmlu_ancient_chinese": 0, + "cmmlu_arts": 0, + "cmmlu_astronomy": 0, + 
"cmmlu_business_ethics": 0, + "cmmlu_chinese_civil_service_exam": 0, + "cmmlu_chinese_driving_rule": 0, + "cmmlu_chinese_food_culture": 0, + "cmmlu_chinese_foreign_policy": 0, + "cmmlu_chinese_history": 0, + "cmmlu_chinese_literature": 0, + "cmmlu_chinese_teacher_qualification": 0, + "cmmlu_clinical_knowledge": 0, + "cmmlu_college_actuarial_science": 0, + "cmmlu_college_education": 0, + "cmmlu_college_engineering_hydrology": 0, + "cmmlu_college_law": 0, + "cmmlu_college_mathematics": 0, + "cmmlu_college_medical_statistics": 0, + "cmmlu_college_medicine": 0, + "cmmlu_computer_science": 0, + "cmmlu_computer_security": 0, + "cmmlu_conceptual_physics": 0, + "cmmlu_construction_project_management": 0, + "cmmlu_economics": 0, + "cmmlu_education": 0, + "cmmlu_electrical_engineering": 0, + "cmmlu_elementary_chinese": 0, + "cmmlu_elementary_commonsense": 0, + "cmmlu_elementary_information_and_technology": 0, + "cmmlu_elementary_mathematics": 0, + "cmmlu_ethnology": 0, + "cmmlu_food_science": 0, + "cmmlu_genetics": 0, + "cmmlu_global_facts": 0, + "cmmlu_high_school_biology": 0, + "cmmlu_high_school_chemistry": 0, + "cmmlu_high_school_geography": 0, + "cmmlu_high_school_mathematics": 0, + "cmmlu_high_school_physics": 0, + "cmmlu_high_school_politics": 0, + "cmmlu_human_sexuality": 0, + "cmmlu_international_law": 0, + "cmmlu_journalism": 0, + "cmmlu_jurisprudence": 0, + "cmmlu_legal_and_moral_basis": 0, + "cmmlu_logical": 0, + "cmmlu_machine_learning": 0, + "cmmlu_management": 0, + "cmmlu_marketing": 0, + "cmmlu_marxist_theory": 0, + "cmmlu_modern_chinese": 0, + "cmmlu_nutrition": 0, + "cmmlu_philosophy": 0, + "cmmlu_professional_accounting": 0, + "cmmlu_professional_law": 0, + "cmmlu_professional_medicine": 0, + "cmmlu_professional_psychology": 0, + "cmmlu_public_relations": 0, + "cmmlu_security_study": 0, + "cmmlu_sociology": 0, + "cmmlu_sports_science": 0, + "cmmlu_traditional_chinese_medicine": 0, + "cmmlu_virology": 0, + "cmmlu_world_history": 0, + 
"cmmlu_world_religions": 0 + }, + "config": { + "model": "hf", + "model_args": "pretrained=/gemini/platform/public/trained_models/sft/Qwen2_0.5B-2024.07.02-label.lingzhichat+finance+account+law-exp.00/checkpoint-1750", + "batch_size": "auto", + "batch_sizes": [ + 64 + ], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "3196e907", + "date": 1719990981.9163392, + "pretty_env_info": "PyTorch version: 2.4.0a0+07cecf4168.nv24.05\nIs debug build: False\nCUDA used to build PyTorch: 12.4\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.29.2\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.4.0-144-generic-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.4.131\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H800\nGPU 1: NVIDIA H800\nGPU 2: NVIDIA H800\nGPU 3: NVIDIA H800\nGPU 4: NVIDIA H800\nGPU 5: NVIDIA H800\nGPU 6: NVIDIA H800\nGPU 7: NVIDIA H800\n\nNvidia driver version: 535.129.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_precompiled.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_engines_runtime_compiled.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_graph.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_heuristic.so.9.1.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops.so.9.1.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nVendor ID: 
GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8462Y+\nCPU family: 6\nModel: 143\nThread(s) per core: 2\nCore(s) per socket: 32\nSocket(s): 2\nStepping: 8\nFrequency boost: enabled\nCPU max MHz: 2801.0000\nCPU min MHz: 800.0000\nBogoMIPS: 5600.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq dtes64 ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm 3dnowprefetch cpuid_fault epb cat_l3 cat_l2 cdp_l3 invpcid_single cdp_l2 ssbd mba ibrs ibpb stibp ibrs_enhanced tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb intel_pt avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 wbnoinvd dtherm ida arat pln pts avx512vbmi umip pku ospke waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid cldemote movdiri movdir64b md_clear pconfig flush_l1d arch_capabilities\nVirtualization: VT-x\nL1d cache: 3 MiB (64 instances)\nL1i cache: 2 MiB (64 instances)\nL2 cache: 128 MiB (64 instances)\nL3 cache: 120 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-31,64-95\nNUMA node1 CPU(s): 32-63,96-127\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer 
sanitization\nVulnerability Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.24.4\n[pip3] onnx==1.16.0\n[pip3] optree==0.11.0\n[pip3] pytorch-quantization==2.1.2\n[pip3] pytorch-triton==3.0.0+989adb9a2\n[pip3] torch==2.4.0a0+07cecf4168.nv24.5\n[pip3] torch-tensorrt==2.4.0a0\n[pip3] torchvision==0.19.0a0\n[conda] Could not collect", + "transformers_version": "4.41.2", + "upper_git_hash": null +} \ No newline at end of file