{
"results": {
"hendrycksTest-abstract_algebra": {
"acc": 0.29,
"acc_stderr": 0.04560480215720683,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720683
},
"hendrycksTest-anatomy": {
"acc": 0.34074074074074073,
"acc_stderr": 0.04094376269996793,
"acc_norm": 0.34074074074074073,
"acc_norm_stderr": 0.04094376269996793
},
"hendrycksTest-astronomy": {
"acc": 0.2565789473684211,
"acc_stderr": 0.035541803680256896,
"acc_norm": 0.2565789473684211,
"acc_norm_stderr": 0.035541803680256896
},
"hendrycksTest-business_ethics": {
"acc": 0.37,
"acc_stderr": 0.048523658709391,
"acc_norm": 0.37,
"acc_norm_stderr": 0.048523658709391
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.30943396226415093,
"acc_stderr": 0.028450154794118627,
"acc_norm": 0.30943396226415093,
"acc_norm_stderr": 0.028450154794118627
},
"hendrycksTest-college_biology": {
"acc": 0.3055555555555556,
"acc_stderr": 0.03852084696008534,
"acc_norm": 0.3055555555555556,
"acc_norm_stderr": 0.03852084696008534
},
"hendrycksTest-college_chemistry": {
"acc": 0.26,
"acc_stderr": 0.04408440022768079,
"acc_norm": 0.26,
"acc_norm_stderr": 0.04408440022768079
},
"hendrycksTest-college_computer_science": {
"acc": 0.37,
"acc_stderr": 0.04852365870939099,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"hendrycksTest-college_mathematics": {
"acc": 0.3,
"acc_stderr": 0.046056618647183814,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_medicine": {
"acc": 0.2658959537572254,
"acc_stderr": 0.03368762932259431,
"acc_norm": 0.2658959537572254,
"acc_norm_stderr": 0.03368762932259431
},
"hendrycksTest-college_physics": {
"acc": 0.22549019607843138,
"acc_stderr": 0.041583075330832865,
"acc_norm": 0.22549019607843138,
"acc_norm_stderr": 0.041583075330832865
},
"hendrycksTest-computer_security": {
"acc": 0.4,
"acc_stderr": 0.049236596391733084,
"acc_norm": 0.4,
"acc_norm_stderr": 0.049236596391733084
},
"hendrycksTest-conceptual_physics": {
"acc": 0.3021276595744681,
"acc_stderr": 0.030017554471880554,
"acc_norm": 0.3021276595744681,
"acc_norm_stderr": 0.030017554471880554
},
"hendrycksTest-econometrics": {
"acc": 0.2894736842105263,
"acc_stderr": 0.04266339443159394,
"acc_norm": 0.2894736842105263,
"acc_norm_stderr": 0.04266339443159394
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2896551724137931,
"acc_stderr": 0.037800192304380135,
"acc_norm": 0.2896551724137931,
"acc_norm_stderr": 0.037800192304380135
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2777777777777778,
"acc_stderr": 0.02306818884826111,
"acc_norm": 0.2777777777777778,
"acc_norm_stderr": 0.02306818884826111
},
"hendrycksTest-formal_logic": {
"acc": 0.31746031746031744,
"acc_stderr": 0.04163453031302859,
"acc_norm": 0.31746031746031744,
"acc_norm_stderr": 0.04163453031302859
},
"hendrycksTest-global_facts": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-high_school_biology": {
"acc": 0.27741935483870966,
"acc_stderr": 0.025470196835900055,
"acc_norm": 0.27741935483870966,
"acc_norm_stderr": 0.025470196835900055
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.2660098522167488,
"acc_stderr": 0.031089826002937523,
"acc_norm": 0.2660098522167488,
"acc_norm_stderr": 0.031089826002937523
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-high_school_european_history": {
"acc": 0.3151515151515151,
"acc_stderr": 0.0362773057502241,
"acc_norm": 0.3151515151515151,
"acc_norm_stderr": 0.0362773057502241
},
"hendrycksTest-high_school_geography": {
"acc": 0.2676767676767677,
"acc_stderr": 0.03154449888270286,
"acc_norm": 0.2676767676767677,
"acc_norm_stderr": 0.03154449888270286
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.23834196891191708,
"acc_stderr": 0.030748905363909892,
"acc_norm": 0.23834196891191708,
"acc_norm_stderr": 0.030748905363909892
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.24358974358974358,
"acc_stderr": 0.021763733684173926,
"acc_norm": 0.24358974358974358,
"acc_norm_stderr": 0.021763733684173926
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.23703703703703705,
"acc_stderr": 0.0259288761327661,
"acc_norm": 0.23703703703703705,
"acc_norm_stderr": 0.0259288761327661
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.2605042016806723,
"acc_stderr": 0.02851025151234192,
"acc_norm": 0.2605042016806723,
"acc_norm_stderr": 0.02851025151234192
},
"hendrycksTest-high_school_physics": {
"acc": 0.2980132450331126,
"acc_stderr": 0.037345356767871984,
"acc_norm": 0.2980132450331126,
"acc_norm_stderr": 0.037345356767871984
},
"hendrycksTest-high_school_psychology": {
"acc": 0.3192660550458716,
"acc_stderr": 0.01998782906975001,
"acc_norm": 0.3192660550458716,
"acc_norm_stderr": 0.01998782906975001
},
"hendrycksTest-high_school_statistics": {
"acc": 0.28703703703703703,
"acc_stderr": 0.030851992993257013,
"acc_norm": 0.28703703703703703,
"acc_norm_stderr": 0.030851992993257013
},
"hendrycksTest-high_school_us_history": {
"acc": 0.28921568627450983,
"acc_stderr": 0.03182231867647554,
"acc_norm": 0.28921568627450983,
"acc_norm_stderr": 0.03182231867647554
},
"hendrycksTest-high_school_world_history": {
"acc": 0.2911392405063291,
"acc_stderr": 0.029571601065753374,
"acc_norm": 0.2911392405063291,
"acc_norm_stderr": 0.029571601065753374
},
"hendrycksTest-human_aging": {
"acc": 0.30493273542600896,
"acc_stderr": 0.030898610882477515,
"acc_norm": 0.30493273542600896,
"acc_norm_stderr": 0.030898610882477515
},
"hendrycksTest-human_sexuality": {
"acc": 0.29770992366412213,
"acc_stderr": 0.04010358942462202,
"acc_norm": 0.29770992366412213,
"acc_norm_stderr": 0.04010358942462202
},
"hendrycksTest-international_law": {
"acc": 0.34710743801652894,
"acc_stderr": 0.04345724570292534,
"acc_norm": 0.34710743801652894,
"acc_norm_stderr": 0.04345724570292534
},
"hendrycksTest-jurisprudence": {
"acc": 0.3333333333333333,
"acc_stderr": 0.04557239513497751,
"acc_norm": 0.3333333333333333,
"acc_norm_stderr": 0.04557239513497751
},
"hendrycksTest-logical_fallacies": {
"acc": 0.22699386503067484,
"acc_stderr": 0.03291099578615769,
"acc_norm": 0.22699386503067484,
"acc_norm_stderr": 0.03291099578615769
},
"hendrycksTest-machine_learning": {
"acc": 0.2767857142857143,
"acc_stderr": 0.042466243366976256,
"acc_norm": 0.2767857142857143,
"acc_norm_stderr": 0.042466243366976256
},
"hendrycksTest-management": {
"acc": 0.2524271844660194,
"acc_stderr": 0.04301250399690875,
"acc_norm": 0.2524271844660194,
"acc_norm_stderr": 0.04301250399690875
},
"hendrycksTest-marketing": {
"acc": 0.3333333333333333,
"acc_stderr": 0.030882736974138642,
"acc_norm": 0.3333333333333333,
"acc_norm_stderr": 0.030882736974138642
},
"hendrycksTest-medical_genetics": {
"acc": 0.29,
"acc_stderr": 0.04560480215720683,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720683
},
"hendrycksTest-miscellaneous": {
"acc": 0.3128991060025543,
"acc_stderr": 0.01658093594030407,
"acc_norm": 0.3128991060025543,
"acc_norm_stderr": 0.01658093594030407
},
"hendrycksTest-moral_disputes": {
"acc": 0.3265895953757225,
"acc_stderr": 0.025248264774242832,
"acc_norm": 0.3265895953757225,
"acc_norm_stderr": 0.025248264774242832
},
"hendrycksTest-moral_scenarios": {
"acc": 0.2424581005586592,
"acc_stderr": 0.014333522059217889,
"acc_norm": 0.2424581005586592,
"acc_norm_stderr": 0.014333522059217889
},
"hendrycksTest-nutrition": {
"acc": 0.3137254901960784,
"acc_stderr": 0.026568921015457155,
"acc_norm": 0.3137254901960784,
"acc_norm_stderr": 0.026568921015457155
},
"hendrycksTest-philosophy": {
"acc": 0.31189710610932475,
"acc_stderr": 0.02631185807185416,
"acc_norm": 0.31189710610932475,
"acc_norm_stderr": 0.02631185807185416
},
"hendrycksTest-prehistory": {
"acc": 0.2808641975308642,
"acc_stderr": 0.025006469755799208,
"acc_norm": 0.2808641975308642,
"acc_norm_stderr": 0.025006469755799208
},
"hendrycksTest-professional_accounting": {
"acc": 0.2695035460992908,
"acc_stderr": 0.026469036818590638,
"acc_norm": 0.2695035460992908,
"acc_norm_stderr": 0.026469036818590638
},
"hendrycksTest-professional_law": {
"acc": 0.28292046936114734,
"acc_stderr": 0.011503891323188974,
"acc_norm": 0.28292046936114734,
"acc_norm_stderr": 0.011503891323188974
},
"hendrycksTest-professional_medicine": {
"acc": 0.21691176470588236,
"acc_stderr": 0.025035845227711254,
"acc_norm": 0.21691176470588236,
"acc_norm_stderr": 0.025035845227711254
},
"hendrycksTest-professional_psychology": {
"acc": 0.3104575163398693,
"acc_stderr": 0.01871806705262322,
"acc_norm": 0.3104575163398693,
"acc_norm_stderr": 0.01871806705262322
},
"hendrycksTest-public_relations": {
"acc": 0.39090909090909093,
"acc_stderr": 0.04673752333670237,
"acc_norm": 0.39090909090909093,
"acc_norm_stderr": 0.04673752333670237
},
"hendrycksTest-security_studies": {
"acc": 0.2693877551020408,
"acc_stderr": 0.02840125202902294,
"acc_norm": 0.2693877551020408,
"acc_norm_stderr": 0.02840125202902294
},
"hendrycksTest-sociology": {
"acc": 0.3034825870646766,
"acc_stderr": 0.03251006816458617,
"acc_norm": 0.3034825870646766,
"acc_norm_stderr": 0.03251006816458617
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.36,
"acc_stderr": 0.04824181513244218,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"hendrycksTest-virology": {
"acc": 0.3373493975903614,
"acc_stderr": 0.03680783690727581,
"acc_norm": 0.3373493975903614,
"acc_norm_stderr": 0.03680783690727581
},
"hendrycksTest-world_religions": {
"acc": 0.3391812865497076,
"acc_stderr": 0.03631053496488905,
"acc_norm": 0.3391812865497076,
"acc_norm_stderr": 0.03631053496488905
}
},
"versions": {
"hendrycksTest-abstract_algebra": 1,
"hendrycksTest-anatomy": 1,
"hendrycksTest-astronomy": 1,
"hendrycksTest-business_ethics": 1,
"hendrycksTest-clinical_knowledge": 1,
"hendrycksTest-college_biology": 1,
"hendrycksTest-college_chemistry": 1,
"hendrycksTest-college_computer_science": 1,
"hendrycksTest-college_mathematics": 1,
"hendrycksTest-college_medicine": 1,
"hendrycksTest-college_physics": 1,
"hendrycksTest-computer_security": 1,
"hendrycksTest-conceptual_physics": 1,
"hendrycksTest-econometrics": 1,
"hendrycksTest-electrical_engineering": 1,
"hendrycksTest-elementary_mathematics": 1,
"hendrycksTest-formal_logic": 1,
"hendrycksTest-global_facts": 1,
"hendrycksTest-high_school_biology": 1,
"hendrycksTest-high_school_chemistry": 1,
"hendrycksTest-high_school_computer_science": 1,
"hendrycksTest-high_school_european_history": 1,
"hendrycksTest-high_school_geography": 1,
"hendrycksTest-high_school_government_and_politics": 1,
"hendrycksTest-high_school_macroeconomics": 1,
"hendrycksTest-high_school_mathematics": 1,
"hendrycksTest-high_school_microeconomics": 1,
"hendrycksTest-high_school_physics": 1,
"hendrycksTest-high_school_psychology": 1,
"hendrycksTest-high_school_statistics": 1,
"hendrycksTest-high_school_us_history": 1,
"hendrycksTest-high_school_world_history": 1,
"hendrycksTest-human_aging": 1,
"hendrycksTest-human_sexuality": 1,
"hendrycksTest-international_law": 1,
"hendrycksTest-jurisprudence": 1,
"hendrycksTest-logical_fallacies": 1,
"hendrycksTest-machine_learning": 1,
"hendrycksTest-management": 1,
"hendrycksTest-marketing": 1,
"hendrycksTest-medical_genetics": 1,
"hendrycksTest-miscellaneous": 1,
"hendrycksTest-moral_disputes": 1,
"hendrycksTest-moral_scenarios": 1,
"hendrycksTest-nutrition": 1,
"hendrycksTest-philosophy": 1,
"hendrycksTest-prehistory": 1,
"hendrycksTest-professional_accounting": 1,
"hendrycksTest-professional_law": 1,
"hendrycksTest-professional_medicine": 1,
"hendrycksTest-professional_psychology": 1,
"hendrycksTest-public_relations": 1,
"hendrycksTest-security_studies": 1,
"hendrycksTest-sociology": 1,
"hendrycksTest-us_foreign_policy": 1,
"hendrycksTest-virology": 1,
"hendrycksTest-world_religions": 1
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=workdir_7b/ckpt_351",
"num_fewshot": 5,
"batch_size": "8",
"batch_sizes": [],
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
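
A minimal sketch of how a results file like this can be summarized, assuming it is saved locally as eval_mmlu.json (that file name, and the choice of an unweighted macro-average over subtasks, are assumptions for illustration, not part of the harness output itself):

import json
from statistics import mean

# Load the lm-evaluation-harness output shown above (assumed local path).
with open("eval_mmlu.json") as f:
    data = json.load(f)

# Each entry under "results" is one MMLU subtask reporting "acc",
# "acc_norm", and their standard errors.
accs = [task["acc"] for task in data["results"].values()]
print(f"{len(accs)} subtasks, macro-average acc = {mean(accs):.4f}")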