MMLU-by-task / llama-30B_mmlu_5-shot.json
{
"results": {
"hendrycksTest-high_school_world_history": {
"acc": 0.6962025316455697,
"acc_stderr": 0.029936696387138598,
"acc_norm": 0.569620253164557,
"acc_norm_stderr": 0.032230171959375976
},
"hendrycksTest-formal_logic": {
"acc": 0.42063492063492064,
"acc_stderr": 0.04415438226743743,
"acc_norm": 0.3968253968253968,
"acc_norm_stderr": 0.043758884927270605
},
"hendrycksTest-human_aging": {
"acc": 0.672645739910314,
"acc_stderr": 0.03149384670994131,
"acc_norm": 0.3632286995515695,
"acc_norm_stderr": 0.032277904428505
},
"hendrycksTest-international_law": {
"acc": 0.7024793388429752,
"acc_stderr": 0.04173349148083499,
"acc_norm": 0.768595041322314,
"acc_norm_stderr": 0.03849856098794088
},
"hendrycksTest-security_studies": {
"acc": 0.5714285714285714,
"acc_stderr": 0.031680911612338825,
"acc_norm": 0.40408163265306124,
"acc_norm_stderr": 0.0314147080258659
},
"hendrycksTest-medical_genetics": {
"acc": 0.6,
"acc_stderr": 0.049236596391733084,
"acc_norm": 0.54,
"acc_norm_stderr": 0.05009082659620332
},
"hendrycksTest-econometrics": {
"acc": 0.3508771929824561,
"acc_stderr": 0.044895393502707,
"acc_norm": 0.3157894736842105,
"acc_norm_stderr": 0.043727482902780064
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.5153846153846153,
"acc_stderr": 0.025339003010106515,
"acc_norm": 0.4153846153846154,
"acc_norm_stderr": 0.024985354923102332
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.79,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.59,
"acc_norm_stderr": 0.049431107042371025
},
"hendrycksTest-logical_fallacies": {
"acc": 0.6993865030674846,
"acc_stderr": 0.03602511318806771,
"acc_norm": 0.5398773006134969,
"acc_norm_stderr": 0.039158572914369714
},
"hendrycksTest-prehistory": {
"acc": 0.6635802469135802,
"acc_stderr": 0.026289734945952926,
"acc_norm": 0.42901234567901236,
"acc_norm_stderr": 0.027538925613470867
},
"hendrycksTest-professional_psychology": {
"acc": 0.5882352941176471,
"acc_stderr": 0.019910377463105932,
"acc_norm": 0.43300653594771243,
"acc_norm_stderr": 0.02004544247332422
},
"hendrycksTest-professional_accounting": {
"acc": 0.3971631205673759,
"acc_stderr": 0.029189805673587105,
"acc_norm": 0.33687943262411346,
"acc_norm_stderr": 0.02819553487396673
},
"hendrycksTest-college_biology": {
"acc": 0.6111111111111112,
"acc_stderr": 0.04076663253918567,
"acc_norm": 0.4236111111111111,
"acc_norm_stderr": 0.04132125019723369
},
"hendrycksTest-high_school_biology": {
"acc": 0.6709677419354839,
"acc_stderr": 0.02672949906834996,
"acc_norm": 0.5451612903225806,
"acc_norm_stderr": 0.028327743091561074
},
"hendrycksTest-philosophy": {
"acc": 0.6752411575562701,
"acc_stderr": 0.02659678228769704,
"acc_norm": 0.5016077170418006,
"acc_norm_stderr": 0.02839794490780661
},
"hendrycksTest-high_school_european_history": {
"acc": 0.696969696969697,
"acc_stderr": 0.03588624800091707,
"acc_norm": 0.5636363636363636,
"acc_norm_stderr": 0.03872592983524754
},
"hendrycksTest-college_medicine": {
"acc": 0.5144508670520231,
"acc_stderr": 0.03810871630454764,
"acc_norm": 0.43352601156069365,
"acc_norm_stderr": 0.03778621079092055
},
"hendrycksTest-professional_medicine": {
"acc": 0.5551470588235294,
"acc_stderr": 0.03018753206032938,
"acc_norm": 0.35661764705882354,
"acc_norm_stderr": 0.02909720956841195
},
"hendrycksTest-moral_scenarios": {
"acc": 0.34301675977653634,
"acc_stderr": 0.015876912673057724,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.39901477832512317,
"acc_stderr": 0.03445487686264716,
"acc_norm": 0.3694581280788177,
"acc_norm_stderr": 0.03395970381998573
},
"hendrycksTest-high_school_physics": {
"acc": 0.31788079470198677,
"acc_stderr": 0.038020397601079024,
"acc_norm": 0.31125827814569534,
"acc_norm_stderr": 0.03780445850526733
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.8082901554404145,
"acc_stderr": 0.028408953626245282,
"acc_norm": 0.6113989637305699,
"acc_norm_stderr": 0.03517739796373132
},
"hendrycksTest-high_school_geography": {
"acc": 0.7575757575757576,
"acc_stderr": 0.030532892233932026,
"acc_norm": 0.5505050505050505,
"acc_norm_stderr": 0.0354413249194797
},
"hendrycksTest-global_facts": {
"acc": 0.47,
"acc_stderr": 0.05016135580465919,
"acc_norm": 0.37,
"acc_norm_stderr": 0.04852365870939099
},
"hendrycksTest-professional_law": {
"acc": 0.4002607561929596,
"acc_stderr": 0.012513582529136213,
"acc_norm": 0.3435462842242503,
"acc_norm_stderr": 0.012128961174190158
},
"hendrycksTest-college_mathematics": {
"acc": 0.37,
"acc_stderr": 0.048523658709391,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_physics": {
"acc": 0.23529411764705882,
"acc_stderr": 0.04220773659171452,
"acc_norm": 0.29411764705882354,
"acc_norm_stderr": 0.04533838195929774
},
"hendrycksTest-high_school_statistics": {
"acc": 0.4351851851851852,
"acc_stderr": 0.03381200005643525,
"acc_norm": 0.35648148148148145,
"acc_norm_stderr": 0.032664783315272714
},
"hendrycksTest-machine_learning": {
"acc": 0.4017857142857143,
"acc_stderr": 0.04653333146973646,
"acc_norm": 0.30357142857142855,
"acc_norm_stderr": 0.04364226155841044
},
"hendrycksTest-public_relations": {
"acc": 0.6454545454545455,
"acc_stderr": 0.045820048415054174,
"acc_norm": 0.4090909090909091,
"acc_norm_stderr": 0.047093069786618966
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.61,
"acc_stderr": 0.04902071300001974,
"acc_norm": 0.47,
"acc_norm_stderr": 0.05016135580465919
},
"hendrycksTest-high_school_psychology": {
"acc": 0.7706422018348624,
"acc_stderr": 0.018025349724618684,
"acc_norm": 0.5541284403669725,
"acc_norm_stderr": 0.021311335009708582
},
"hendrycksTest-virology": {
"acc": 0.4939759036144578,
"acc_stderr": 0.03892212195333045,
"acc_norm": 0.3433734939759036,
"acc_norm_stderr": 0.03696584317010601
},
"hendrycksTest-marketing": {
"acc": 0.8461538461538461,
"acc_stderr": 0.023636873317489294,
"acc_norm": 0.7649572649572649,
"acc_norm_stderr": 0.027778835904935437
},
"hendrycksTest-human_sexuality": {
"acc": 0.7022900763358778,
"acc_stderr": 0.04010358942462203,
"acc_norm": 0.46564885496183206,
"acc_norm_stderr": 0.04374928560599738
},
"hendrycksTest-sociology": {
"acc": 0.7611940298507462,
"acc_stderr": 0.03014777593540922,
"acc_norm": 0.6616915422885572,
"acc_norm_stderr": 0.033455630703391914
},
"hendrycksTest-college_computer_science": {
"acc": 0.43,
"acc_stderr": 0.049756985195624284,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695236
},
"hendrycksTest-conceptual_physics": {
"acc": 0.5106382978723404,
"acc_stderr": 0.03267862331014063,
"acc_norm": 0.3276595744680851,
"acc_norm_stderr": 0.030683020843231004
},
"hendrycksTest-anatomy": {
"acc": 0.5185185185185185,
"acc_stderr": 0.043163785995113245,
"acc_norm": 0.4074074074074074,
"acc_norm_stderr": 0.04244633238353228
},
"hendrycksTest-miscellaneous": {
"acc": 0.8186462324393359,
"acc_stderr": 0.013778693778464062,
"acc_norm": 0.6143039591315453,
"acc_norm_stderr": 0.017406476619212907
},
"hendrycksTest-jurisprudence": {
"acc": 0.6666666666666666,
"acc_stderr": 0.04557239513497751,
"acc_norm": 0.5555555555555556,
"acc_norm_stderr": 0.04803752235190193
},
"hendrycksTest-moral_disputes": {
"acc": 0.6184971098265896,
"acc_stderr": 0.026152198619726792,
"acc_norm": 0.4595375722543353,
"acc_norm_stderr": 0.026830805998952236
},
"hendrycksTest-high_school_us_history": {
"acc": 0.7205882352941176,
"acc_stderr": 0.031493281045079556,
"acc_norm": 0.553921568627451,
"acc_norm_stderr": 0.03488845451304974
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.25925925925925924,
"acc_stderr": 0.026719240783712177,
"acc_norm": 0.3148148148148148,
"acc_norm_stderr": 0.02831753349606648
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.5840336134453782,
"acc_stderr": 0.032016501007396114,
"acc_norm": 0.4831932773109244,
"acc_norm_stderr": 0.03246013680375308
},
"hendrycksTest-astronomy": {
"acc": 0.5723684210526315,
"acc_stderr": 0.04026097083296564,
"acc_norm": 0.5657894736842105,
"acc_norm_stderr": 0.04033565667848319
},
"hendrycksTest-world_religions": {
"acc": 0.8128654970760234,
"acc_stderr": 0.029913127232368043,
"acc_norm": 0.7660818713450293,
"acc_norm_stderr": 0.03246721765117825
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.5320754716981132,
"acc_stderr": 0.03070948699255654,
"acc_norm": 0.4641509433962264,
"acc_norm_stderr": 0.030693675018458003
},
"hendrycksTest-college_chemistry": {
"acc": 0.31,
"acc_stderr": 0.04648231987117316,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-abstract_algebra": {
"acc": 0.26,
"acc_stderr": 0.04408440022768078,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-business_ethics": {
"acc": 0.67,
"acc_stderr": 0.04725815626252609,
"acc_norm": 0.48,
"acc_norm_stderr": 0.050211673156867795
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.4417989417989418,
"acc_stderr": 0.02557625706125384,
"acc_norm": 0.37037037037037035,
"acc_norm_stderr": 0.024870815251057075
},
"hendrycksTest-management": {
"acc": 0.7184466019417476,
"acc_stderr": 0.044532548363264673,
"acc_norm": 0.5533980582524272,
"acc_norm_stderr": 0.04922424153458933
},
"hendrycksTest-electrical_engineering": {
"acc": 0.5172413793103449,
"acc_stderr": 0.04164188720169375,
"acc_norm": 0.38620689655172413,
"acc_norm_stderr": 0.040573247344190336
},
"hendrycksTest-nutrition": {
"acc": 0.6111111111111112,
"acc_stderr": 0.02791405551046801,
"acc_norm": 0.5032679738562091,
"acc_norm_stderr": 0.028629305194003543
},
"hendrycksTest-computer_security": {
"acc": 0.66,
"acc_stderr": 0.04760952285695237,
"acc_norm": 0.58,
"acc_norm_stderr": 0.049604496374885836
}
},
"versions": {
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-management": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-computer_security": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/30B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
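
A minimal sketch (not part of the original file) of how the per-task results above could be aggregated into a single unweighted average accuracy. It assumes the JSON is saved as `llama-30B_mmlu_5-shot.json` next to the script; only the `results` structure shown above is relied on.

```python
import json

# Load the harness output shown above; filename assumed from the repo path.
with open("llama-30B_mmlu_5-shot.json") as f:
    data = json.load(f)

# Each entry in "results" is one MMLU subtask with an "acc" field.
accs = [task["acc"] for task in data["results"].values()]

print(f"Tasks: {len(accs)}")                                  # 57 MMLU subtasks
print(f"Mean acc (unweighted): {sum(accs) / len(accs):.4f}")  # simple per-task average
```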