Spaces:
Runtime error
Runtime error
MLLM_leaderboard
/
eval-results
/JosephusCheung
/Pwen-14B-Chat-20_30
/results_2023-10-08T18-25-24.586385.json
{ | |
"config_general": { | |
"model_name": "JosephusCheung/Pwen-14B-Chat-20_30", | |
"model_sha": "e878e1f1f7b533c32beb8e06ebcf0cfa23f3fe9b", | |
"model_size": "26.54 GB", | |
"model_dtype": "torch.bfloat16", | |
"lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", | |
"num_few_shot_default": 0, | |
"num_fewshot_seeds": 1, | |
"override_batch_size": 1, | |
"max_samples": null, | |
"job_id": "" | |
}, | |
"results": { | |
"harness|arc:challenge|25": { | |
"acc": 0.5290102389078498, | |
"acc_stderr": 0.014586776355294321, | |
"acc_norm": 0.5614334470989761, | |
"acc_norm_stderr": 0.014500682618212864 | |
}, | |
"harness|hellaswag|10": { | |
"acc": 0.611929894443338, | |
"acc_stderr": 0.004863147544177516, | |
"acc_norm": 0.7978490340569607, | |
"acc_norm_stderr": 0.004007834585541846 | |
}, | |
"harness|hendrycksTest-abstract_algebra|5": { | |
"acc": 0.32, | |
"acc_stderr": 0.04688261722621505, | |
"acc_norm": 0.32, | |
"acc_norm_stderr": 0.04688261722621505 | |
}, | |
"harness|hendrycksTest-anatomy|5": { | |
"acc": 0.562962962962963, | |
"acc_stderr": 0.04284958639753401, | |
"acc_norm": 0.562962962962963, | |
"acc_norm_stderr": 0.04284958639753401 | |
}, | |
"harness|hendrycksTest-astronomy|5": { | |
"acc": 0.6973684210526315, | |
"acc_stderr": 0.03738520676119668, | |
"acc_norm": 0.6973684210526315, | |
"acc_norm_stderr": 0.03738520676119668 | |
}, | |
"harness|hendrycksTest-business_ethics|5": { | |
"acc": 0.68, | |
"acc_stderr": 0.046882617226215034, | |
"acc_norm": 0.68, | |
"acc_norm_stderr": 0.046882617226215034 | |
}, | |
"harness|hendrycksTest-clinical_knowledge|5": { | |
"acc": 0.6528301886792452, | |
"acc_stderr": 0.029300101705549652, | |
"acc_norm": 0.6528301886792452, | |
"acc_norm_stderr": 0.029300101705549652 | |
}, | |
"harness|hendrycksTest-college_biology|5": { | |
"acc": 0.7083333333333334, | |
"acc_stderr": 0.038009680605548594, | |
"acc_norm": 0.7083333333333334, | |
"acc_norm_stderr": 0.038009680605548594 | |
}, | |
"harness|hendrycksTest-college_chemistry|5": { | |
"acc": 0.41, | |
"acc_stderr": 0.049431107042371025, | |
"acc_norm": 0.41, | |
"acc_norm_stderr": 0.049431107042371025 | |
}, | |
"harness|hendrycksTest-college_computer_science|5": { | |
"acc": 0.53, | |
"acc_stderr": 0.050161355804659205, | |
"acc_norm": 0.53, | |
"acc_norm_stderr": 0.050161355804659205 | |
}, | |
"harness|hendrycksTest-college_mathematics|5": { | |
"acc": 0.41, | |
"acc_stderr": 0.049431107042371025, | |
"acc_norm": 0.41, | |
"acc_norm_stderr": 0.049431107042371025 | |
}, | |
"harness|hendrycksTest-college_medicine|5": { | |
"acc": 0.6184971098265896, | |
"acc_stderr": 0.03703851193099522, | |
"acc_norm": 0.6184971098265896, | |
"acc_norm_stderr": 0.03703851193099522 | |
}, | |
"harness|hendrycksTest-college_physics|5": { | |
"acc": 0.4117647058823529, | |
"acc_stderr": 0.048971049527263666, | |
"acc_norm": 0.4117647058823529, | |
"acc_norm_stderr": 0.048971049527263666 | |
}, | |
"harness|hendrycksTest-computer_security|5": { | |
"acc": 0.7, | |
"acc_stderr": 0.046056618647183814, | |
"acc_norm": 0.7, | |
"acc_norm_stderr": 0.046056618647183814 | |
}, | |
"harness|hendrycksTest-conceptual_physics|5": { | |
"acc": 0.43829787234042555, | |
"acc_stderr": 0.03243618636108101, | |
"acc_norm": 0.43829787234042555, | |
"acc_norm_stderr": 0.03243618636108101 | |
}, | |
"harness|hendrycksTest-econometrics|5": { | |
"acc": 0.4473684210526316, | |
"acc_stderr": 0.04677473004491199, | |
"acc_norm": 0.4473684210526316, | |
"acc_norm_stderr": 0.04677473004491199 | |
}, | |
"harness|hendrycksTest-electrical_engineering|5": { | |
"acc": 0.5586206896551724, | |
"acc_stderr": 0.04137931034482757, | |
"acc_norm": 0.5586206896551724, | |
"acc_norm_stderr": 0.04137931034482757 | |
}, | |
"harness|hendrycksTest-elementary_mathematics|5": { | |
"acc": 0.4947089947089947, | |
"acc_stderr": 0.02574986828855657, | |
"acc_norm": 0.4947089947089947, | |
"acc_norm_stderr": 0.02574986828855657 | |
}, | |
"harness|hendrycksTest-formal_logic|5": { | |
"acc": 0.3968253968253968, | |
"acc_stderr": 0.0437588849272706, | |
"acc_norm": 0.3968253968253968, | |
"acc_norm_stderr": 0.0437588849272706 | |
}, | |
"harness|hendrycksTest-global_facts|5": { | |
"acc": 0.41, | |
"acc_stderr": 0.049431107042371025, | |
"acc_norm": 0.41, | |
"acc_norm_stderr": 0.049431107042371025 | |
}, | |
"harness|hendrycksTest-high_school_biology|5": { | |
"acc": 0.7032258064516129, | |
"acc_stderr": 0.025988500792411905, | |
"acc_norm": 0.7032258064516129, | |
"acc_norm_stderr": 0.025988500792411905 | |
}, | |
"harness|hendrycksTest-high_school_chemistry|5": { | |
"acc": 0.541871921182266, | |
"acc_stderr": 0.03505630140785741, | |
"acc_norm": 0.541871921182266, | |
"acc_norm_stderr": 0.03505630140785741 | |
}, | |
"harness|hendrycksTest-high_school_computer_science|5": { | |
"acc": 0.64, | |
"acc_stderr": 0.04824181513244218, | |
"acc_norm": 0.64, | |
"acc_norm_stderr": 0.04824181513244218 | |
}, | |
"harness|hendrycksTest-high_school_european_history|5": { | |
"acc": 0.4909090909090909, | |
"acc_stderr": 0.039036986477484395, | |
"acc_norm": 0.4909090909090909, | |
"acc_norm_stderr": 0.039036986477484395 | |
}, | |
"harness|hendrycksTest-high_school_geography|5": { | |
"acc": 0.797979797979798, | |
"acc_stderr": 0.02860620428922987, | |
"acc_norm": 0.797979797979798, | |
"acc_norm_stderr": 0.02860620428922987 | |
}, | |
"harness|hendrycksTest-high_school_government_and_politics|5": { | |
"acc": 0.7927461139896373, | |
"acc_stderr": 0.02925282329180363, | |
"acc_norm": 0.7927461139896373, | |
"acc_norm_stderr": 0.02925282329180363 | |
}, | |
"harness|hendrycksTest-high_school_macroeconomics|5": { | |
"acc": 0.6076923076923076, | |
"acc_stderr": 0.02475600038213095, | |
"acc_norm": 0.6076923076923076, | |
"acc_norm_stderr": 0.02475600038213095 | |
}, | |
"harness|hendrycksTest-high_school_mathematics|5": { | |
"acc": 0.362962962962963, | |
"acc_stderr": 0.02931820364520686, | |
"acc_norm": 0.362962962962963, | |
"acc_norm_stderr": 0.02931820364520686 | |
}, | |
"harness|hendrycksTest-high_school_microeconomics|5": { | |
"acc": 0.6680672268907563, | |
"acc_stderr": 0.03058869701378364, | |
"acc_norm": 0.6680672268907563, | |
"acc_norm_stderr": 0.03058869701378364 | |
}, | |
"harness|hendrycksTest-high_school_physics|5": { | |
"acc": 0.423841059602649, | |
"acc_stderr": 0.040348466786033974, | |
"acc_norm": 0.423841059602649, | |
"acc_norm_stderr": 0.040348466786033974 | |
}, | |
"harness|hendrycksTest-high_school_psychology|5": { | |
"acc": 0.7834862385321101, | |
"acc_stderr": 0.01765871059444313, | |
"acc_norm": 0.7834862385321101, | |
"acc_norm_stderr": 0.01765871059444313 | |
}, | |
"harness|hendrycksTest-high_school_statistics|5": { | |
"acc": 0.5046296296296297, | |
"acc_stderr": 0.03409825519163572, | |
"acc_norm": 0.5046296296296297, | |
"acc_norm_stderr": 0.03409825519163572 | |
}, | |
"harness|hendrycksTest-high_school_us_history|5": { | |
"acc": 0.6176470588235294, | |
"acc_stderr": 0.03410785338904719, | |
"acc_norm": 0.6176470588235294, | |
"acc_norm_stderr": 0.03410785338904719 | |
}, | |
"harness|hendrycksTest-high_school_world_history|5": { | |
"acc": 0.7637130801687764, | |
"acc_stderr": 0.02765215314415927, | |
"acc_norm": 0.7637130801687764, | |
"acc_norm_stderr": 0.02765215314415927 | |
}, | |
"harness|hendrycksTest-human_aging|5": { | |
"acc": 0.5874439461883408, | |
"acc_stderr": 0.03304062175449297, | |
"acc_norm": 0.5874439461883408, | |
"acc_norm_stderr": 0.03304062175449297 | |
}, | |
"harness|hendrycksTest-human_sexuality|5": { | |
"acc": 0.6946564885496184, | |
"acc_stderr": 0.0403931497872456, | |
"acc_norm": 0.6946564885496184, | |
"acc_norm_stderr": 0.0403931497872456 | |
}, | |
"harness|hendrycksTest-international_law|5": { | |
"acc": 0.7768595041322314, | |
"acc_stderr": 0.03800754475228733, | |
"acc_norm": 0.7768595041322314, | |
"acc_norm_stderr": 0.03800754475228733 | |
}, | |
"harness|hendrycksTest-jurisprudence|5": { | |
"acc": 0.6388888888888888, | |
"acc_stderr": 0.04643454608906275, | |
"acc_norm": 0.6388888888888888, | |
"acc_norm_stderr": 0.04643454608906275 | |
}, | |
"harness|hendrycksTest-logical_fallacies|5": { | |
"acc": 0.6748466257668712, | |
"acc_stderr": 0.03680350371286461, | |
"acc_norm": 0.6748466257668712, | |
"acc_norm_stderr": 0.03680350371286461 | |
}, | |
"harness|hendrycksTest-machine_learning|5": { | |
"acc": 0.4017857142857143, | |
"acc_stderr": 0.04653333146973646, | |
"acc_norm": 0.4017857142857143, | |
"acc_norm_stderr": 0.04653333146973646 | |
}, | |
"harness|hendrycksTest-management|5": { | |
"acc": 0.7864077669902912, | |
"acc_stderr": 0.040580420156460344, | |
"acc_norm": 0.7864077669902912, | |
"acc_norm_stderr": 0.040580420156460344 | |
}, | |
"harness|hendrycksTest-marketing|5": { | |
"acc": 0.8076923076923077, | |
"acc_stderr": 0.02581923325648372, | |
"acc_norm": 0.8076923076923077, | |
"acc_norm_stderr": 0.02581923325648372 | |
}, | |
"harness|hendrycksTest-medical_genetics|5": { | |
"acc": 0.69, | |
"acc_stderr": 0.04648231987117316, | |
"acc_norm": 0.69, | |
"acc_norm_stderr": 0.04648231987117316 | |
}, | |
"harness|hendrycksTest-miscellaneous|5": { | |
"acc": 0.7496807151979565, | |
"acc_stderr": 0.015491088951494583, | |
"acc_norm": 0.7496807151979565, | |
"acc_norm_stderr": 0.015491088951494583 | |
}, | |
"harness|hendrycksTest-moral_disputes|5": { | |
"acc": 0.6271676300578035, | |
"acc_stderr": 0.026033890613576277, | |
"acc_norm": 0.6271676300578035, | |
"acc_norm_stderr": 0.026033890613576277 | |
}, | |
"harness|hendrycksTest-moral_scenarios|5": { | |
"acc": 0.38212290502793295, | |
"acc_stderr": 0.016251139711570765, | |
"acc_norm": 0.38212290502793295, | |
"acc_norm_stderr": 0.016251139711570765 | |
}, | |
"harness|hendrycksTest-nutrition|5": { | |
"acc": 0.6928104575163399, | |
"acc_stderr": 0.026415601914388992, | |
"acc_norm": 0.6928104575163399, | |
"acc_norm_stderr": 0.026415601914388992 | |
}, | |
"harness|hendrycksTest-philosophy|5": { | |
"acc": 0.6527331189710611, | |
"acc_stderr": 0.027040745502307336, | |
"acc_norm": 0.6527331189710611, | |
"acc_norm_stderr": 0.027040745502307336 | |
}, | |
"harness|hendrycksTest-prehistory|5": { | |
"acc": 0.6388888888888888, | |
"acc_stderr": 0.026725868809100793, | |
"acc_norm": 0.6388888888888888, | |
"acc_norm_stderr": 0.026725868809100793 | |
}, | |
"harness|hendrycksTest-professional_accounting|5": { | |
"acc": 0.46099290780141844, | |
"acc_stderr": 0.02973659252642444, | |
"acc_norm": 0.46099290780141844, | |
"acc_norm_stderr": 0.02973659252642444 | |
}, | |
"harness|hendrycksTest-professional_law|5": { | |
"acc": 0.423728813559322, | |
"acc_stderr": 0.012620785155885996, | |
"acc_norm": 0.423728813559322, | |
"acc_norm_stderr": 0.012620785155885996 | |
}, | |
"harness|hendrycksTest-professional_medicine|5": { | |
"acc": 0.6286764705882353, | |
"acc_stderr": 0.02934980313976587, | |
"acc_norm": 0.6286764705882353, | |
"acc_norm_stderr": 0.02934980313976587 | |
}, | |
"harness|hendrycksTest-professional_psychology|5": { | |
"acc": 0.6062091503267973, | |
"acc_stderr": 0.019766211991073056, | |
"acc_norm": 0.6062091503267973, | |
"acc_norm_stderr": 0.019766211991073056 | |
}, | |
"harness|hendrycksTest-public_relations|5": { | |
"acc": 0.6090909090909091, | |
"acc_stderr": 0.04673752333670239, | |
"acc_norm": 0.6090909090909091, | |
"acc_norm_stderr": 0.04673752333670239 | |
}, | |
"harness|hendrycksTest-security_studies|5": { | |
"acc": 0.6857142857142857, | |
"acc_stderr": 0.02971932942241748, | |
"acc_norm": 0.6857142857142857, | |
"acc_norm_stderr": 0.02971932942241748 | |
}, | |
"harness|hendrycksTest-sociology|5": { | |
"acc": 0.7810945273631841, | |
"acc_stderr": 0.029239174636647, | |
"acc_norm": 0.7810945273631841, | |
"acc_norm_stderr": 0.029239174636647 | |
}, | |
"harness|hendrycksTest-us_foreign_policy|5": { | |
"acc": 0.82, | |
"acc_stderr": 0.038612291966536934, | |
"acc_norm": 0.82, | |
"acc_norm_stderr": 0.038612291966536934 | |
}, | |
"harness|hendrycksTest-virology|5": { | |
"acc": 0.4578313253012048, | |
"acc_stderr": 0.0387862677100236, | |
"acc_norm": 0.4578313253012048, | |
"acc_norm_stderr": 0.0387862677100236 | |
}, | |
"harness|hendrycksTest-world_religions|5": { | |
"acc": 0.783625730994152, | |
"acc_stderr": 0.03158149539338734, | |
"acc_norm": 0.783625730994152, | |
"acc_norm_stderr": 0.03158149539338734 | |
}, | |
"harness|truthfulqa:mc|0": { | |
"mc1": 0.3268053855569155, | |
"mc1_stderr": 0.01641987473113503, | |
"mc2": 0.4701781470729953, | |
"mc2_stderr": 0.014777434418052576 | |
}, | |
"all": { | |
"acc": 0.5990888068369461, | |
"acc_stderr": 0.03431005125414193, | |
"acc_norm": 0.6027895245963486, | |
"acc_norm_stderr": 0.03429409520845181, | |
"mc1": 0.3268053855569155, | |
"mc1_stderr": 0.01641987473113503, | |
"mc2": 0.4701781470729953, | |
"mc2_stderr": 0.014777434418052576 | |
} | |
}, | |
"versions": { | |
"harness|arc:challenge|25": 0, | |
"harness|hellaswag|10": 0, | |
"harness|hendrycksTest-abstract_algebra|5": 1, | |
"harness|hendrycksTest-anatomy|5": 1, | |
"harness|hendrycksTest-astronomy|5": 1, | |
"harness|hendrycksTest-business_ethics|5": 1, | |
"harness|hendrycksTest-clinical_knowledge|5": 1, | |
"harness|hendrycksTest-college_biology|5": 1, | |
"harness|hendrycksTest-college_chemistry|5": 1, | |
"harness|hendrycksTest-college_computer_science|5": 1, | |
"harness|hendrycksTest-college_mathematics|5": 1, | |
"harness|hendrycksTest-college_medicine|5": 1, | |
"harness|hendrycksTest-college_physics|5": 1, | |
"harness|hendrycksTest-computer_security|5": 1, | |
"harness|hendrycksTest-conceptual_physics|5": 1, | |
"harness|hendrycksTest-econometrics|5": 1, | |
"harness|hendrycksTest-electrical_engineering|5": 1, | |
"harness|hendrycksTest-elementary_mathematics|5": 1, | |
"harness|hendrycksTest-formal_logic|5": 1, | |
"harness|hendrycksTest-global_facts|5": 1, | |
"harness|hendrycksTest-high_school_biology|5": 1, | |
"harness|hendrycksTest-high_school_chemistry|5": 1, | |
"harness|hendrycksTest-high_school_computer_science|5": 1, | |
"harness|hendrycksTest-high_school_european_history|5": 1, | |
"harness|hendrycksTest-high_school_geography|5": 1, | |
"harness|hendrycksTest-high_school_government_and_politics|5": 1, | |
"harness|hendrycksTest-high_school_macroeconomics|5": 1, | |
"harness|hendrycksTest-high_school_mathematics|5": 1, | |
"harness|hendrycksTest-high_school_microeconomics|5": 1, | |
"harness|hendrycksTest-high_school_physics|5": 1, | |
"harness|hendrycksTest-high_school_psychology|5": 1, | |
"harness|hendrycksTest-high_school_statistics|5": 1, | |
"harness|hendrycksTest-high_school_us_history|5": 1, | |
"harness|hendrycksTest-high_school_world_history|5": 1, | |
"harness|hendrycksTest-human_aging|5": 1, | |
"harness|hendrycksTest-human_sexuality|5": 1, | |
"harness|hendrycksTest-international_law|5": 1, | |
"harness|hendrycksTest-jurisprudence|5": 1, | |
"harness|hendrycksTest-logical_fallacies|5": 1, | |
"harness|hendrycksTest-machine_learning|5": 1, | |
"harness|hendrycksTest-management|5": 1, | |
"harness|hendrycksTest-marketing|5": 1, | |
"harness|hendrycksTest-medical_genetics|5": 1, | |
"harness|hendrycksTest-miscellaneous|5": 1, | |
"harness|hendrycksTest-moral_disputes|5": 1, | |
"harness|hendrycksTest-moral_scenarios|5": 1, | |
"harness|hendrycksTest-nutrition|5": 1, | |
"harness|hendrycksTest-philosophy|5": 1, | |
"harness|hendrycksTest-prehistory|5": 1, | |
"harness|hendrycksTest-professional_accounting|5": 1, | |
"harness|hendrycksTest-professional_law|5": 1, | |
"harness|hendrycksTest-professional_medicine|5": 1, | |
"harness|hendrycksTest-professional_psychology|5": 1, | |
"harness|hendrycksTest-public_relations|5": 1, | |
"harness|hendrycksTest-security_studies|5": 1, | |
"harness|hendrycksTest-sociology|5": 1, | |
"harness|hendrycksTest-us_foreign_policy|5": 1, | |
"harness|hendrycksTest-virology|5": 1, | |
"harness|hendrycksTest-world_religions|5": 1, | |
"harness|truthfulqa:mc|0": 1, | |
"all": 0 | |
}, | |
"config_tasks": { | |
"harness|arc:challenge": "LM Harness task", | |
"harness|hellaswag": "LM Harness task", | |
"harness|hendrycksTest-abstract_algebra": "LM Harness task", | |
"harness|hendrycksTest-anatomy": "LM Harness task", | |
"harness|hendrycksTest-astronomy": "LM Harness task", | |
"harness|hendrycksTest-business_ethics": "LM Harness task", | |
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", | |
"harness|hendrycksTest-college_biology": "LM Harness task", | |
"harness|hendrycksTest-college_chemistry": "LM Harness task", | |
"harness|hendrycksTest-college_computer_science": "LM Harness task", | |
"harness|hendrycksTest-college_mathematics": "LM Harness task", | |
"harness|hendrycksTest-college_medicine": "LM Harness task", | |
"harness|hendrycksTest-college_physics": "LM Harness task", | |
"harness|hendrycksTest-computer_security": "LM Harness task", | |
"harness|hendrycksTest-conceptual_physics": "LM Harness task", | |
"harness|hendrycksTest-econometrics": "LM Harness task", | |
"harness|hendrycksTest-electrical_engineering": "LM Harness task", | |
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", | |
"harness|hendrycksTest-formal_logic": "LM Harness task", | |
"harness|hendrycksTest-global_facts": "LM Harness task", | |
"harness|hendrycksTest-high_school_biology": "LM Harness task", | |
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", | |
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", | |
"harness|hendrycksTest-high_school_european_history": "LM Harness task", | |
"harness|hendrycksTest-high_school_geography": "LM Harness task", | |
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", | |
"harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", | |
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", | |
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", | |
"harness|hendrycksTest-high_school_physics": "LM Harness task", | |
"harness|hendrycksTest-high_school_psychology": "LM Harness task", | |
"harness|hendrycksTest-high_school_statistics": "LM Harness task", | |
"harness|hendrycksTest-high_school_us_history": "LM Harness task", | |
"harness|hendrycksTest-high_school_world_history": "LM Harness task", | |
"harness|hendrycksTest-human_aging": "LM Harness task", | |
"harness|hendrycksTest-human_sexuality": "LM Harness task", | |
"harness|hendrycksTest-international_law": "LM Harness task", | |
"harness|hendrycksTest-jurisprudence": "LM Harness task", | |
"harness|hendrycksTest-logical_fallacies": "LM Harness task", | |
"harness|hendrycksTest-machine_learning": "LM Harness task", | |
"harness|hendrycksTest-management": "LM Harness task", | |
"harness|hendrycksTest-marketing": "LM Harness task", | |
"harness|hendrycksTest-medical_genetics": "LM Harness task", | |
"harness|hendrycksTest-miscellaneous": "LM Harness task", | |
"harness|hendrycksTest-moral_disputes": "LM Harness task", | |
"harness|hendrycksTest-moral_scenarios": "LM Harness task", | |
"harness|hendrycksTest-nutrition": "LM Harness task", | |
"harness|hendrycksTest-philosophy": "LM Harness task", | |
"harness|hendrycksTest-prehistory": "LM Harness task", | |
"harness|hendrycksTest-professional_accounting": "LM Harness task", | |
"harness|hendrycksTest-professional_law": "LM Harness task", | |
"harness|hendrycksTest-professional_medicine": "LM Harness task", | |
"harness|hendrycksTest-professional_psychology": "LM Harness task", | |
"harness|hendrycksTest-public_relations": "LM Harness task", | |
"harness|hendrycksTest-security_studies": "LM Harness task", | |
"harness|hendrycksTest-sociology": "LM Harness task", | |
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", | |
"harness|hendrycksTest-virology": "LM Harness task", | |
"harness|hendrycksTest-world_religions": "LM Harness task", | |
"harness|truthfulqa:mc": "LM Harness task" | |
}, | |
"summary_tasks": { | |
"harness|arc:challenge|25": { | |
"hashes": { | |
"hash_examples": "17b0cae357c0259e", | |
"hash_full_prompts": "045cbb916e5145c6", | |
"hash_input_tokens": "c991f8a5814f8d2f", | |
"hash_cont_tokens": "bc6e686b575268af" | |
}, | |
"truncated": 0, | |
"non-truncated": 4687, | |
"padded": 4687, | |
"non-padded": 0, | |
"effective_few_shots": 25.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hellaswag|10": { | |
"hashes": { | |
"hash_examples": "e1768ecb99d7ecf0", | |
"hash_full_prompts": "0b4c16983130f84f", | |
"hash_input_tokens": "9d221d28a199a09c", | |
"hash_cont_tokens": "e7e52367a92daa27" | |
}, | |
"truncated": 0, | |
"non-truncated": 40168, | |
"padded": 40052, | |
"non-padded": 116, | |
"effective_few_shots": 10.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-abstract_algebra|5": { | |
"hashes": { | |
"hash_examples": "280f9f325b40559a", | |
"hash_full_prompts": "2f776a367d23aea2", | |
"hash_input_tokens": "5afce491c120616a", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-anatomy|5": { | |
"hashes": { | |
"hash_examples": "2f83a4f1cab4ba18", | |
"hash_full_prompts": "516f74bef25df620", | |
"hash_input_tokens": "f59f8967e61fde18", | |
"hash_cont_tokens": "f9dae0f98ef7c0f2" | |
}, | |
"truncated": 0, | |
"non-truncated": 540, | |
"padded": 540, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-astronomy|5": { | |
"hashes": { | |
"hash_examples": "7d587b908da4d762", | |
"hash_full_prompts": "faf4e80f65de93ca", | |
"hash_input_tokens": "2efbd578c3185755", | |
"hash_cont_tokens": "dff84e206d2f1e0d" | |
}, | |
"truncated": 0, | |
"non-truncated": 608, | |
"padded": 608, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-business_ethics|5": { | |
"hashes": { | |
"hash_examples": "33e51740670de686", | |
"hash_full_prompts": "db01c3ef8e1479d4", | |
"hash_input_tokens": "e328ff0ca8fc7890", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-clinical_knowledge|5": { | |
"hashes": { | |
"hash_examples": "f3366dbe7eefffa4", | |
"hash_full_prompts": "49654f71d94b65c3", | |
"hash_input_tokens": "145fa357c13fe43c", | |
"hash_cont_tokens": "b81dd170f83789d1" | |
}, | |
"truncated": 0, | |
"non-truncated": 1060, | |
"padded": 1060, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-college_biology|5": { | |
"hashes": { | |
"hash_examples": "ca2b6753a0193e7f", | |
"hash_full_prompts": "2b460b75f1fdfefd", | |
"hash_input_tokens": "888579887c9a665a", | |
"hash_cont_tokens": "85c3400292af3bb8" | |
}, | |
"truncated": 0, | |
"non-truncated": 576, | |
"padded": 569, | |
"non-padded": 7, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-college_chemistry|5": { | |
"hashes": { | |
"hash_examples": "22ff85f1d34f42d1", | |
"hash_full_prompts": "242c9be6da583e95", | |
"hash_input_tokens": "e2ca7bc279c63b09", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-college_computer_science|5": { | |
"hashes": { | |
"hash_examples": "30318289d717a5cf", | |
"hash_full_prompts": "ed2bdb4e87c4b371", | |
"hash_input_tokens": "1671195b9f861e25", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-college_mathematics|5": { | |
"hashes": { | |
"hash_examples": "4944d1f0b6b5d911", | |
"hash_full_prompts": "770bc4281c973190", | |
"hash_input_tokens": "47ed680b9caddd90", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-college_medicine|5": { | |
"hashes": { | |
"hash_examples": "dd69cc33381275af", | |
"hash_full_prompts": "ad2a53e5250ab46e", | |
"hash_input_tokens": "f71f719c1032180b", | |
"hash_cont_tokens": "e5cb48f872b79ee7" | |
}, | |
"truncated": 0, | |
"non-truncated": 692, | |
"padded": 692, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-college_physics|5": { | |
"hashes": { | |
"hash_examples": "875dd26d22655b0d", | |
"hash_full_prompts": "833a0d7b55aed500", | |
"hash_input_tokens": "5bde7875f9f1d5dd", | |
"hash_cont_tokens": "40862171591ad909" | |
}, | |
"truncated": 0, | |
"non-truncated": 408, | |
"padded": 408, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-computer_security|5": { | |
"hashes": { | |
"hash_examples": "006451eedc0ededb", | |
"hash_full_prompts": "94034c97e85d8f46", | |
"hash_input_tokens": "6de5a5feab854eed", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-conceptual_physics|5": { | |
"hashes": { | |
"hash_examples": "8874ece872d2ca4c", | |
"hash_full_prompts": "e40d15a34640d6fa", | |
"hash_input_tokens": "9a95e6bc66294b33", | |
"hash_cont_tokens": "36bb2a47e8ff1bd8" | |
}, | |
"truncated": 0, | |
"non-truncated": 940, | |
"padded": 940, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-econometrics|5": { | |
"hashes": { | |
"hash_examples": "64d3623b0bfaa43f", | |
"hash_full_prompts": "612f340fae41338d", | |
"hash_input_tokens": "b581c488a50d149d", | |
"hash_cont_tokens": "433685e9aa542c2d" | |
}, | |
"truncated": 0, | |
"non-truncated": 456, | |
"padded": 456, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-electrical_engineering|5": { | |
"hashes": { | |
"hash_examples": "e98f51780c674d7e", | |
"hash_full_prompts": "10275b312d812ae6", | |
"hash_input_tokens": "0afd8e37a73e499b", | |
"hash_cont_tokens": "f086b291b3aa0628" | |
}, | |
"truncated": 0, | |
"non-truncated": 580, | |
"padded": 560, | |
"non-padded": 20, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-elementary_mathematics|5": { | |
"hashes": { | |
"hash_examples": "fc48208a5ac1c0ce", | |
"hash_full_prompts": "5ec274c6c82aca23", | |
"hash_input_tokens": "6300cbea203e27e1", | |
"hash_cont_tokens": "4f402da407619e4d" | |
}, | |
"truncated": 0, | |
"non-truncated": 1512, | |
"padded": 1512, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-formal_logic|5": { | |
"hashes": { | |
"hash_examples": "5a6525665f63ea72", | |
"hash_full_prompts": "07b92638c4a6b500", | |
"hash_input_tokens": "dff3e10f0162548b", | |
"hash_cont_tokens": "80d8e3e54d900608" | |
}, | |
"truncated": 0, | |
"non-truncated": 504, | |
"padded": 504, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-global_facts|5": { | |
"hashes": { | |
"hash_examples": "371d70d743b2b89b", | |
"hash_full_prompts": "332fdee50a1921b4", | |
"hash_input_tokens": "133115320d06c025", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_biology|5": { | |
"hashes": { | |
"hash_examples": "a79e1018b1674052", | |
"hash_full_prompts": "e624e26ede922561", | |
"hash_input_tokens": "b9c0577c9c2daf4b", | |
"hash_cont_tokens": "e07819899bd63630" | |
}, | |
"truncated": 0, | |
"non-truncated": 1240, | |
"padded": 1240, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_chemistry|5": { | |
"hashes": { | |
"hash_examples": "44bfc25c389f0e03", | |
"hash_full_prompts": "0e3e5f5d9246482a", | |
"hash_input_tokens": "154d573ba30378ad", | |
"hash_cont_tokens": "eb6259a94d61e372" | |
}, | |
"truncated": 0, | |
"non-truncated": 812, | |
"padded": 812, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_computer_science|5": { | |
"hashes": { | |
"hash_examples": "8b8cdb1084f24169", | |
"hash_full_prompts": "c00487e67c1813cc", | |
"hash_input_tokens": "91754fb26290a162", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_european_history|5": { | |
"hashes": { | |
"hash_examples": "11cd32d0ef440171", | |
"hash_full_prompts": "318f4513c537c6bf", | |
"hash_input_tokens": "2e32e47bd2233827", | |
"hash_cont_tokens": "c3336566c025bc59" | |
}, | |
"truncated": 0, | |
"non-truncated": 660, | |
"padded": 656, | |
"non-padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_geography|5": { | |
"hashes": { | |
"hash_examples": "b60019b9e80b642f", | |
"hash_full_prompts": "ee5789fcc1a81b1e", | |
"hash_input_tokens": "23f9a0b07be2ba2e", | |
"hash_cont_tokens": "999a32d098465441" | |
}, | |
"truncated": 0, | |
"non-truncated": 792, | |
"padded": 792, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_government_and_politics|5": { | |
"hashes": { | |
"hash_examples": "d221ec983d143dc3", | |
"hash_full_prompts": "ac42d888e1ce1155", | |
"hash_input_tokens": "de99699b0f5b162d", | |
"hash_cont_tokens": "361410848e01f8ed" | |
}, | |
"truncated": 0, | |
"non-truncated": 772, | |
"padded": 772, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_macroeconomics|5": { | |
"hashes": { | |
"hash_examples": "59c2915cacfd3fbb", | |
"hash_full_prompts": "c6bd9d25158abd0e", | |
"hash_input_tokens": "c96ba9fc2d1deb87", | |
"hash_cont_tokens": "18f9ae57b2444806" | |
}, | |
"truncated": 0, | |
"non-truncated": 1560, | |
"padded": 1560, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_mathematics|5": { | |
"hashes": { | |
"hash_examples": "1f8ac897608de342", | |
"hash_full_prompts": "5d88f41fc2d643a8", | |
"hash_input_tokens": "00509312373e95f1", | |
"hash_cont_tokens": "a13496e646060699" | |
}, | |
"truncated": 0, | |
"non-truncated": 1080, | |
"padded": 1080, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_microeconomics|5": { | |
"hashes": { | |
"hash_examples": "ead6a0f2f6c83370", | |
"hash_full_prompts": "bfc393381298609e", | |
"hash_input_tokens": "56e5bf80535561ec", | |
"hash_cont_tokens": "791a7a25f0571e59" | |
}, | |
"truncated": 0, | |
"non-truncated": 952, | |
"padded": 952, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_physics|5": { | |
"hashes": { | |
"hash_examples": "c3f2025990afec64", | |
"hash_full_prompts": "fc78b4997e436734", | |
"hash_input_tokens": "c9b689b4034de87c", | |
"hash_cont_tokens": "9677b0687811cf73" | |
}, | |
"truncated": 0, | |
"non-truncated": 604, | |
"padded": 604, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_psychology|5": { | |
"hashes": { | |
"hash_examples": "21f8aab618f6d636", | |
"hash_full_prompts": "d5c76aa40b9dbc43", | |
"hash_input_tokens": "ccecbb5539c34c08", | |
"hash_cont_tokens": "6393201d9136920e" | |
}, | |
"truncated": 0, | |
"non-truncated": 2180, | |
"padded": 2180, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_statistics|5": { | |
"hashes": { | |
"hash_examples": "2386a60a11fc5de3", | |
"hash_full_prompts": "4c5c8be5aafac432", | |
"hash_input_tokens": "3f75abf85d2b9fe9", | |
"hash_cont_tokens": "17caccbb3a38c7bf" | |
}, | |
"truncated": 0, | |
"non-truncated": 864, | |
"padded": 864, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_us_history|5": { | |
"hashes": { | |
"hash_examples": "74961543be40f04f", | |
"hash_full_prompts": "5d5ca4840131ba21", | |
"hash_input_tokens": "f52124b61354d42e", | |
"hash_cont_tokens": "7128e2eeb930d3b3" | |
}, | |
"truncated": 0, | |
"non-truncated": 816, | |
"padded": 816, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-high_school_world_history|5": { | |
"hashes": { | |
"hash_examples": "2ad2f6b7198b2234", | |
"hash_full_prompts": "11845057459afd72", | |
"hash_input_tokens": "b5b75910265dc2ff", | |
"hash_cont_tokens": "48e22ae63ee54721" | |
}, | |
"truncated": 0, | |
"non-truncated": 948, | |
"padded": 948, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-human_aging|5": { | |
"hashes": { | |
"hash_examples": "1a7199dc733e779b", | |
"hash_full_prompts": "756b9096b8eaf892", | |
"hash_input_tokens": "a26fe13fa58cbbed", | |
"hash_cont_tokens": "0f40704815d5b3f6" | |
}, | |
"truncated": 0, | |
"non-truncated": 892, | |
"padded": 892, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-human_sexuality|5": { | |
"hashes": { | |
"hash_examples": "7acb8fdad97f88a6", | |
"hash_full_prompts": "731a52ff15b8cfdb", | |
"hash_input_tokens": "ad768773a7782c0c", | |
"hash_cont_tokens": "a9fdf5917bdddc9b" | |
}, | |
"truncated": 0, | |
"non-truncated": 524, | |
"padded": 524, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-international_law|5": { | |
"hashes": { | |
"hash_examples": "1300bfd0dfc59114", | |
"hash_full_prompts": "db2aefbff5eec996", | |
"hash_input_tokens": "5e16e7eb92789a03", | |
"hash_cont_tokens": "c63e45a81fbe97b2" | |
}, | |
"truncated": 0, | |
"non-truncated": 484, | |
"padded": 484, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-jurisprudence|5": { | |
"hashes": { | |
"hash_examples": "083b1e4904c48dc2", | |
"hash_full_prompts": "0f89ee3fe03d6a21", | |
"hash_input_tokens": "6346e2bce86e76fe", | |
"hash_cont_tokens": "9df89edb95ea3c08" | |
}, | |
"truncated": 0, | |
"non-truncated": 432, | |
"padded": 432, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-logical_fallacies|5": { | |
"hashes": { | |
"hash_examples": "709128f9926a634c", | |
"hash_full_prompts": "98a04b1f8f841069", | |
"hash_input_tokens": "76581e704996be9d", | |
"hash_cont_tokens": "5b4f21454680a984" | |
}, | |
"truncated": 0, | |
"non-truncated": 652, | |
"padded": 648, | |
"non-padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-machine_learning|5": { | |
"hashes": { | |
"hash_examples": "88f22a636029ae47", | |
"hash_full_prompts": "2e1c8d4b1e0cc921", | |
"hash_input_tokens": "0425f5feb26f8c3f", | |
"hash_cont_tokens": "0c2fc7f9e9101fbb" | |
}, | |
"truncated": 0, | |
"non-truncated": 448, | |
"padded": 448, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-management|5": { | |
"hashes": { | |
"hash_examples": "8c8a1e07a2151dca", | |
"hash_full_prompts": "f51611f514b265b0", | |
"hash_input_tokens": "fb4ebd06a3a58fd2", | |
"hash_cont_tokens": "1279a23b3bc7b32c" | |
}, | |
"truncated": 0, | |
"non-truncated": 412, | |
"padded": 412, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-marketing|5": { | |
"hashes": { | |
"hash_examples": "2668953431f91e96", | |
"hash_full_prompts": "77562bef997c7650", | |
"hash_input_tokens": "33ff7687ba4867f3", | |
"hash_cont_tokens": "be76778b3b861344" | |
}, | |
"truncated": 0, | |
"non-truncated": 936, | |
"padded": 936, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-medical_genetics|5": { | |
"hashes": { | |
"hash_examples": "9c2dda34a2ea4fd2", | |
"hash_full_prompts": "202139046daa118f", | |
"hash_input_tokens": "e6e5c037eb26a498", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-miscellaneous|5": { | |
"hashes": { | |
"hash_examples": "41adb694024809c2", | |
"hash_full_prompts": "bffec9fc237bcf93", | |
"hash_input_tokens": "6db91a99fee03712", | |
"hash_cont_tokens": "c61a0f86b50f0556" | |
}, | |
"truncated": 0, | |
"non-truncated": 3132, | |
"padded": 3132, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-moral_disputes|5": { | |
"hashes": { | |
"hash_examples": "3171c13ba3c594c4", | |
"hash_full_prompts": "170831fc36f1d59e", | |
"hash_input_tokens": "11bf0e6ef564edfb", | |
"hash_cont_tokens": "a208a34c74088f6c" | |
}, | |
"truncated": 0, | |
"non-truncated": 1384, | |
"padded": 1380, | |
"non-padded": 4, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-moral_scenarios|5": { | |
"hashes": { | |
"hash_examples": "9873e077e83e0546", | |
"hash_full_prompts": "08f4ceba3131a068", | |
"hash_input_tokens": "42af46cc9aa77d99", | |
"hash_cont_tokens": "996ce7a5b6c4aef1" | |
}, | |
"truncated": 0, | |
"non-truncated": 3580, | |
"padded": 3580, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-nutrition|5": { | |
"hashes": { | |
"hash_examples": "7db1d8142ec14323", | |
"hash_full_prompts": "4c0e68e3586cb453", | |
"hash_input_tokens": "484b72f626c82f6b", | |
"hash_cont_tokens": "9d4280b06a73f2ad" | |
}, | |
"truncated": 0, | |
"non-truncated": 1224, | |
"padded": 1224, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-philosophy|5": { | |
"hashes": { | |
"hash_examples": "9b455b7d72811cc8", | |
"hash_full_prompts": "e467f822d8a0d3ff", | |
"hash_input_tokens": "d3b8f0fb55346a71", | |
"hash_cont_tokens": "9a708d21688a0b16" | |
}, | |
"truncated": 0, | |
"non-truncated": 1244, | |
"padded": 1244, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-prehistory|5": { | |
"hashes": { | |
"hash_examples": "8be90d0f538f1560", | |
"hash_full_prompts": "152187949bcd0921", | |
"hash_input_tokens": "f8bbc40534f54a72", | |
"hash_cont_tokens": "ed0ff6b6c4caf978" | |
}, | |
"truncated": 0, | |
"non-truncated": 1296, | |
"padded": 1296, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-professional_accounting|5": { | |
"hashes": { | |
"hash_examples": "8d377597916cd07e", | |
"hash_full_prompts": "0eb7345d6144ee0d", | |
"hash_input_tokens": "53f58b4e8af11f6e", | |
"hash_cont_tokens": "4fd1a023ef90b43a" | |
}, | |
"truncated": 0, | |
"non-truncated": 1128, | |
"padded": 1127, | |
"non-padded": 1, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-professional_law|5": { | |
"hashes": { | |
"hash_examples": "cd9dbc52b3c932d6", | |
"hash_full_prompts": "36ac764272bfb182", | |
"hash_input_tokens": "a95688e641cf31f1", | |
"hash_cont_tokens": "d2c1c75d7c0e6ec5" | |
}, | |
"truncated": 0, | |
"non-truncated": 6136, | |
"padded": 6136, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-professional_medicine|5": { | |
"hashes": { | |
"hash_examples": "b20e4e816c1e383e", | |
"hash_full_prompts": "7b8d69ea2acaf2f7", | |
"hash_input_tokens": "fc49c75113daa07a", | |
"hash_cont_tokens": "ff4c3ef8a56efe40" | |
}, | |
"truncated": 0, | |
"non-truncated": 1088, | |
"padded": 1088, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-professional_psychology|5": { | |
"hashes": { | |
"hash_examples": "d45b73b22f9cc039", | |
"hash_full_prompts": "fe8937e9ffc99771", | |
"hash_input_tokens": "92d9588dfc6ac3f9", | |
"hash_cont_tokens": "b4566ef91a66db7d" | |
}, | |
"truncated": 0, | |
"non-truncated": 2448, | |
"padded": 2448, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-public_relations|5": { | |
"hashes": { | |
"hash_examples": "0d25072e1761652a", | |
"hash_full_prompts": "f9adc39cfa9f42ba", | |
"hash_input_tokens": "6a093aeebb63f500", | |
"hash_cont_tokens": "b713ae56c89df822" | |
}, | |
"truncated": 0, | |
"non-truncated": 440, | |
"padded": 440, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-security_studies|5": { | |
"hashes": { | |
"hash_examples": "62bb8197e63d60d4", | |
"hash_full_prompts": "869c9c3ae196b7c3", | |
"hash_input_tokens": "2fef5cbd88ee376f", | |
"hash_cont_tokens": "89baef8c4b642ed0" | |
}, | |
"truncated": 0, | |
"non-truncated": 980, | |
"padded": 980, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-sociology|5": { | |
"hashes": { | |
"hash_examples": "e7959df87dea8672", | |
"hash_full_prompts": "1a1fc00e17b3a52a", | |
"hash_input_tokens": "35a984bdddcb71dc", | |
"hash_cont_tokens": "b92ed9d8dde61395" | |
}, | |
"truncated": 0, | |
"non-truncated": 804, | |
"padded": 796, | |
"non-padded": 8, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-us_foreign_policy|5": { | |
"hashes": { | |
"hash_examples": "4a56a01ddca44dca", | |
"hash_full_prompts": "0c7a7081c71c07b6", | |
"hash_input_tokens": "64e26afac44fd84d", | |
"hash_cont_tokens": "bc75e4dffef3dc0e" | |
}, | |
"truncated": 0, | |
"non-truncated": 400, | |
"padded": 400, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-virology|5": { | |
"hashes": { | |
"hash_examples": "451cc86a8c4f4fe9", | |
"hash_full_prompts": "01e95325d8b738e4", | |
"hash_input_tokens": "3bce3760b179a55c", | |
"hash_cont_tokens": "1c1bf88d7c979ef5" | |
}, | |
"truncated": 0, | |
"non-truncated": 664, | |
"padded": 664, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|hendrycksTest-world_religions|5": { | |
"hashes": { | |
"hash_examples": "3b29cfaf1a81c379", | |
"hash_full_prompts": "e0d79a15083dfdff", | |
"hash_input_tokens": "6554c1be40513fa9", | |
"hash_cont_tokens": "9fbfaba067301be2" | |
}, | |
"truncated": 0, | |
"non-truncated": 684, | |
"padded": 684, | |
"non-padded": 0, | |
"effective_few_shots": 5.0, | |
"num_truncated_few_shots": 0 | |
}, | |
"harness|truthfulqa:mc|0": { | |
"hashes": { | |
"hash_examples": "23176c0531c7b867", | |
"hash_full_prompts": "36a6d90e75d92d4a", | |
"hash_input_tokens": "c1ed17b2cce8daea", | |
"hash_cont_tokens": "ad4c4cfcbb927635" | |
}, | |
"truncated": 0, | |
"non-truncated": 9996, | |
"padded": 9996, | |
"non-padded": 0, | |
"effective_few_shots": 0.0, | |
"num_truncated_few_shots": 0 | |
} | |
}, | |
"summary_general": { | |
"hashes": { | |
"hash_examples": "d84d18e9a963753d", | |
"hash_full_prompts": "12b540783521a8e6", | |
"hash_input_tokens": "41fb26e769733d20", | |
"hash_cont_tokens": "d6b023af5cbcb9cf" | |
}, | |
"total_evaluation_time_secondes": "6007.52667593956", | |
"truncated": 0, | |
"non-truncated": 111019, | |
"padded": 110855, | |
"non-padded": 164, | |
"num_truncated_few_shots": 0 | |
} | |
} |