DreamGenX's picture
Upload folder using huggingface_hub
bb996d2 verified
raw
history blame contribute delete
No virus
93.2 kB
{
"config_general": {
"lighteval_sha": "a98210fd3a2d1e8bface1c32b72ebd5017173a4c",
"num_fewshot_seeds": 1,
"override_batch_size": -1,
"max_samples": null,
"job_id": "",
"start_time": 2236465.645254106,
"end_time": 2260133.586933212,
"total_evaluation_time_secondes": "23667.941679106094",
"model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
"model_sha": "c4a54320a52ed5f88b7a2f84496903ea4ff07b45",
"model_dtype": "torch.bfloat16",
"model_size": "14.96 GB",
"config": null
},
"results": {
"leaderboard|arc:challenge|25": {
"acc": 0.5742320819112628,
"acc_stderr": 0.01444946427886881,
"acc_norm": 0.5827645051194539,
"acc_norm_stderr": 0.014409825518403082
},
"leaderboard|hellaswag|10": {
"acc": 0.5707030472017527,
"acc_stderr": 0.004939642460172585,
"acc_norm": 0.7310296753634734,
"acc_norm_stderr": 0.004425182676353211
},
"leaderboard|mmlu:abstract_algebra|5": {
"acc": 0.33,
"acc_stderr": 0.047258156262526045
},
"leaderboard|mmlu:anatomy|5": {
"acc": 0.6814814814814815,
"acc_stderr": 0.040247784019771096
},
"leaderboard|mmlu:astronomy|5": {
"acc": 0.75,
"acc_stderr": 0.03523807393012047
},
"leaderboard|mmlu:business_ethics|5": {
"acc": 0.7,
"acc_stderr": 0.046056618647183814
},
"leaderboard|mmlu:clinical_knowledge|5": {
"acc": 0.7471698113207547,
"acc_stderr": 0.026749899771241214
},
"leaderboard|mmlu:college_biology|5": {
"acc": 0.7916666666666666,
"acc_stderr": 0.033961162058453336
},
"leaderboard|mmlu:college_chemistry|5": {
"acc": 0.45,
"acc_stderr": 0.05
},
"leaderboard|mmlu:college_computer_science|5": {
"acc": 0.52,
"acc_stderr": 0.050211673156867795
},
"leaderboard|mmlu:college_mathematics|5": {
"acc": 0.39,
"acc_stderr": 0.04902071300001975
},
"leaderboard|mmlu:college_medicine|5": {
"acc": 0.6589595375722543,
"acc_stderr": 0.036146654241808254
},
"leaderboard|mmlu:college_physics|5": {
"acc": 0.43137254901960786,
"acc_stderr": 0.04928099597287533
},
"leaderboard|mmlu:computer_security|5": {
"acc": 0.79,
"acc_stderr": 0.04093601807403326
},
"leaderboard|mmlu:conceptual_physics|5": {
"acc": 0.5872340425531914,
"acc_stderr": 0.03218471141400351
},
"leaderboard|mmlu:econometrics|5": {
"acc": 0.543859649122807,
"acc_stderr": 0.046854730419077895
},
"leaderboard|mmlu:electrical_engineering|5": {
"acc": 0.6137931034482759,
"acc_stderr": 0.04057324734419035
},
"leaderboard|mmlu:elementary_mathematics|5": {
"acc": 0.46825396825396826,
"acc_stderr": 0.0256993528321318
},
"leaderboard|mmlu:formal_logic|5": {
"acc": 0.5317460317460317,
"acc_stderr": 0.04463112720677172
},
"leaderboard|mmlu:global_facts|5": {
"acc": 0.46,
"acc_stderr": 0.05009082659620332
},
"leaderboard|mmlu:high_school_biology|5": {
"acc": 0.8064516129032258,
"acc_stderr": 0.022475258525536057
},
"leaderboard|mmlu:high_school_chemistry|5": {
"acc": 0.541871921182266,
"acc_stderr": 0.03505630140785741
},
"leaderboard|mmlu:high_school_computer_science|5": {
"acc": 0.68,
"acc_stderr": 0.04688261722621505
},
"leaderboard|mmlu:high_school_european_history|5": {
"acc": 0.7393939393939394,
"acc_stderr": 0.034277431758165236
},
"leaderboard|mmlu:high_school_geography|5": {
"acc": 0.8131313131313131,
"acc_stderr": 0.027772533334218957
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"acc": 0.8963730569948186,
"acc_stderr": 0.02199531196364424
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"acc": 0.676923076923077,
"acc_stderr": 0.023710888501970555
},
"leaderboard|mmlu:high_school_mathematics|5": {
"acc": 0.32592592592592595,
"acc_stderr": 0.028578348365473072
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"acc": 0.7563025210084033,
"acc_stderr": 0.027886828078380548
},
"leaderboard|mmlu:high_school_physics|5": {
"acc": 0.4105960264900662,
"acc_stderr": 0.04016689594849927
},
"leaderboard|mmlu:high_school_psychology|5": {
"acc": 0.8477064220183487,
"acc_stderr": 0.015405084393157074
},
"leaderboard|mmlu:high_school_statistics|5": {
"acc": 0.47685185185185186,
"acc_stderr": 0.03406315360711507
},
"leaderboard|mmlu:high_school_us_history|5": {
"acc": 0.7892156862745098,
"acc_stderr": 0.028626547912437406
},
"leaderboard|mmlu:high_school_world_history|5": {
"acc": 0.8396624472573839,
"acc_stderr": 0.023884380925965665
},
"leaderboard|mmlu:human_aging|5": {
"acc": 0.726457399103139,
"acc_stderr": 0.029918586707798827
},
"leaderboard|mmlu:human_sexuality|5": {
"acc": 0.7938931297709924,
"acc_stderr": 0.03547771004159462
},
"leaderboard|mmlu:international_law|5": {
"acc": 0.768595041322314,
"acc_stderr": 0.03849856098794088
},
"leaderboard|mmlu:jurisprudence|5": {
"acc": 0.7592592592592593,
"acc_stderr": 0.04133119440243839
},
"leaderboard|mmlu:logical_fallacies|5": {
"acc": 0.7607361963190185,
"acc_stderr": 0.033519538795212696
},
"leaderboard|mmlu:machine_learning|5": {
"acc": 0.5267857142857143,
"acc_stderr": 0.047389751192741546
},
"leaderboard|mmlu:management|5": {
"acc": 0.8155339805825242,
"acc_stderr": 0.03840423627288276
},
"leaderboard|mmlu:marketing|5": {
"acc": 0.905982905982906,
"acc_stderr": 0.019119892798924974
},
"leaderboard|mmlu:medical_genetics|5": {
"acc": 0.79,
"acc_stderr": 0.040936018074033256
},
"leaderboard|mmlu:miscellaneous|5": {
"acc": 0.8237547892720306,
"acc_stderr": 0.013625556907993455
},
"leaderboard|mmlu:moral_disputes|5": {
"acc": 0.7398843930635838,
"acc_stderr": 0.023618678310069356
},
"leaderboard|mmlu:moral_scenarios|5": {
"acc": 0.43575418994413406,
"acc_stderr": 0.016583881958602387
},
"leaderboard|mmlu:nutrition|5": {
"acc": 0.7549019607843137,
"acc_stderr": 0.024630048979824785
},
"leaderboard|mmlu:philosophy|5": {
"acc": 0.7331189710610932,
"acc_stderr": 0.025122637608816657
},
"leaderboard|mmlu:prehistory|5": {
"acc": 0.7469135802469136,
"acc_stderr": 0.024191808600713002
},
"leaderboard|mmlu:professional_accounting|5": {
"acc": 0.5177304964539007,
"acc_stderr": 0.02980873964223777
},
"leaderboard|mmlu:professional_law|5": {
"acc": 0.46479791395045633,
"acc_stderr": 0.012738547371303956
},
"leaderboard|mmlu:professional_medicine|5": {
"acc": 0.7279411764705882,
"acc_stderr": 0.027033041151681456
},
"leaderboard|mmlu:professional_psychology|5": {
"acc": 0.6928104575163399,
"acc_stderr": 0.018663359671463677
},
"leaderboard|mmlu:public_relations|5": {
"acc": 0.6636363636363637,
"acc_stderr": 0.04525393596302505
},
"leaderboard|mmlu:security_studies|5": {
"acc": 0.7306122448979592,
"acc_stderr": 0.02840125202902294
},
"leaderboard|mmlu:sociology|5": {
"acc": 0.8557213930348259,
"acc_stderr": 0.02484575321230604
},
"leaderboard|mmlu:us_foreign_policy|5": {
"acc": 0.86,
"acc_stderr": 0.03487350880197769
},
"leaderboard|mmlu:virology|5": {
"acc": 0.536144578313253,
"acc_stderr": 0.03882310850890594
},
"leaderboard|mmlu:world_religions|5": {
"acc": 0.7953216374269005,
"acc_stderr": 0.030944459778533193
},
"leaderboard|truthfulqa:mc|0": {
"truthfulqa_mc1": 0.37454100367197063,
"truthfulqa_mc1_stderr": 0.016943535128405338,
"truthfulqa_mc2": 0.5337684444397199,
"truthfulqa_mc2_stderr": 0.015971485281891525
},
"leaderboard|winogrande|5": {
"acc": 0.6929755327545383,
"acc_stderr": 0.012963688616969483
},
"leaderboard|gsm8k|5": {
"qem": 0.6808188021228203,
"qem_stderr": 0.012840345676251653
},
"leaderboard|mmlu:_average|5": {
"acc": 0.6661794809691,
"acc_stderr": 0.033327669029227354
},
"all": {
"acc": 0.6635023512851042,
"acc_stderr": 0.032200498833699506,
"acc_norm": 0.6568970902414637,
"acc_norm_stderr": 0.009417504097378147,
"truthfulqa_mc1": 0.37454100367197063,
"truthfulqa_mc1_stderr": 0.016943535128405338,
"truthfulqa_mc2": 0.5337684444397199,
"truthfulqa_mc2_stderr": 0.015971485281891525,
"qem": 0.6808188021228203,
"qem_stderr": 0.012840345676251653
}
},
"versions": {
"leaderboard|arc:challenge|25": 0,
"leaderboard|gsm8k|5": 0,
"leaderboard|hellaswag|10": 0,
"leaderboard|mmlu:abstract_algebra|5": 0,
"leaderboard|mmlu:anatomy|5": 0,
"leaderboard|mmlu:astronomy|5": 0,
"leaderboard|mmlu:business_ethics|5": 0,
"leaderboard|mmlu:clinical_knowledge|5": 0,
"leaderboard|mmlu:college_biology|5": 0,
"leaderboard|mmlu:college_chemistry|5": 0,
"leaderboard|mmlu:college_computer_science|5": 0,
"leaderboard|mmlu:college_mathematics|5": 0,
"leaderboard|mmlu:college_medicine|5": 0,
"leaderboard|mmlu:college_physics|5": 0,
"leaderboard|mmlu:computer_security|5": 0,
"leaderboard|mmlu:conceptual_physics|5": 0,
"leaderboard|mmlu:econometrics|5": 0,
"leaderboard|mmlu:electrical_engineering|5": 0,
"leaderboard|mmlu:elementary_mathematics|5": 0,
"leaderboard|mmlu:formal_logic|5": 0,
"leaderboard|mmlu:global_facts|5": 0,
"leaderboard|mmlu:high_school_biology|5": 0,
"leaderboard|mmlu:high_school_chemistry|5": 0,
"leaderboard|mmlu:high_school_computer_science|5": 0,
"leaderboard|mmlu:high_school_european_history|5": 0,
"leaderboard|mmlu:high_school_geography|5": 0,
"leaderboard|mmlu:high_school_government_and_politics|5": 0,
"leaderboard|mmlu:high_school_macroeconomics|5": 0,
"leaderboard|mmlu:high_school_mathematics|5": 0,
"leaderboard|mmlu:high_school_microeconomics|5": 0,
"leaderboard|mmlu:high_school_physics|5": 0,
"leaderboard|mmlu:high_school_psychology|5": 0,
"leaderboard|mmlu:high_school_statistics|5": 0,
"leaderboard|mmlu:high_school_us_history|5": 0,
"leaderboard|mmlu:high_school_world_history|5": 0,
"leaderboard|mmlu:human_aging|5": 0,
"leaderboard|mmlu:human_sexuality|5": 0,
"leaderboard|mmlu:international_law|5": 0,
"leaderboard|mmlu:jurisprudence|5": 0,
"leaderboard|mmlu:logical_fallacies|5": 0,
"leaderboard|mmlu:machine_learning|5": 0,
"leaderboard|mmlu:management|5": 0,
"leaderboard|mmlu:marketing|5": 0,
"leaderboard|mmlu:medical_genetics|5": 0,
"leaderboard|mmlu:miscellaneous|5": 0,
"leaderboard|mmlu:moral_disputes|5": 0,
"leaderboard|mmlu:moral_scenarios|5": 0,
"leaderboard|mmlu:nutrition|5": 0,
"leaderboard|mmlu:philosophy|5": 0,
"leaderboard|mmlu:prehistory|5": 0,
"leaderboard|mmlu:professional_accounting|5": 0,
"leaderboard|mmlu:professional_law|5": 0,
"leaderboard|mmlu:professional_medicine|5": 0,
"leaderboard|mmlu:professional_psychology|5": 0,
"leaderboard|mmlu:public_relations|5": 0,
"leaderboard|mmlu:security_studies|5": 0,
"leaderboard|mmlu:sociology|5": 0,
"leaderboard|mmlu:us_foreign_policy|5": 0,
"leaderboard|mmlu:virology|5": 0,
"leaderboard|mmlu:world_religions|5": 0,
"leaderboard|truthfulqa:mc|0": 0,
"leaderboard|winogrande|5": 0
},
"config_tasks": {
"leaderboard|arc:challenge": {
"name": "arc:challenge",
"prompt_function": "arc",
"hf_repo": "ai2_arc",
"hf_subset": "ARC-Challenge",
"metric": [
"loglikelihood_acc",
"loglikelihood_acc_norm_nospace"
],
"hf_avail_splits": [
"train",
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": "random_sampling_from_train",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"arc"
],
"original_num_docs": 1172,
"effective_num_docs": 1172,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|gsm8k": {
"name": "gsm8k",
"prompt_function": "gsm8k",
"hf_repo": "gsm8k",
"hf_subset": "main",
"metric": [
"quasi_exact_match_gsm8k"
],
"hf_avail_splits": [
"train",
"test"
],
"evaluation_splits": [
"test"
],
"few_shots_split": null,
"few_shots_select": "random_sampling_from_train",
"generation_size": 256,
"stop_sequence": [
"Question:",
"Question",
":"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard"
],
"original_num_docs": 1319,
"effective_num_docs": 1319,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|hellaswag": {
"name": "hellaswag",
"prompt_function": "hellaswag_harness",
"hf_repo": "hellaswag",
"hf_subset": "default",
"metric": [
"loglikelihood_acc",
"loglikelihood_acc_norm"
],
"hf_avail_splits": [
"train",
"test",
"validation"
],
"evaluation_splits": [
"validation"
],
"few_shots_split": null,
"few_shots_select": "random_sampling_from_train",
"generation_size": -1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard"
],
"original_num_docs": 10042,
"effective_num_docs": 10042,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:abstract_algebra": {
"name": "mmlu:abstract_algebra",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "abstract_algebra",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:anatomy": {
"name": "mmlu:anatomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "anatomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 135,
"effective_num_docs": 135,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:astronomy": {
"name": "mmlu:astronomy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "astronomy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 152,
"effective_num_docs": 152,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:business_ethics": {
"name": "mmlu:business_ethics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "business_ethics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:clinical_knowledge": {
"name": "mmlu:clinical_knowledge",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "clinical_knowledge",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 265,
"effective_num_docs": 265,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:college_biology": {
"name": "mmlu:college_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 144,
"effective_num_docs": 144,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:college_chemistry": {
"name": "mmlu:college_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:college_computer_science": {
"name": "mmlu:college_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:college_mathematics": {
"name": "mmlu:college_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:college_medicine": {
"name": "mmlu:college_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 173,
"effective_num_docs": 173,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:college_physics": {
"name": "mmlu:college_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "college_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 102,
"effective_num_docs": 102,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:computer_security": {
"name": "mmlu:computer_security",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "computer_security",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:conceptual_physics": {
"name": "mmlu:conceptual_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "conceptual_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 235,
"effective_num_docs": 235,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:econometrics": {
"name": "mmlu:econometrics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "econometrics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 114,
"effective_num_docs": 114,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:electrical_engineering": {
"name": "mmlu:electrical_engineering",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "electrical_engineering",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 145,
"effective_num_docs": 145,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:elementary_mathematics": {
"name": "mmlu:elementary_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "elementary_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 378,
"effective_num_docs": 378,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:formal_logic": {
"name": "mmlu:formal_logic",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "formal_logic",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 126,
"effective_num_docs": 126,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:global_facts": {
"name": "mmlu:global_facts",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "global_facts",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_biology": {
"name": "mmlu:high_school_biology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_biology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 310,
"effective_num_docs": 310,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_chemistry": {
"name": "mmlu:high_school_chemistry",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_chemistry",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 203,
"effective_num_docs": 203,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_computer_science": {
"name": "mmlu:high_school_computer_science",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_computer_science",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_european_history": {
"name": "mmlu:high_school_european_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_european_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 165,
"effective_num_docs": 165,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_geography": {
"name": "mmlu:high_school_geography",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_geography",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 198,
"effective_num_docs": 198,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_government_and_politics": {
"name": "mmlu:high_school_government_and_politics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_government_and_politics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 193,
"effective_num_docs": 193,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_macroeconomics": {
"name": "mmlu:high_school_macroeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_macroeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 390,
"effective_num_docs": 390,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_mathematics": {
"name": "mmlu:high_school_mathematics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_mathematics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 270,
"effective_num_docs": 270,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_microeconomics": {
"name": "mmlu:high_school_microeconomics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_microeconomics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 238,
"effective_num_docs": 238,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_physics": {
"name": "mmlu:high_school_physics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_physics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 151,
"effective_num_docs": 151,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_psychology": {
"name": "mmlu:high_school_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 545,
"effective_num_docs": 545,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_statistics": {
"name": "mmlu:high_school_statistics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_statistics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 216,
"effective_num_docs": 216,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_us_history": {
"name": "mmlu:high_school_us_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_us_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 204,
"effective_num_docs": 204,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:high_school_world_history": {
"name": "mmlu:high_school_world_history",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "high_school_world_history",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 237,
"effective_num_docs": 237,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:human_aging": {
"name": "mmlu:human_aging",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_aging",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 223,
"effective_num_docs": 223,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:human_sexuality": {
"name": "mmlu:human_sexuality",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "human_sexuality",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 131,
"effective_num_docs": 131,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:international_law": {
"name": "mmlu:international_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "international_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 121,
"effective_num_docs": 121,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:jurisprudence": {
"name": "mmlu:jurisprudence",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "jurisprudence",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 108,
"effective_num_docs": 108,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:logical_fallacies": {
"name": "mmlu:logical_fallacies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "logical_fallacies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 163,
"effective_num_docs": 163,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:machine_learning": {
"name": "mmlu:machine_learning",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "machine_learning",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 112,
"effective_num_docs": 112,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:management": {
"name": "mmlu:management",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "management",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 103,
"effective_num_docs": 103,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:marketing": {
"name": "mmlu:marketing",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "marketing",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 234,
"effective_num_docs": 234,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:medical_genetics": {
"name": "mmlu:medical_genetics",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "medical_genetics",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:miscellaneous": {
"name": "mmlu:miscellaneous",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "miscellaneous",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 783,
"effective_num_docs": 783,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:moral_disputes": {
"name": "mmlu:moral_disputes",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_disputes",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 346,
"effective_num_docs": 346,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:moral_scenarios": {
"name": "mmlu:moral_scenarios",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "moral_scenarios",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 895,
"effective_num_docs": 895,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:nutrition": {
"name": "mmlu:nutrition",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "nutrition",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 306,
"effective_num_docs": 306,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:philosophy": {
"name": "mmlu:philosophy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "philosophy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 311,
"effective_num_docs": 311,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:prehistory": {
"name": "mmlu:prehistory",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "prehistory",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 324,
"effective_num_docs": 324,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:professional_accounting": {
"name": "mmlu:professional_accounting",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_accounting",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 282,
"effective_num_docs": 282,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:professional_law": {
"name": "mmlu:professional_law",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_law",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 1534,
"effective_num_docs": 1534,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:professional_medicine": {
"name": "mmlu:professional_medicine",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_medicine",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 272,
"effective_num_docs": 272,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:professional_psychology": {
"name": "mmlu:professional_psychology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "professional_psychology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 612,
"effective_num_docs": 612,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:public_relations": {
"name": "mmlu:public_relations",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "public_relations",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 110,
"effective_num_docs": 110,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:security_studies": {
"name": "mmlu:security_studies",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "security_studies",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 245,
"effective_num_docs": 245,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:sociology": {
"name": "mmlu:sociology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "sociology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 201,
"effective_num_docs": 201,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:us_foreign_policy": {
"name": "mmlu:us_foreign_policy",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "us_foreign_policy",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 100,
"effective_num_docs": 100,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:virology": {
"name": "mmlu:virology",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "virology",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 166,
"effective_num_docs": 166,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|mmlu:world_religions": {
"name": "mmlu:world_religions",
"prompt_function": "mmlu_harness",
"hf_repo": "lighteval/mmlu",
"hf_subset": "world_religions",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"auxiliary_train",
"test",
"validation",
"dev"
],
"evaluation_splits": [
"test"
],
"few_shots_split": "dev",
"few_shots_select": "sequential",
"generation_size": 1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard",
"mmlu"
],
"original_num_docs": 171,
"effective_num_docs": 171,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|truthfulqa:mc": {
"name": "truthfulqa:mc",
"prompt_function": "truthful_qa_multiple_choice",
"hf_repo": "truthful_qa",
"hf_subset": "multiple_choice",
"metric": [
"truthfulqa_mc_metrics"
],
"hf_avail_splits": [
"validation"
],
"evaluation_splits": [
"validation"
],
"few_shots_split": null,
"few_shots_select": null,
"generation_size": -1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard"
],
"original_num_docs": 817,
"effective_num_docs": 817,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
},
"leaderboard|winogrande": {
"name": "winogrande",
"prompt_function": "winogrande",
"hf_repo": "winogrande",
"hf_subset": "winogrande_xl",
"metric": [
"loglikelihood_acc"
],
"hf_avail_splits": [
"train",
"test",
"validation"
],
"evaluation_splits": [
"validation"
],
"few_shots_split": null,
"few_shots_select": "random_sampling",
"generation_size": -1,
"stop_sequence": [
"\n"
],
"output_regex": null,
"num_samples": null,
"frozen": false,
"suite": [
"leaderboard"
],
"original_num_docs": 1267,
"effective_num_docs": 1267,
"trust_dataset": true,
"must_remove_duplicate_docs": null,
"version": 0
}
},
"summary_tasks": {
"leaderboard|arc:challenge|25": {
"hashes": {
"hash_examples": "17b0cae357c0259e",
"hash_full_prompts": "4aeb23a740784b86",
"hash_input_tokens": "2e9e18067d1f8ad8",
"hash_cont_tokens": "19baa8a044eaaac8"
},
"truncated": 0,
"non_truncated": 1172,
"padded": 4687,
"non_padded": 0,
"effective_few_shots": 25.0,
"num_truncated_few_shots": 0
},
"leaderboard|hellaswag|10": {
"hashes": {
"hash_examples": "31985c805c3a737e",
"hash_full_prompts": "3c2d3440e190b07b",
"hash_input_tokens": "412fc1d29623282b",
"hash_cont_tokens": "823c88a16c837063"
},
"truncated": 0,
"non_truncated": 10042,
"padded": 40105,
"non_padded": 63,
"effective_few_shots": 10.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:abstract_algebra|5": {
"hashes": {
"hash_examples": "4c76229e00c9c0e9",
"hash_full_prompts": "faefa0cccb952fe0",
"hash_input_tokens": "e7380c35f0e2c4b3",
"hash_cont_tokens": "a886b3552371a98b"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:anatomy|5": {
"hashes": {
"hash_examples": "6a1f8104dccbd33b",
"hash_full_prompts": "eacd03e46972fa59",
"hash_input_tokens": "2ee8bc2ef4561b6b",
"hash_cont_tokens": "9be31d13c42ead00"
},
"truncated": 0,
"non_truncated": 135,
"padded": 540,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:astronomy|5": {
"hashes": {
"hash_examples": "1302effa3a76ce4c",
"hash_full_prompts": "826cacbdf1f6bfd0",
"hash_input_tokens": "6ab8d24255ff03b3",
"hash_cont_tokens": "30cc2b2fc1294aac"
},
"truncated": 0,
"non_truncated": 152,
"padded": 608,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:business_ethics|5": {
"hashes": {
"hash_examples": "03cb8bce5336419a",
"hash_full_prompts": "518511169382ac39",
"hash_input_tokens": "8be4f0cc9ce448e1",
"hash_cont_tokens": "4e9d83c717b7deb8"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:clinical_knowledge|5": {
"hashes": {
"hash_examples": "ffbb9c7b2be257f9",
"hash_full_prompts": "0b07b0bc774fdfd9",
"hash_input_tokens": "413166c01db52a72",
"hash_cont_tokens": "40dd7263ce5af5de"
},
"truncated": 0,
"non_truncated": 265,
"padded": 1060,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_biology|5": {
"hashes": {
"hash_examples": "3ee77f176f38eb8e",
"hash_full_prompts": "22cbe0e8dabf98b1",
"hash_input_tokens": "0dcd583202383d43",
"hash_cont_tokens": "1892d80e82b394c0"
},
"truncated": 0,
"non_truncated": 144,
"padded": 576,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_chemistry|5": {
"hashes": {
"hash_examples": "ce61a69c46d47aeb",
"hash_full_prompts": "9c1288940a4afb59",
"hash_input_tokens": "59a4f0d36881d644",
"hash_cont_tokens": "b6bb78fb2d7e4e6f"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_computer_science|5": {
"hashes": {
"hash_examples": "32805b52d7d5daab",
"hash_full_prompts": "9522781d0cdf1a43",
"hash_input_tokens": "302a2f1d05b53513",
"hash_cont_tokens": "6a5da979260e607c"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_mathematics|5": {
"hashes": {
"hash_examples": "55da1a0a0bd33722",
"hash_full_prompts": "72fe6f46a57e6ca4",
"hash_input_tokens": "042f1988f13b8f9a",
"hash_cont_tokens": "62df3b0447bd3b12"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_medicine|5": {
"hashes": {
"hash_examples": "c33e143163049176",
"hash_full_prompts": "dee0989b2c8993f4",
"hash_input_tokens": "6dd81075c8e816e9",
"hash_cont_tokens": "933c01711a0757a0"
},
"truncated": 0,
"non_truncated": 173,
"padded": 692,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:college_physics|5": {
"hashes": {
"hash_examples": "ebdab1cdb7e555df",
"hash_full_prompts": "a1be6b64ea1948c3",
"hash_input_tokens": "37818fa59254732b",
"hash_cont_tokens": "d36569ab90faad7c"
},
"truncated": 0,
"non_truncated": 102,
"padded": 408,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:computer_security|5": {
"hashes": {
"hash_examples": "a24fd7d08a560921",
"hash_full_prompts": "01bc3fdfdefe67a4",
"hash_input_tokens": "d4957d5a9d5e83ec",
"hash_cont_tokens": "a886b3552371a98b"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:conceptual_physics|5": {
"hashes": {
"hash_examples": "8300977a79386993",
"hash_full_prompts": "b39315a8ada3ca79",
"hash_input_tokens": "c146a84803f78c9e",
"hash_cont_tokens": "6408f70f3d9ada31"
},
"truncated": 0,
"non_truncated": 235,
"padded": 940,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:econometrics|5": {
"hashes": {
"hash_examples": "ddde36788a04a46f",
"hash_full_prompts": "70bab37ca5fcc48f",
"hash_input_tokens": "086bc025be133096",
"hash_cont_tokens": "3befa885ca6e4b97"
},
"truncated": 0,
"non_truncated": 114,
"padded": 456,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:electrical_engineering|5": {
"hashes": {
"hash_examples": "acbc5def98c19b3f",
"hash_full_prompts": "86a4747481c11c61",
"hash_input_tokens": "b83507ac94ded59b",
"hash_cont_tokens": "e75df8f470aa4973"
},
"truncated": 0,
"non_truncated": 145,
"padded": 580,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:elementary_mathematics|5": {
"hashes": {
"hash_examples": "146e61d07497a9bd",
"hash_full_prompts": "1fe56333735325fa",
"hash_input_tokens": "8c3c868b34bad37b",
"hash_cont_tokens": "f09c97e7f7f9af71"
},
"truncated": 0,
"non_truncated": 378,
"padded": 1512,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:formal_logic|5": {
"hashes": {
"hash_examples": "8635216e1909a03f",
"hash_full_prompts": "cc83c1ede45f974c",
"hash_input_tokens": "bb0616a24585501c",
"hash_cont_tokens": "df96e75b4eb1d7b0"
},
"truncated": 0,
"non_truncated": 126,
"padded": 504,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:global_facts|5": {
"hashes": {
"hash_examples": "30b315aa6353ee47",
"hash_full_prompts": "3a2ec1e2785c69a5",
"hash_input_tokens": "5e840dc7f1c55a67",
"hash_cont_tokens": "a886b3552371a98b"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_biology|5": {
"hashes": {
"hash_examples": "c9136373af2180de",
"hash_full_prompts": "27646a569cf2a6f8",
"hash_input_tokens": "1dce672a00c5cbe1",
"hash_cont_tokens": "c6d11e73dc85157f"
},
"truncated": 0,
"non_truncated": 310,
"padded": 1240,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_chemistry|5": {
"hashes": {
"hash_examples": "b0661bfa1add6404",
"hash_full_prompts": "6905c6ca76f7b2b7",
"hash_input_tokens": "7fb2dd590b34e445",
"hash_cont_tokens": "208aff39cfca671a"
},
"truncated": 0,
"non_truncated": 203,
"padded": 812,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_computer_science|5": {
"hashes": {
"hash_examples": "80fc1d623a3d665f",
"hash_full_prompts": "b80092241e8b6c06",
"hash_input_tokens": "b2a9091fd8d00b66",
"hash_cont_tokens": "150a6d581009fbe0"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_european_history|5": {
"hashes": {
"hash_examples": "854da6e5af0fe1a1",
"hash_full_prompts": "a3bc32a5dc022ce7",
"hash_input_tokens": "393e215e8667fde4",
"hash_cont_tokens": "7b6f4c22b304c3cc"
},
"truncated": 0,
"non_truncated": 165,
"padded": 656,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_geography|5": {
"hashes": {
"hash_examples": "7dc963c7acd19ad8",
"hash_full_prompts": "53f91beae305905d",
"hash_input_tokens": "439ac435fc478534",
"hash_cont_tokens": "1a85c9e696d91a66"
},
"truncated": 0,
"non_truncated": 198,
"padded": 792,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_government_and_politics|5": {
"hashes": {
"hash_examples": "1f675dcdebc9758f",
"hash_full_prompts": "623fd7e3495f243f",
"hash_input_tokens": "2c5757b8545f7cf8",
"hash_cont_tokens": "a47a4530b8790081"
},
"truncated": 0,
"non_truncated": 193,
"padded": 772,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_macroeconomics|5": {
"hashes": {
"hash_examples": "2fb32cf2d80f0b35",
"hash_full_prompts": "378ac13c8abb6c5f",
"hash_input_tokens": "afea2ca30b1622ff",
"hash_cont_tokens": "e71e7c6acf44c3e5"
},
"truncated": 0,
"non_truncated": 390,
"padded": 1560,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_mathematics|5": {
"hashes": {
"hash_examples": "fd6646fdb5d58a1f",
"hash_full_prompts": "14d34e0b34750627",
"hash_input_tokens": "34e63b0902b32a2c",
"hash_cont_tokens": "e36b5624bdbe96b0"
},
"truncated": 0,
"non_truncated": 270,
"padded": 1080,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_microeconomics|5": {
"hashes": {
"hash_examples": "2118f21f71d87d84",
"hash_full_prompts": "9ac09e5d4da991c9",
"hash_input_tokens": "93d1c1ba5fe0bcbd",
"hash_cont_tokens": "a5f61d5beba13cc2"
},
"truncated": 0,
"non_truncated": 238,
"padded": 952,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_physics|5": {
"hashes": {
"hash_examples": "dc3ce06378548565",
"hash_full_prompts": "b4832a554d47d224",
"hash_input_tokens": "f5bf59bc9f6839fe",
"hash_cont_tokens": "df1d218ccbc258e8"
},
"truncated": 0,
"non_truncated": 151,
"padded": 604,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_psychology|5": {
"hashes": {
"hash_examples": "c8d1d98a40e11f2f",
"hash_full_prompts": "1e8cd27064546274",
"hash_input_tokens": "329851f26db67226",
"hash_cont_tokens": "6fb549a4eb8e6c47"
},
"truncated": 0,
"non_truncated": 545,
"padded": 2180,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_statistics|5": {
"hashes": {
"hash_examples": "666c8759b98ee4ff",
"hash_full_prompts": "e05ab41077ec0afa",
"hash_input_tokens": "7abad93393993e44",
"hash_cont_tokens": "d9528c65af653d67"
},
"truncated": 0,
"non_truncated": 216,
"padded": 864,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_us_history|5": {
"hashes": {
"hash_examples": "95fef1c4b7d3f81e",
"hash_full_prompts": "a4b275996a416b4a",
"hash_input_tokens": "e5def820604ad889",
"hash_cont_tokens": "8b827fc7dfd3c1c5"
},
"truncated": 0,
"non_truncated": 204,
"padded": 816,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:high_school_world_history|5": {
"hashes": {
"hash_examples": "7e5085b6184b0322",
"hash_full_prompts": "8adf16361f0f320a",
"hash_input_tokens": "aa85ae4eba20e53f",
"hash_cont_tokens": "82f19c159c69a66d"
},
"truncated": 0,
"non_truncated": 237,
"padded": 948,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_aging|5": {
"hashes": {
"hash_examples": "c17333e7c7c10797",
"hash_full_prompts": "918d91a3141aac4d",
"hash_input_tokens": "297fceccf01a2c64",
"hash_cont_tokens": "ca87074f1dc39668"
},
"truncated": 0,
"non_truncated": 223,
"padded": 892,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:human_sexuality|5": {
"hashes": {
"hash_examples": "4edd1e9045df5e3d",
"hash_full_prompts": "bcee39ecea32fcc8",
"hash_input_tokens": "7c66a375881d6788",
"hash_cont_tokens": "491a0ab53f54aeb9"
},
"truncated": 0,
"non_truncated": 131,
"padded": 524,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:international_law|5": {
"hashes": {
"hash_examples": "db2fa00d771a062a",
"hash_full_prompts": "ffe12a3b5bf350c2",
"hash_input_tokens": "dc0250213736abca",
"hash_cont_tokens": "e3d257d7ea257fc8"
},
"truncated": 0,
"non_truncated": 121,
"padded": 484,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:jurisprudence|5": {
"hashes": {
"hash_examples": "e956f86b124076fe",
"hash_full_prompts": "b4293c3c08bebaf7",
"hash_input_tokens": "c9ed773ed04cff64",
"hash_cont_tokens": "4c69d7671fa1ab1c"
},
"truncated": 0,
"non_truncated": 108,
"padded": 432,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:logical_fallacies|5": {
"hashes": {
"hash_examples": "956e0e6365ab79f1",
"hash_full_prompts": "8c1b7733e98cbe81",
"hash_input_tokens": "a4f6df541a56c41a",
"hash_cont_tokens": "57e78d3d09b7db81"
},
"truncated": 0,
"non_truncated": 163,
"padded": 652,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:machine_learning|5": {
"hashes": {
"hash_examples": "397997cc6f4d581e",
"hash_full_prompts": "24a206a1c639ab8d",
"hash_input_tokens": "f0dfd08579d1f727",
"hash_cont_tokens": "94d2ec6c52bb7b53"
},
"truncated": 0,
"non_truncated": 112,
"padded": 448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:management|5": {
"hashes": {
"hash_examples": "2bcbe6f6ca63d740",
"hash_full_prompts": "77e1c79d988beecc",
"hash_input_tokens": "15925fd62ddd3ca4",
"hash_cont_tokens": "79499fecb18f1cb1"
},
"truncated": 0,
"non_truncated": 103,
"padded": 412,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:marketing|5": {
"hashes": {
"hash_examples": "8ddb20d964a1b065",
"hash_full_prompts": "83cec2fa6b681d9d",
"hash_input_tokens": "6eb177c438da2061",
"hash_cont_tokens": "c5e9cd86b1a58fac"
},
"truncated": 0,
"non_truncated": 234,
"padded": 936,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:medical_genetics|5": {
"hashes": {
"hash_examples": "182a71f4763d2cea",
"hash_full_prompts": "195eb7ff99749730",
"hash_input_tokens": "5adeca0d34767f29",
"hash_cont_tokens": "a886b3552371a98b"
},
"truncated": 0,
"non_truncated": 100,
"padded": 400,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:miscellaneous|5": {
"hashes": {
"hash_examples": "4c404fdbb4ca57fc",
"hash_full_prompts": "33539955c9a96851",
"hash_input_tokens": "52aee92a69c2b698",
"hash_cont_tokens": "8578b82c42cc7026"
},
"truncated": 0,
"non_truncated": 783,
"padded": 3132,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_disputes|5": {
"hashes": {
"hash_examples": "60cbd2baa3fea5c9",
"hash_full_prompts": "009b7d0e7f819eff",
"hash_input_tokens": "f24c046b105c5e03",
"hash_cont_tokens": "26b0f808ec46464d"
},
"truncated": 0,
"non_truncated": 346,
"padded": 1384,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:moral_scenarios|5": {
"hashes": {
"hash_examples": "fd8b0431fbdd75ef",
"hash_full_prompts": "f6e63c9fb9d3bff0",
"hash_input_tokens": "08eee0e3d8e89710",
"hash_cont_tokens": "52fe77d28aefc1b3"
},
"truncated": 0,
"non_truncated": 895,
"padded": 3580,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:nutrition|5": {
"hashes": {
"hash_examples": "71e55e2b829b6528",
"hash_full_prompts": "8294d5e3ad435377",
"hash_input_tokens": "5b2c6686c8fc5e83",
"hash_cont_tokens": "25850a01b4a11b53"
},
"truncated": 0,
"non_truncated": 306,
"padded": 1224,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:philosophy|5": {
"hashes": {
"hash_examples": "a6d489a8d208fa4b",
"hash_full_prompts": "db68c0f4503e4793",
"hash_input_tokens": "7108ad04b556854f",
"hash_cont_tokens": "8c34ab2fa65c3b6e"
},
"truncated": 0,
"non_truncated": 311,
"padded": 1244,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:prehistory|5": {
"hashes": {
"hash_examples": "6cc50f032a19acaa",
"hash_full_prompts": "3972bcfa8c80e964",
"hash_input_tokens": "65cb6b1efc71921b",
"hash_cont_tokens": "89f21e5f9c7d81f2"
},
"truncated": 0,
"non_truncated": 324,
"padded": 1296,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_accounting|5": {
"hashes": {
"hash_examples": "50f57ab32f5f6cea",
"hash_full_prompts": "25f0becc2483bd32",
"hash_input_tokens": "c1b1c1e1f1ca4a85",
"hash_cont_tokens": "c7c4930a659ca843"
},
"truncated": 0,
"non_truncated": 282,
"padded": 1120,
"non_padded": 8,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_law|5": {
"hashes": {
"hash_examples": "a8fdc85c64f4b215",
"hash_full_prompts": "7a6f6c5706f00c7d",
"hash_input_tokens": "e7517115da0204cd",
"hash_cont_tokens": "6f36bd560ae36f02"
},
"truncated": 0,
"non_truncated": 1534,
"padded": 6136,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_medicine|5": {
"hashes": {
"hash_examples": "c373a28a3050a73a",
"hash_full_prompts": "a74b6ac7c5c545d2",
"hash_input_tokens": "da6af6d03e682017",
"hash_cont_tokens": "ca4398b4ad3db5f1"
},
"truncated": 0,
"non_truncated": 272,
"padded": 1088,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:professional_psychology|5": {
"hashes": {
"hash_examples": "bf5254fe818356af",
"hash_full_prompts": "c53fa139ec25f502",
"hash_input_tokens": "c6dbaf3c7103ebe9",
"hash_cont_tokens": "ce4bb75e80359fe4"
},
"truncated": 0,
"non_truncated": 612,
"padded": 2448,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:public_relations|5": {
"hashes": {
"hash_examples": "b66d52e28e7d14e0",
"hash_full_prompts": "55b5eff05aa6bf13",
"hash_input_tokens": "deea75b6eec5b782",
"hash_cont_tokens": "680235f5ede0b353"
},
"truncated": 0,
"non_truncated": 110,
"padded": 440,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:security_studies|5": {
"hashes": {
"hash_examples": "514c14feaf000ad9",
"hash_full_prompts": "6690ecdc054f7b0c",
"hash_input_tokens": "deef3d39896aca43",
"hash_cont_tokens": "189956efcec12818"
},
"truncated": 0,
"non_truncated": 245,
"padded": 980,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:sociology|5": {
"hashes": {
"hash_examples": "f6c9bc9d18c80870",
"hash_full_prompts": "945fbdd091c72d64",
"hash_input_tokens": "330fffbccabf89e4",
"hash_cont_tokens": "2178ff937c0c1a29"
},
"truncated": 0,
"non_truncated": 201,
"padded": 804,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:us_foreign_policy|5": {
"hashes": {
"hash_examples": "ed7b78629db6678f",
"hash_full_prompts": "ebba6ea6eca4ae53",
"hash_input_tokens": "0ec87fa768a47632",
"hash_cont_tokens": "a886b3552371a98b"
},
"truncated": 0,
"non_truncated": 100,
"padded": 392,
"non_padded": 8,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:virology|5": {
"hashes": {
"hash_examples": "bc52ffdc3f9b994a",
"hash_full_prompts": "a2ee4984d6877fe3",
"hash_input_tokens": "cc264818195d14da",
"hash_cont_tokens": "ec5c187546c7c842"
},
"truncated": 0,
"non_truncated": 166,
"padded": 660,
"non_padded": 4,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|mmlu:world_religions|5": {
"hashes": {
"hash_examples": "ecdb4a4f94f62930",
"hash_full_prompts": "a89c8dddd1d8ced0",
"hash_input_tokens": "e7e781ba363743eb",
"hash_cont_tokens": "e52b573046cdfc5c"
},
"truncated": 0,
"non_truncated": 171,
"padded": 684,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|truthfulqa:mc|0": {
"hashes": {
"hash_examples": "36a6d90e75d92d4a",
"hash_full_prompts": "8d9ca0a8bd458a1c",
"hash_input_tokens": "4aad1a3bfe70acfc",
"hash_cont_tokens": "b0f64f6659d8c230"
},
"truncated": 0,
"non_truncated": 817,
"padded": 9996,
"non_padded": 0,
"effective_few_shots": 0.0,
"num_truncated_few_shots": 0
},
"leaderboard|winogrande|5": {
"hashes": {
"hash_examples": "087d5d1a1afd4c7b",
"hash_full_prompts": "35da55e47222e0e1",
"hash_input_tokens": "881c630a9e0034f7",
"hash_cont_tokens": "c466f4c92e3879cb"
},
"truncated": 0,
"non_truncated": 1267,
"padded": 2534,
"non_padded": 0,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
},
"leaderboard|gsm8k|5": {
"hashes": {
"hash_examples": "0ed016e24e7512fd",
"hash_full_prompts": "f7ab209f6467841e",
"hash_input_tokens": "deccfe61ad5cb3d5",
"hash_cont_tokens": "95cc4cc1148eb790"
},
"truncated": 1319,
"non_truncated": 0,
"padded": 1074,
"non_padded": 245,
"effective_few_shots": 5.0,
"num_truncated_few_shots": 0
}
},
"summary_general": {
"hashes": {
"hash_examples": "670666fa3a90ce5d",
"hash_full_prompts": "56c005e427046302",
"hash_input_tokens": "2a51da62c271a1a0",
"hash_cont_tokens": "a74619de92c05f2e"
},
"truncated": 1319,
"non_truncated": 27340,
"padded": 114540,
"non_padded": 332,
"num_truncated_few_shots": 0
}
}