{ "config_general": { "lighteval_sha": "a98210fd3a2d1e8bface1c32b72ebd5017173a4c", "num_fewshot_seeds": 1, "override_batch_size": -1, "max_samples": null, "job_id": "", "start_time": 2236465.645254106, "end_time": 2260133.586933212, "total_evaluation_time_secondes": "23667.941679106094", "model_name": "meta-llama/Meta-Llama-3-8B-Instruct", "model_sha": "c4a54320a52ed5f88b7a2f84496903ea4ff07b45", "model_dtype": "torch.bfloat16", "model_size": "14.96 GB", "config": null }, "results": { "leaderboard|arc:challenge|25": { "acc": 0.5742320819112628, "acc_stderr": 0.01444946427886881, "acc_norm": 0.5827645051194539, "acc_norm_stderr": 0.014409825518403082 }, "leaderboard|hellaswag|10": { "acc": 0.5707030472017527, "acc_stderr": 0.004939642460172585, "acc_norm": 0.7310296753634734, "acc_norm_stderr": 0.004425182676353211 }, "leaderboard|mmlu:abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.047258156262526045 }, "leaderboard|mmlu:anatomy|5": { "acc": 0.6814814814814815, "acc_stderr": 0.040247784019771096 }, "leaderboard|mmlu:astronomy|5": { "acc": 0.75, "acc_stderr": 0.03523807393012047 }, "leaderboard|mmlu:business_ethics|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814 }, "leaderboard|mmlu:clinical_knowledge|5": { "acc": 0.7471698113207547, "acc_stderr": 0.026749899771241214 }, "leaderboard|mmlu:college_biology|5": { "acc": 0.7916666666666666, "acc_stderr": 0.033961162058453336 }, "leaderboard|mmlu:college_chemistry|5": { "acc": 0.45, "acc_stderr": 0.05 }, "leaderboard|mmlu:college_computer_science|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795 }, "leaderboard|mmlu:college_mathematics|5": { "acc": 0.39, "acc_stderr": 0.04902071300001975 }, "leaderboard|mmlu:college_medicine|5": { "acc": 0.6589595375722543, "acc_stderr": 0.036146654241808254 }, "leaderboard|mmlu:college_physics|5": { "acc": 0.43137254901960786, "acc_stderr": 0.04928099597287533 }, "leaderboard|mmlu:computer_security|5": { "acc": 0.79, "acc_stderr": 0.04093601807403326 }, "leaderboard|mmlu:conceptual_physics|5": { "acc": 0.5872340425531914, "acc_stderr": 0.03218471141400351 }, "leaderboard|mmlu:econometrics|5": { "acc": 0.543859649122807, "acc_stderr": 0.046854730419077895 }, "leaderboard|mmlu:electrical_engineering|5": { "acc": 0.6137931034482759, "acc_stderr": 0.04057324734419035 }, "leaderboard|mmlu:elementary_mathematics|5": { "acc": 0.46825396825396826, "acc_stderr": 0.0256993528321318 }, "leaderboard|mmlu:formal_logic|5": { "acc": 0.5317460317460317, "acc_stderr": 0.04463112720677172 }, "leaderboard|mmlu:global_facts|5": { "acc": 0.46, "acc_stderr": 0.05009082659620332 }, "leaderboard|mmlu:high_school_biology|5": { "acc": 0.8064516129032258, "acc_stderr": 0.022475258525536057 }, "leaderboard|mmlu:high_school_chemistry|5": { "acc": 0.541871921182266, "acc_stderr": 0.03505630140785741 }, "leaderboard|mmlu:high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505 }, "leaderboard|mmlu:high_school_european_history|5": { "acc": 0.7393939393939394, "acc_stderr": 0.034277431758165236 }, "leaderboard|mmlu:high_school_geography|5": { "acc": 0.8131313131313131, "acc_stderr": 0.027772533334218957 }, "leaderboard|mmlu:high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.02199531196364424 }, "leaderboard|mmlu:high_school_macroeconomics|5": { "acc": 0.676923076923077, "acc_stderr": 0.023710888501970555 }, "leaderboard|mmlu:high_school_mathematics|5": { "acc": 0.32592592592592595, "acc_stderr": 0.028578348365473072 }, "leaderboard|mmlu:high_school_microeconomics|5": { 
"acc": 0.7563025210084033, "acc_stderr": 0.027886828078380548 }, "leaderboard|mmlu:high_school_physics|5": { "acc": 0.4105960264900662, "acc_stderr": 0.04016689594849927 }, "leaderboard|mmlu:high_school_psychology|5": { "acc": 0.8477064220183487, "acc_stderr": 0.015405084393157074 }, "leaderboard|mmlu:high_school_statistics|5": { "acc": 0.47685185185185186, "acc_stderr": 0.03406315360711507 }, "leaderboard|mmlu:high_school_us_history|5": { "acc": 0.7892156862745098, "acc_stderr": 0.028626547912437406 }, "leaderboard|mmlu:high_school_world_history|5": { "acc": 0.8396624472573839, "acc_stderr": 0.023884380925965665 }, "leaderboard|mmlu:human_aging|5": { "acc": 0.726457399103139, "acc_stderr": 0.029918586707798827 }, "leaderboard|mmlu:human_sexuality|5": { "acc": 0.7938931297709924, "acc_stderr": 0.03547771004159462 }, "leaderboard|mmlu:international_law|5": { "acc": 0.768595041322314, "acc_stderr": 0.03849856098794088 }, "leaderboard|mmlu:jurisprudence|5": { "acc": 0.7592592592592593, "acc_stderr": 0.04133119440243839 }, "leaderboard|mmlu:logical_fallacies|5": { "acc": 0.7607361963190185, "acc_stderr": 0.033519538795212696 }, "leaderboard|mmlu:machine_learning|5": { "acc": 0.5267857142857143, "acc_stderr": 0.047389751192741546 }, "leaderboard|mmlu:management|5": { "acc": 0.8155339805825242, "acc_stderr": 0.03840423627288276 }, "leaderboard|mmlu:marketing|5": { "acc": 0.905982905982906, "acc_stderr": 0.019119892798924974 }, "leaderboard|mmlu:medical_genetics|5": { "acc": 0.79, "acc_stderr": 0.040936018074033256 }, "leaderboard|mmlu:miscellaneous|5": { "acc": 0.8237547892720306, "acc_stderr": 0.013625556907993455 }, "leaderboard|mmlu:moral_disputes|5": { "acc": 0.7398843930635838, "acc_stderr": 0.023618678310069356 }, "leaderboard|mmlu:moral_scenarios|5": { "acc": 0.43575418994413406, "acc_stderr": 0.016583881958602387 }, "leaderboard|mmlu:nutrition|5": { "acc": 0.7549019607843137, "acc_stderr": 0.024630048979824785 }, "leaderboard|mmlu:philosophy|5": { "acc": 0.7331189710610932, "acc_stderr": 0.025122637608816657 }, "leaderboard|mmlu:prehistory|5": { "acc": 0.7469135802469136, "acc_stderr": 0.024191808600713002 }, "leaderboard|mmlu:professional_accounting|5": { "acc": 0.5177304964539007, "acc_stderr": 0.02980873964223777 }, "leaderboard|mmlu:professional_law|5": { "acc": 0.46479791395045633, "acc_stderr": 0.012738547371303956 }, "leaderboard|mmlu:professional_medicine|5": { "acc": 0.7279411764705882, "acc_stderr": 0.027033041151681456 }, "leaderboard|mmlu:professional_psychology|5": { "acc": 0.6928104575163399, "acc_stderr": 0.018663359671463677 }, "leaderboard|mmlu:public_relations|5": { "acc": 0.6636363636363637, "acc_stderr": 0.04525393596302505 }, "leaderboard|mmlu:security_studies|5": { "acc": 0.7306122448979592, "acc_stderr": 0.02840125202902294 }, "leaderboard|mmlu:sociology|5": { "acc": 0.8557213930348259, "acc_stderr": 0.02484575321230604 }, "leaderboard|mmlu:us_foreign_policy|5": { "acc": 0.86, "acc_stderr": 0.03487350880197769 }, "leaderboard|mmlu:virology|5": { "acc": 0.536144578313253, "acc_stderr": 0.03882310850890594 }, "leaderboard|mmlu:world_religions|5": { "acc": 0.7953216374269005, "acc_stderr": 0.030944459778533193 }, "leaderboard|truthfulqa:mc|0": { "truthfulqa_mc1": 0.37454100367197063, "truthfulqa_mc1_stderr": 0.016943535128405338, "truthfulqa_mc2": 0.5337684444397199, "truthfulqa_mc2_stderr": 0.015971485281891525 }, "leaderboard|winogrande|5": { "acc": 0.6929755327545383, "acc_stderr": 0.012963688616969483 }, "leaderboard|gsm8k|5": { "qem": 0.6808188021228203, 
"qem_stderr": 0.012840345676251653 }, "leaderboard|mmlu:_average|5": { "acc": 0.6661794809691, "acc_stderr": 0.033327669029227354 }, "all": { "acc": 0.6635023512851042, "acc_stderr": 0.032200498833699506, "acc_norm": 0.6568970902414637, "acc_norm_stderr": 0.009417504097378147, "truthfulqa_mc1": 0.37454100367197063, "truthfulqa_mc1_stderr": 0.016943535128405338, "truthfulqa_mc2": 0.5337684444397199, "truthfulqa_mc2_stderr": 0.015971485281891525, "qem": 0.6808188021228203, "qem_stderr": 0.012840345676251653 } }, "versions": { "leaderboard|arc:challenge|25": 0, "leaderboard|gsm8k|5": 0, "leaderboard|hellaswag|10": 0, "leaderboard|mmlu:abstract_algebra|5": 0, "leaderboard|mmlu:anatomy|5": 0, "leaderboard|mmlu:astronomy|5": 0, "leaderboard|mmlu:business_ethics|5": 0, "leaderboard|mmlu:clinical_knowledge|5": 0, "leaderboard|mmlu:college_biology|5": 0, "leaderboard|mmlu:college_chemistry|5": 0, "leaderboard|mmlu:college_computer_science|5": 0, "leaderboard|mmlu:college_mathematics|5": 0, "leaderboard|mmlu:college_medicine|5": 0, "leaderboard|mmlu:college_physics|5": 0, "leaderboard|mmlu:computer_security|5": 0, "leaderboard|mmlu:conceptual_physics|5": 0, "leaderboard|mmlu:econometrics|5": 0, "leaderboard|mmlu:electrical_engineering|5": 0, "leaderboard|mmlu:elementary_mathematics|5": 0, "leaderboard|mmlu:formal_logic|5": 0, "leaderboard|mmlu:global_facts|5": 0, "leaderboard|mmlu:high_school_biology|5": 0, "leaderboard|mmlu:high_school_chemistry|5": 0, "leaderboard|mmlu:high_school_computer_science|5": 0, "leaderboard|mmlu:high_school_european_history|5": 0, "leaderboard|mmlu:high_school_geography|5": 0, "leaderboard|mmlu:high_school_government_and_politics|5": 0, "leaderboard|mmlu:high_school_macroeconomics|5": 0, "leaderboard|mmlu:high_school_mathematics|5": 0, "leaderboard|mmlu:high_school_microeconomics|5": 0, "leaderboard|mmlu:high_school_physics|5": 0, "leaderboard|mmlu:high_school_psychology|5": 0, "leaderboard|mmlu:high_school_statistics|5": 0, "leaderboard|mmlu:high_school_us_history|5": 0, "leaderboard|mmlu:high_school_world_history|5": 0, "leaderboard|mmlu:human_aging|5": 0, "leaderboard|mmlu:human_sexuality|5": 0, "leaderboard|mmlu:international_law|5": 0, "leaderboard|mmlu:jurisprudence|5": 0, "leaderboard|mmlu:logical_fallacies|5": 0, "leaderboard|mmlu:machine_learning|5": 0, "leaderboard|mmlu:management|5": 0, "leaderboard|mmlu:marketing|5": 0, "leaderboard|mmlu:medical_genetics|5": 0, "leaderboard|mmlu:miscellaneous|5": 0, "leaderboard|mmlu:moral_disputes|5": 0, "leaderboard|mmlu:moral_scenarios|5": 0, "leaderboard|mmlu:nutrition|5": 0, "leaderboard|mmlu:philosophy|5": 0, "leaderboard|mmlu:prehistory|5": 0, "leaderboard|mmlu:professional_accounting|5": 0, "leaderboard|mmlu:professional_law|5": 0, "leaderboard|mmlu:professional_medicine|5": 0, "leaderboard|mmlu:professional_psychology|5": 0, "leaderboard|mmlu:public_relations|5": 0, "leaderboard|mmlu:security_studies|5": 0, "leaderboard|mmlu:sociology|5": 0, "leaderboard|mmlu:us_foreign_policy|5": 0, "leaderboard|mmlu:virology|5": 0, "leaderboard|mmlu:world_religions|5": 0, "leaderboard|truthfulqa:mc|0": 0, "leaderboard|winogrande|5": 0 }, "config_tasks": { "leaderboard|arc:challenge": { "name": "arc:challenge", "prompt_function": "arc", "hf_repo": "ai2_arc", "hf_subset": "ARC-Challenge", "metric": [ "loglikelihood_acc", "loglikelihood_acc_norm_nospace" ], "hf_avail_splits": [ "train", "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": "random_sampling_from_train", "generation_size": 1, 
"stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "arc" ], "original_num_docs": 1172, "effective_num_docs": 1172, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|gsm8k": { "name": "gsm8k", "prompt_function": "gsm8k", "hf_repo": "gsm8k", "hf_subset": "main", "metric": [ "quasi_exact_match_gsm8k" ], "hf_avail_splits": [ "train", "test" ], "evaluation_splits": [ "test" ], "few_shots_split": null, "few_shots_select": "random_sampling_from_train", "generation_size": 256, "stop_sequence": [ "Question:", "Question", ":" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 1319, "effective_num_docs": 1319, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|hellaswag": { "name": "hellaswag", "prompt_function": "hellaswag_harness", "hf_repo": "hellaswag", "hf_subset": "default", "metric": [ "loglikelihood_acc", "loglikelihood_acc_norm" ], "hf_avail_splits": [ "train", "test", "validation" ], "evaluation_splits": [ "validation" ], "few_shots_split": null, "few_shots_select": "random_sampling_from_train", "generation_size": -1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 10042, "effective_num_docs": 10042, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:abstract_algebra": { "name": "mmlu:abstract_algebra", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "abstract_algebra", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:anatomy": { "name": "mmlu:anatomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "anatomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 135, "effective_num_docs": 135, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:astronomy": { "name": "mmlu:astronomy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "astronomy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 152, "effective_num_docs": 152, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:business_ethics": { "name": "mmlu:business_ethics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "business_ethics", "metric": [ "loglikelihood_acc" ], 
"hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:clinical_knowledge": { "name": "mmlu:clinical_knowledge", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "clinical_knowledge", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 265, "effective_num_docs": 265, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:college_biology": { "name": "mmlu:college_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 144, "effective_num_docs": 144, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:college_chemistry": { "name": "mmlu:college_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:college_computer_science": { "name": "mmlu:college_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:college_mathematics": { "name": "mmlu:college_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], 
"original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:college_medicine": { "name": "mmlu:college_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 173, "effective_num_docs": 173, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:college_physics": { "name": "mmlu:college_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "college_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 102, "effective_num_docs": 102, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:computer_security": { "name": "mmlu:computer_security", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "computer_security", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:conceptual_physics": { "name": "mmlu:conceptual_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "conceptual_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 235, "effective_num_docs": 235, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:econometrics": { "name": "mmlu:econometrics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "econometrics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 114, "effective_num_docs": 114, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:electrical_engineering": { "name": "mmlu:electrical_engineering", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "electrical_engineering", "metric": [ "loglikelihood_acc" ], 
"hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 145, "effective_num_docs": 145, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:elementary_mathematics": { "name": "mmlu:elementary_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "elementary_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 378, "effective_num_docs": 378, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:formal_logic": { "name": "mmlu:formal_logic", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "formal_logic", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 126, "effective_num_docs": 126, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:global_facts": { "name": "mmlu:global_facts", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "global_facts", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_biology": { "name": "mmlu:high_school_biology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_biology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 310, "effective_num_docs": 310, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_chemistry": { "name": "mmlu:high_school_chemistry", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_chemistry", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 
203, "effective_num_docs": 203, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_computer_science": { "name": "mmlu:high_school_computer_science", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_computer_science", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_european_history": { "name": "mmlu:high_school_european_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_european_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 165, "effective_num_docs": 165, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_geography": { "name": "mmlu:high_school_geography", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_geography", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 198, "effective_num_docs": 198, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_government_and_politics": { "name": "mmlu:high_school_government_and_politics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_government_and_politics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 193, "effective_num_docs": 193, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_macroeconomics": { "name": "mmlu:high_school_macroeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_macroeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 390, "effective_num_docs": 390, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_mathematics": { "name": 
"mmlu:high_school_mathematics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_mathematics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 270, "effective_num_docs": 270, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_microeconomics": { "name": "mmlu:high_school_microeconomics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_microeconomics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 238, "effective_num_docs": 238, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_physics": { "name": "mmlu:high_school_physics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_physics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 151, "effective_num_docs": 151, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_psychology": { "name": "mmlu:high_school_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 545, "effective_num_docs": 545, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_statistics": { "name": "mmlu:high_school_statistics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_statistics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 216, "effective_num_docs": 216, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_us_history": { "name": "mmlu:high_school_us_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_us_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], 
"evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 204, "effective_num_docs": 204, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:high_school_world_history": { "name": "mmlu:high_school_world_history", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "high_school_world_history", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 237, "effective_num_docs": 237, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:human_aging": { "name": "mmlu:human_aging", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_aging", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 223, "effective_num_docs": 223, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:human_sexuality": { "name": "mmlu:human_sexuality", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "human_sexuality", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 131, "effective_num_docs": 131, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:international_law": { "name": "mmlu:international_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "international_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 121, "effective_num_docs": 121, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:jurisprudence": { "name": "mmlu:jurisprudence", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "jurisprudence", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 108, "effective_num_docs": 108, "trust_dataset": true, "must_remove_duplicate_docs": 
null, "version": 0 }, "leaderboard|mmlu:logical_fallacies": { "name": "mmlu:logical_fallacies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "logical_fallacies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 163, "effective_num_docs": 163, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:machine_learning": { "name": "mmlu:machine_learning", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "machine_learning", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 112, "effective_num_docs": 112, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:management": { "name": "mmlu:management", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "management", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 103, "effective_num_docs": 103, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:marketing": { "name": "mmlu:marketing", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "marketing", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 234, "effective_num_docs": 234, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:medical_genetics": { "name": "mmlu:medical_genetics", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "medical_genetics", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:miscellaneous": { "name": "mmlu:miscellaneous", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "miscellaneous", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", 
"generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 783, "effective_num_docs": 783, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:moral_disputes": { "name": "mmlu:moral_disputes", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_disputes", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 346, "effective_num_docs": 346, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:moral_scenarios": { "name": "mmlu:moral_scenarios", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "moral_scenarios", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 895, "effective_num_docs": 895, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:nutrition": { "name": "mmlu:nutrition", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "nutrition", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 306, "effective_num_docs": 306, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:philosophy": { "name": "mmlu:philosophy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "philosophy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 311, "effective_num_docs": 311, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:prehistory": { "name": "mmlu:prehistory", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "prehistory", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 324, "effective_num_docs": 324, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:professional_accounting": { "name": "mmlu:professional_accounting", "prompt_function": "mmlu_harness", "hf_repo": 
"lighteval/mmlu", "hf_subset": "professional_accounting", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 282, "effective_num_docs": 282, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:professional_law": { "name": "mmlu:professional_law", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_law", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 1534, "effective_num_docs": 1534, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:professional_medicine": { "name": "mmlu:professional_medicine", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_medicine", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 272, "effective_num_docs": 272, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:professional_psychology": { "name": "mmlu:professional_psychology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "professional_psychology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 612, "effective_num_docs": 612, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:public_relations": { "name": "mmlu:public_relations", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "public_relations", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 110, "effective_num_docs": 110, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:security_studies": { "name": "mmlu:security_studies", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "security_studies", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], 
"output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 245, "effective_num_docs": 245, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:sociology": { "name": "mmlu:sociology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "sociology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 201, "effective_num_docs": 201, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:us_foreign_policy": { "name": "mmlu:us_foreign_policy", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "us_foreign_policy", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 100, "effective_num_docs": 100, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:virology": { "name": "mmlu:virology", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "virology", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 166, "effective_num_docs": 166, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|mmlu:world_religions": { "name": "mmlu:world_religions", "prompt_function": "mmlu_harness", "hf_repo": "lighteval/mmlu", "hf_subset": "world_religions", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "auxiliary_train", "test", "validation", "dev" ], "evaluation_splits": [ "test" ], "few_shots_split": "dev", "few_shots_select": "sequential", "generation_size": 1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard", "mmlu" ], "original_num_docs": 171, "effective_num_docs": 171, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|truthfulqa:mc": { "name": "truthfulqa:mc", "prompt_function": "truthful_qa_multiple_choice", "hf_repo": "truthful_qa", "hf_subset": "multiple_choice", "metric": [ "truthfulqa_mc_metrics" ], "hf_avail_splits": [ "validation" ], "evaluation_splits": [ "validation" ], "few_shots_split": null, "few_shots_select": null, "generation_size": -1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 817, "effective_num_docs": 817, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 }, "leaderboard|winogrande": { "name": "winogrande", "prompt_function": "winogrande", "hf_repo": "winogrande", "hf_subset": "winogrande_xl", "metric": [ "loglikelihood_acc" ], "hf_avail_splits": [ "train", "test", 
"validation" ], "evaluation_splits": [ "validation" ], "few_shots_split": null, "few_shots_select": "random_sampling", "generation_size": -1, "stop_sequence": [ "\n" ], "output_regex": null, "num_samples": null, "frozen": false, "suite": [ "leaderboard" ], "original_num_docs": 1267, "effective_num_docs": 1267, "trust_dataset": true, "must_remove_duplicate_docs": null, "version": 0 } }, "summary_tasks": { "leaderboard|arc:challenge|25": { "hashes": { "hash_examples": "17b0cae357c0259e", "hash_full_prompts": "4aeb23a740784b86", "hash_input_tokens": "2e9e18067d1f8ad8", "hash_cont_tokens": "19baa8a044eaaac8" }, "truncated": 0, "non_truncated": 1172, "padded": 4687, "non_padded": 0, "effective_few_shots": 25.0, "num_truncated_few_shots": 0 }, "leaderboard|hellaswag|10": { "hashes": { "hash_examples": "31985c805c3a737e", "hash_full_prompts": "3c2d3440e190b07b", "hash_input_tokens": "412fc1d29623282b", "hash_cont_tokens": "823c88a16c837063" }, "truncated": 0, "non_truncated": 10042, "padded": 40105, "non_padded": 63, "effective_few_shots": 10.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:abstract_algebra|5": { "hashes": { "hash_examples": "4c76229e00c9c0e9", "hash_full_prompts": "faefa0cccb952fe0", "hash_input_tokens": "e7380c35f0e2c4b3", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:anatomy|5": { "hashes": { "hash_examples": "6a1f8104dccbd33b", "hash_full_prompts": "eacd03e46972fa59", "hash_input_tokens": "2ee8bc2ef4561b6b", "hash_cont_tokens": "9be31d13c42ead00" }, "truncated": 0, "non_truncated": 135, "padded": 540, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:astronomy|5": { "hashes": { "hash_examples": "1302effa3a76ce4c", "hash_full_prompts": "826cacbdf1f6bfd0", "hash_input_tokens": "6ab8d24255ff03b3", "hash_cont_tokens": "30cc2b2fc1294aac" }, "truncated": 0, "non_truncated": 152, "padded": 608, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:business_ethics|5": { "hashes": { "hash_examples": "03cb8bce5336419a", "hash_full_prompts": "518511169382ac39", "hash_input_tokens": "8be4f0cc9ce448e1", "hash_cont_tokens": "4e9d83c717b7deb8" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:clinical_knowledge|5": { "hashes": { "hash_examples": "ffbb9c7b2be257f9", "hash_full_prompts": "0b07b0bc774fdfd9", "hash_input_tokens": "413166c01db52a72", "hash_cont_tokens": "40dd7263ce5af5de" }, "truncated": 0, "non_truncated": 265, "padded": 1060, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_biology|5": { "hashes": { "hash_examples": "3ee77f176f38eb8e", "hash_full_prompts": "22cbe0e8dabf98b1", "hash_input_tokens": "0dcd583202383d43", "hash_cont_tokens": "1892d80e82b394c0" }, "truncated": 0, "non_truncated": 144, "padded": 576, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_chemistry|5": { "hashes": { "hash_examples": "ce61a69c46d47aeb", "hash_full_prompts": "9c1288940a4afb59", "hash_input_tokens": "59a4f0d36881d644", "hash_cont_tokens": "b6bb78fb2d7e4e6f" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_computer_science|5": { "hashes": { 
"hash_examples": "32805b52d7d5daab", "hash_full_prompts": "9522781d0cdf1a43", "hash_input_tokens": "302a2f1d05b53513", "hash_cont_tokens": "6a5da979260e607c" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_mathematics|5": { "hashes": { "hash_examples": "55da1a0a0bd33722", "hash_full_prompts": "72fe6f46a57e6ca4", "hash_input_tokens": "042f1988f13b8f9a", "hash_cont_tokens": "62df3b0447bd3b12" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_medicine|5": { "hashes": { "hash_examples": "c33e143163049176", "hash_full_prompts": "dee0989b2c8993f4", "hash_input_tokens": "6dd81075c8e816e9", "hash_cont_tokens": "933c01711a0757a0" }, "truncated": 0, "non_truncated": 173, "padded": 692, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:college_physics|5": { "hashes": { "hash_examples": "ebdab1cdb7e555df", "hash_full_prompts": "a1be6b64ea1948c3", "hash_input_tokens": "37818fa59254732b", "hash_cont_tokens": "d36569ab90faad7c" }, "truncated": 0, "non_truncated": 102, "padded": 408, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:computer_security|5": { "hashes": { "hash_examples": "a24fd7d08a560921", "hash_full_prompts": "01bc3fdfdefe67a4", "hash_input_tokens": "d4957d5a9d5e83ec", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:conceptual_physics|5": { "hashes": { "hash_examples": "8300977a79386993", "hash_full_prompts": "b39315a8ada3ca79", "hash_input_tokens": "c146a84803f78c9e", "hash_cont_tokens": "6408f70f3d9ada31" }, "truncated": 0, "non_truncated": 235, "padded": 940, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:econometrics|5": { "hashes": { "hash_examples": "ddde36788a04a46f", "hash_full_prompts": "70bab37ca5fcc48f", "hash_input_tokens": "086bc025be133096", "hash_cont_tokens": "3befa885ca6e4b97" }, "truncated": 0, "non_truncated": 114, "padded": 456, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:electrical_engineering|5": { "hashes": { "hash_examples": "acbc5def98c19b3f", "hash_full_prompts": "86a4747481c11c61", "hash_input_tokens": "b83507ac94ded59b", "hash_cont_tokens": "e75df8f470aa4973" }, "truncated": 0, "non_truncated": 145, "padded": 580, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:elementary_mathematics|5": { "hashes": { "hash_examples": "146e61d07497a9bd", "hash_full_prompts": "1fe56333735325fa", "hash_input_tokens": "8c3c868b34bad37b", "hash_cont_tokens": "f09c97e7f7f9af71" }, "truncated": 0, "non_truncated": 378, "padded": 1512, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:formal_logic|5": { "hashes": { "hash_examples": "8635216e1909a03f", "hash_full_prompts": "cc83c1ede45f974c", "hash_input_tokens": "bb0616a24585501c", "hash_cont_tokens": "df96e75b4eb1d7b0" }, "truncated": 0, "non_truncated": 126, "padded": 504, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:global_facts|5": { "hashes": { "hash_examples": "30b315aa6353ee47", "hash_full_prompts": "3a2ec1e2785c69a5", "hash_input_tokens": 
"5e840dc7f1c55a67", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_biology|5": { "hashes": { "hash_examples": "c9136373af2180de", "hash_full_prompts": "27646a569cf2a6f8", "hash_input_tokens": "1dce672a00c5cbe1", "hash_cont_tokens": "c6d11e73dc85157f" }, "truncated": 0, "non_truncated": 310, "padded": 1240, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_chemistry|5": { "hashes": { "hash_examples": "b0661bfa1add6404", "hash_full_prompts": "6905c6ca76f7b2b7", "hash_input_tokens": "7fb2dd590b34e445", "hash_cont_tokens": "208aff39cfca671a" }, "truncated": 0, "non_truncated": 203, "padded": 812, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_computer_science|5": { "hashes": { "hash_examples": "80fc1d623a3d665f", "hash_full_prompts": "b80092241e8b6c06", "hash_input_tokens": "b2a9091fd8d00b66", "hash_cont_tokens": "150a6d581009fbe0" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_european_history|5": { "hashes": { "hash_examples": "854da6e5af0fe1a1", "hash_full_prompts": "a3bc32a5dc022ce7", "hash_input_tokens": "393e215e8667fde4", "hash_cont_tokens": "7b6f4c22b304c3cc" }, "truncated": 0, "non_truncated": 165, "padded": 656, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_geography|5": { "hashes": { "hash_examples": "7dc963c7acd19ad8", "hash_full_prompts": "53f91beae305905d", "hash_input_tokens": "439ac435fc478534", "hash_cont_tokens": "1a85c9e696d91a66" }, "truncated": 0, "non_truncated": 198, "padded": 792, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_government_and_politics|5": { "hashes": { "hash_examples": "1f675dcdebc9758f", "hash_full_prompts": "623fd7e3495f243f", "hash_input_tokens": "2c5757b8545f7cf8", "hash_cont_tokens": "a47a4530b8790081" }, "truncated": 0, "non_truncated": 193, "padded": 772, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_macroeconomics|5": { "hashes": { "hash_examples": "2fb32cf2d80f0b35", "hash_full_prompts": "378ac13c8abb6c5f", "hash_input_tokens": "afea2ca30b1622ff", "hash_cont_tokens": "e71e7c6acf44c3e5" }, "truncated": 0, "non_truncated": 390, "padded": 1560, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_mathematics|5": { "hashes": { "hash_examples": "fd6646fdb5d58a1f", "hash_full_prompts": "14d34e0b34750627", "hash_input_tokens": "34e63b0902b32a2c", "hash_cont_tokens": "e36b5624bdbe96b0" }, "truncated": 0, "non_truncated": 270, "padded": 1080, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_microeconomics|5": { "hashes": { "hash_examples": "2118f21f71d87d84", "hash_full_prompts": "9ac09e5d4da991c9", "hash_input_tokens": "93d1c1ba5fe0bcbd", "hash_cont_tokens": "a5f61d5beba13cc2" }, "truncated": 0, "non_truncated": 238, "padded": 952, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_physics|5": { "hashes": { "hash_examples": "dc3ce06378548565", "hash_full_prompts": "b4832a554d47d224", "hash_input_tokens": "f5bf59bc9f6839fe", 
"hash_cont_tokens": "df1d218ccbc258e8" }, "truncated": 0, "non_truncated": 151, "padded": 604, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_psychology|5": { "hashes": { "hash_examples": "c8d1d98a40e11f2f", "hash_full_prompts": "1e8cd27064546274", "hash_input_tokens": "329851f26db67226", "hash_cont_tokens": "6fb549a4eb8e6c47" }, "truncated": 0, "non_truncated": 545, "padded": 2180, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_statistics|5": { "hashes": { "hash_examples": "666c8759b98ee4ff", "hash_full_prompts": "e05ab41077ec0afa", "hash_input_tokens": "7abad93393993e44", "hash_cont_tokens": "d9528c65af653d67" }, "truncated": 0, "non_truncated": 216, "padded": 864, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_us_history|5": { "hashes": { "hash_examples": "95fef1c4b7d3f81e", "hash_full_prompts": "a4b275996a416b4a", "hash_input_tokens": "e5def820604ad889", "hash_cont_tokens": "8b827fc7dfd3c1c5" }, "truncated": 0, "non_truncated": 204, "padded": 816, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:high_school_world_history|5": { "hashes": { "hash_examples": "7e5085b6184b0322", "hash_full_prompts": "8adf16361f0f320a", "hash_input_tokens": "aa85ae4eba20e53f", "hash_cont_tokens": "82f19c159c69a66d" }, "truncated": 0, "non_truncated": 237, "padded": 948, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:human_aging|5": { "hashes": { "hash_examples": "c17333e7c7c10797", "hash_full_prompts": "918d91a3141aac4d", "hash_input_tokens": "297fceccf01a2c64", "hash_cont_tokens": "ca87074f1dc39668" }, "truncated": 0, "non_truncated": 223, "padded": 892, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:human_sexuality|5": { "hashes": { "hash_examples": "4edd1e9045df5e3d", "hash_full_prompts": "bcee39ecea32fcc8", "hash_input_tokens": "7c66a375881d6788", "hash_cont_tokens": "491a0ab53f54aeb9" }, "truncated": 0, "non_truncated": 131, "padded": 524, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:international_law|5": { "hashes": { "hash_examples": "db2fa00d771a062a", "hash_full_prompts": "ffe12a3b5bf350c2", "hash_input_tokens": "dc0250213736abca", "hash_cont_tokens": "e3d257d7ea257fc8" }, "truncated": 0, "non_truncated": 121, "padded": 484, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:jurisprudence|5": { "hashes": { "hash_examples": "e956f86b124076fe", "hash_full_prompts": "b4293c3c08bebaf7", "hash_input_tokens": "c9ed773ed04cff64", "hash_cont_tokens": "4c69d7671fa1ab1c" }, "truncated": 0, "non_truncated": 108, "padded": 432, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:logical_fallacies|5": { "hashes": { "hash_examples": "956e0e6365ab79f1", "hash_full_prompts": "8c1b7733e98cbe81", "hash_input_tokens": "a4f6df541a56c41a", "hash_cont_tokens": "57e78d3d09b7db81" }, "truncated": 0, "non_truncated": 163, "padded": 652, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:machine_learning|5": { "hashes": { "hash_examples": "397997cc6f4d581e", "hash_full_prompts": "24a206a1c639ab8d", "hash_input_tokens": "f0dfd08579d1f727", "hash_cont_tokens": "94d2ec6c52bb7b53" }, "truncated": 0, "non_truncated": 112, "padded": 
448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:management|5": { "hashes": { "hash_examples": "2bcbe6f6ca63d740", "hash_full_prompts": "77e1c79d988beecc", "hash_input_tokens": "15925fd62ddd3ca4", "hash_cont_tokens": "79499fecb18f1cb1" }, "truncated": 0, "non_truncated": 103, "padded": 412, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:marketing|5": { "hashes": { "hash_examples": "8ddb20d964a1b065", "hash_full_prompts": "83cec2fa6b681d9d", "hash_input_tokens": "6eb177c438da2061", "hash_cont_tokens": "c5e9cd86b1a58fac" }, "truncated": 0, "non_truncated": 234, "padded": 936, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:medical_genetics|5": { "hashes": { "hash_examples": "182a71f4763d2cea", "hash_full_prompts": "195eb7ff99749730", "hash_input_tokens": "5adeca0d34767f29", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 400, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:miscellaneous|5": { "hashes": { "hash_examples": "4c404fdbb4ca57fc", "hash_full_prompts": "33539955c9a96851", "hash_input_tokens": "52aee92a69c2b698", "hash_cont_tokens": "8578b82c42cc7026" }, "truncated": 0, "non_truncated": 783, "padded": 3132, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:moral_disputes|5": { "hashes": { "hash_examples": "60cbd2baa3fea5c9", "hash_full_prompts": "009b7d0e7f819eff", "hash_input_tokens": "f24c046b105c5e03", "hash_cont_tokens": "26b0f808ec46464d" }, "truncated": 0, "non_truncated": 346, "padded": 1384, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:moral_scenarios|5": { "hashes": { "hash_examples": "fd8b0431fbdd75ef", "hash_full_prompts": "f6e63c9fb9d3bff0", "hash_input_tokens": "08eee0e3d8e89710", "hash_cont_tokens": "52fe77d28aefc1b3" }, "truncated": 0, "non_truncated": 895, "padded": 3580, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:nutrition|5": { "hashes": { "hash_examples": "71e55e2b829b6528", "hash_full_prompts": "8294d5e3ad435377", "hash_input_tokens": "5b2c6686c8fc5e83", "hash_cont_tokens": "25850a01b4a11b53" }, "truncated": 0, "non_truncated": 306, "padded": 1224, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:philosophy|5": { "hashes": { "hash_examples": "a6d489a8d208fa4b", "hash_full_prompts": "db68c0f4503e4793", "hash_input_tokens": "7108ad04b556854f", "hash_cont_tokens": "8c34ab2fa65c3b6e" }, "truncated": 0, "non_truncated": 311, "padded": 1244, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:prehistory|5": { "hashes": { "hash_examples": "6cc50f032a19acaa", "hash_full_prompts": "3972bcfa8c80e964", "hash_input_tokens": "65cb6b1efc71921b", "hash_cont_tokens": "89f21e5f9c7d81f2" }, "truncated": 0, "non_truncated": 324, "padded": 1296, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_accounting|5": { "hashes": { "hash_examples": "50f57ab32f5f6cea", "hash_full_prompts": "25f0becc2483bd32", "hash_input_tokens": "c1b1c1e1f1ca4a85", "hash_cont_tokens": "c7c4930a659ca843" }, "truncated": 0, "non_truncated": 282, "padded": 1120, "non_padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_law|5": { "hashes": { 
"hash_examples": "a8fdc85c64f4b215", "hash_full_prompts": "7a6f6c5706f00c7d", "hash_input_tokens": "e7517115da0204cd", "hash_cont_tokens": "6f36bd560ae36f02" }, "truncated": 0, "non_truncated": 1534, "padded": 6136, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_medicine|5": { "hashes": { "hash_examples": "c373a28a3050a73a", "hash_full_prompts": "a74b6ac7c5c545d2", "hash_input_tokens": "da6af6d03e682017", "hash_cont_tokens": "ca4398b4ad3db5f1" }, "truncated": 0, "non_truncated": 272, "padded": 1088, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:professional_psychology|5": { "hashes": { "hash_examples": "bf5254fe818356af", "hash_full_prompts": "c53fa139ec25f502", "hash_input_tokens": "c6dbaf3c7103ebe9", "hash_cont_tokens": "ce4bb75e80359fe4" }, "truncated": 0, "non_truncated": 612, "padded": 2448, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:public_relations|5": { "hashes": { "hash_examples": "b66d52e28e7d14e0", "hash_full_prompts": "55b5eff05aa6bf13", "hash_input_tokens": "deea75b6eec5b782", "hash_cont_tokens": "680235f5ede0b353" }, "truncated": 0, "non_truncated": 110, "padded": 440, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:security_studies|5": { "hashes": { "hash_examples": "514c14feaf000ad9", "hash_full_prompts": "6690ecdc054f7b0c", "hash_input_tokens": "deef3d39896aca43", "hash_cont_tokens": "189956efcec12818" }, "truncated": 0, "non_truncated": 245, "padded": 980, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:sociology|5": { "hashes": { "hash_examples": "f6c9bc9d18c80870", "hash_full_prompts": "945fbdd091c72d64", "hash_input_tokens": "330fffbccabf89e4", "hash_cont_tokens": "2178ff937c0c1a29" }, "truncated": 0, "non_truncated": 201, "padded": 804, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:us_foreign_policy|5": { "hashes": { "hash_examples": "ed7b78629db6678f", "hash_full_prompts": "ebba6ea6eca4ae53", "hash_input_tokens": "0ec87fa768a47632", "hash_cont_tokens": "a886b3552371a98b" }, "truncated": 0, "non_truncated": 100, "padded": 392, "non_padded": 8, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:virology|5": { "hashes": { "hash_examples": "bc52ffdc3f9b994a", "hash_full_prompts": "a2ee4984d6877fe3", "hash_input_tokens": "cc264818195d14da", "hash_cont_tokens": "ec5c187546c7c842" }, "truncated": 0, "non_truncated": 166, "padded": 660, "non_padded": 4, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|mmlu:world_religions|5": { "hashes": { "hash_examples": "ecdb4a4f94f62930", "hash_full_prompts": "a89c8dddd1d8ced0", "hash_input_tokens": "e7e781ba363743eb", "hash_cont_tokens": "e52b573046cdfc5c" }, "truncated": 0, "non_truncated": 171, "padded": 684, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|truthfulqa:mc|0": { "hashes": { "hash_examples": "36a6d90e75d92d4a", "hash_full_prompts": "8d9ca0a8bd458a1c", "hash_input_tokens": "4aad1a3bfe70acfc", "hash_cont_tokens": "b0f64f6659d8c230" }, "truncated": 0, "non_truncated": 817, "padded": 9996, "non_padded": 0, "effective_few_shots": 0.0, "num_truncated_few_shots": 0 }, "leaderboard|winogrande|5": { "hashes": { "hash_examples": "087d5d1a1afd4c7b", "hash_full_prompts": "35da55e47222e0e1", "hash_input_tokens": "881c630a9e0034f7", 
"hash_cont_tokens": "c466f4c92e3879cb" }, "truncated": 0, "non_truncated": 1267, "padded": 2534, "non_padded": 0, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 }, "leaderboard|gsm8k|5": { "hashes": { "hash_examples": "0ed016e24e7512fd", "hash_full_prompts": "f7ab209f6467841e", "hash_input_tokens": "deccfe61ad5cb3d5", "hash_cont_tokens": "95cc4cc1148eb790" }, "truncated": 1319, "non_truncated": 0, "padded": 1074, "non_padded": 245, "effective_few_shots": 5.0, "num_truncated_few_shots": 0 } }, "summary_general": { "hashes": { "hash_examples": "670666fa3a90ce5d", "hash_full_prompts": "56c005e427046302", "hash_input_tokens": "2a51da62c271a1a0", "hash_cont_tokens": "a74619de92c05f2e" }, "truncated": 1319, "non_truncated": 27340, "padded": 114540, "non_padded": 332, "num_truncated_few_shots": 0 } }