diff --git "a/eval_results.json" "b/eval_results.json" new file mode 100644--- /dev/null +++ "b/eval_results.json" @@ -0,0 +1,3210 @@ +[ + { + "results": { + "arc_challenge": { + "acc,none": 0.5981228668941979, + "acc_stderr,none": 0.014327268614578274, + "acc_norm,none": 0.6348122866894198, + "acc_norm_stderr,none": 0.0140702655192688, + "alias": "arc_challenge" + } + }, + "configs": { + "arc_challenge": { + "task": "arc_challenge", + "group": [ + "ai2_arc" + ], + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Challenge", + "training_split": "train", + "validation_split": "validation", + "test_split": "test", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{choices.label.index(answerKey)}}", + "doc_to_choice": "{{choices.text}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 25, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", + "metadata": { + "version": 1 + } + } + }, + "versions": { + "arc_challenge": "Yaml" + }, + "n-shot": { + "arc_challenge": 25 + }, + "config": { + "model": "vllm", + "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8", + "batch_size": "8", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "46c79664" + }, + { + "results": { + "gsm8k": { + "exact_match,get-answer": 0.5428354814253222, + "exact_match_stderr,get-answer": 0.01372184996870972, + "alias": "gsm8k" + } + }, + "configs": { + "gsm8k": { + "task": "gsm8k", + "group": [ + "math_word_problems" + ], + "dataset_path": "gsm8k", + "dataset_name": "main", + "training_split": "train", + "test_split": "test", + "fewshot_split": "train", + "doc_to_text": "Question: {{question}}\nAnswer:", + "doc_to_target": "{{answer}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "exact_match", + "aggregation": "mean", + "higher_is_better": true, + "ignore_case": true, + "ignore_punctuation": false, + "regexes_to_ignore": [ + ",", + "\\$", + "(?s).*#### " + ] + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n", + "Question:" + ], + "do_sample": false, + "temperature": 0 + }, + "repeats": 1, + "filter_list": [ + { + "name": "get-answer", + "filter": [ + { + "function": "regex", + "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" + }, + { + "function": "take_first" + } + ] + } + ], + "should_decontaminate": false, + "metadata": { + "version": 1 + } + } + }, + "versions": { + "gsm8k": "Yaml" + }, + "n-shot": { + "gsm8k": 5 + }, + "config": { + "model": "vllm", + "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8", + "batch_size": "8", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "46c79664" + }, + { + "results": { + "hellaswag": { + "acc,none": 0.6453893646683927, + "acc_stderr,none": 0.0047741745902051365, + "acc_norm,none": 0.8346942840071699, + "acc_norm_stderr,none": 0.0037069708564110683, + "alias": "hellaswag" + } + }, + "configs": { + "hellaswag": { + "task": "hellaswag", + "group": [ + "multiple_choice" + ], + "dataset_path": "hellaswag", + "training_split": "train", + "validation_split": "validation", + "process_docs": "", + "doc_to_text": "{{query}}", + "doc_to_target": "{{label}}", + "doc_to_choice": "choices", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 10, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "acc_norm", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 1 + } + } + }, + "versions": { + "hellaswag": "Yaml" + }, + "n-shot": { + "hellaswag": 10 + }, + "config": { + "model": "vllm", + "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8", + "batch_size": "8", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "46c79664" + }, + { + "results": { + "mmlu": { + "acc,none": 0.6147272468309357, + "acc_stderr,none": 0.1279904236098431, + "alias": "mmlu" + }, + "mmlu_humanities": { + "alias": " - humanities", + "acc,none": 0.5636556854410202, + "acc_stderr,none": 0.11857886643596054 + }, + "mmlu_formal_logic": { + "alias": " - formal_logic", + "acc,none": 0.3968253968253968, + "acc_stderr,none": 0.04375888492727061 + }, + "mmlu_high_school_european_history": { + "alias": " - high_school_european_history", + "acc,none": 0.7636363636363637, + "acc_stderr,none": 0.033175059300091805 + }, + "mmlu_high_school_us_history": { + "alias": " - high_school_us_history", + "acc,none": 0.8137254901960784, + "acc_stderr,none": 0.027325470966716312 + }, + "mmlu_high_school_world_history": { + "alias": " - high_school_world_history", + "acc,none": 0.7721518987341772, + "acc_stderr,none": 0.02730348459906943 + }, + "mmlu_international_law": { + "alias": " - international_law", + "acc,none": 0.7768595041322314, + "acc_stderr,none": 0.03800754475228733 + }, + "mmlu_jurisprudence": { + "alias": " - jurisprudence", + "acc,none": 0.7962962962962963, + "acc_stderr,none": 0.03893542518824847 + }, + "mmlu_logical_fallacies": { + "alias": " - logical_fallacies", + "acc,none": 0.7055214723926381, + "acc_stderr,none": 0.03581165790474082 + }, + "mmlu_moral_disputes": { + "alias": " - moral_disputes", + "acc,none": 0.6994219653179191, + "acc_stderr,none": 0.024685316867257806 + }, + "mmlu_moral_scenarios": { + "alias": " - moral_scenarios", + "acc,none": 0.36089385474860336, + "acc_stderr,none": 0.01606229067111046 + }, + "mmlu_philosophy": { + "alias": " - philosophy", + "acc,none": 0.7041800643086816, + "acc_stderr,none": 0.025922371788818788 + }, + "mmlu_prehistory": { + "alias": " - prehistory", + "acc,none": 0.6975308641975309, + "acc_stderr,none": 0.02555765398186805 + }, + "mmlu_professional_law": { + "alias": " - professional_law", + "acc,none": 0.44198174706649285, + "acc_stderr,none": 0.012683972513598806 + }, + "mmlu_world_religions": { + "alias": " - world_religions", + "acc,none": 0.8421052631578947, + "acc_stderr,none": 0.02796678585916087 + }, + "mmlu_other": { + "alias": " - other", + "acc,none": 0.683617637592533, + "acc_stderr,none": 0.10929719513421464 + }, + "mmlu_business_ethics": { + "alias": " - business_ethics", + "acc,none": 0.58, + "acc_stderr,none": 0.049604496374885836 + }, + "mmlu_clinical_knowledge": { + "alias": " - clinical_knowledge", + "acc,none": 0.6867924528301886, + "acc_stderr,none": 0.028544793319055326 + }, + "mmlu_college_medicine": { + "alias": " - college_medicine", + "acc,none": 0.5953757225433526, + "acc_stderr,none": 0.03742461193887249 + }, + "mmlu_global_facts": { + "alias": " - global_facts", + "acc,none": 0.38, + "acc_stderr,none": 0.04878317312145632 + }, + "mmlu_human_aging": { + "alias": " - human_aging", + "acc,none": 0.6771300448430493, + "acc_stderr,none": 0.031381476375755 + }, + "mmlu_management": { + "alias": " - management", + "acc,none": 0.7961165048543689, + "acc_stderr,none": 0.0398913985953177 + }, + "mmlu_marketing": { + "alias": " - marketing", + "acc,none": 0.8547008547008547, + "acc_stderr,none": 0.023086635086841403 + }, + "mmlu_medical_genetics": { + "alias": " - medical_genetics", + "acc,none": 0.68, + "acc_stderr,none": 0.04688261722621504 + }, + "mmlu_miscellaneous": { + "alias": " - miscellaneous", + "acc,none": 0.8084291187739464, + "acc_stderr,none": 0.014072859310451945 + }, + "mmlu_nutrition": { + "alias": " - nutrition", + "acc,none": 0.7156862745098039, + "acc_stderr,none": 0.025829163272757465 + }, + "mmlu_professional_accounting": { + "alias": " - professional_accounting", + "acc,none": 0.4716312056737589, + "acc_stderr,none": 0.029779450957303055 + }, + "mmlu_professional_medicine": { + "alias": " - professional_medicine", + "acc,none": 0.625, + "acc_stderr,none": 0.029408372932278746 + }, + "mmlu_virology": { + "alias": " - virology", + "acc,none": 0.5240963855421686, + "acc_stderr,none": 0.038879718495972646 + }, + "mmlu_social_sciences": { + "alias": " - social_sciences", + "acc,none": 0.726356841078973, + "acc_stderr,none": 0.07276844314615243 + }, + "mmlu_econometrics": { + "alias": " - econometrics", + "acc,none": 0.45614035087719296, + "acc_stderr,none": 0.04685473041907789 + }, + "mmlu_high_school_geography": { + "alias": " - high_school_geography", + "acc,none": 0.7727272727272727, + "acc_stderr,none": 0.029857515673386396 + }, + "mmlu_high_school_government_and_politics": { + "alias": " - high_school_government_and_politics", + "acc,none": 0.8808290155440415, + "acc_stderr,none": 0.023381935348121417 + }, + "mmlu_high_school_macroeconomics": { + "alias": " - high_school_macroeconomics", + "acc,none": 0.6307692307692307, + "acc_stderr,none": 0.024468615241478926 + }, + "mmlu_high_school_microeconomics": { + "alias": " - high_school_microeconomics", + "acc,none": 0.6512605042016807, + "acc_stderr,none": 0.030956636328566548 + }, + "mmlu_high_school_psychology": { + "alias": " - high_school_psychology", + "acc,none": 0.8293577981651377, + "acc_stderr,none": 0.016129271025099888 + }, + "mmlu_human_sexuality": { + "alias": " - human_sexuality", + "acc,none": 0.7480916030534351, + "acc_stderr,none": 0.038073871163060866 + }, + "mmlu_professional_psychology": { + "alias": " - professional_psychology", + "acc,none": 0.6535947712418301, + "acc_stderr,none": 0.01924978569171721 + }, + "mmlu_public_relations": { + "alias": " - public_relations", + "acc,none": 0.6818181818181818, + "acc_stderr,none": 0.044612721759105085 + }, + "mmlu_security_studies": { + "alias": " - security_studies", + "acc,none": 0.7306122448979592, + "acc_stderr,none": 0.02840125202902294 + }, + "mmlu_sociology": { + "alias": " - sociology", + "acc,none": 0.835820895522388, + "acc_stderr,none": 0.026193923544454156 + }, + "mmlu_us_foreign_policy": { + "alias": " - us_foreign_policy", + "acc,none": 0.87, + "acc_stderr,none": 0.033799766898963086 + }, + "mmlu_stem": { + "alias": " - stem", + "acc,none": 0.5141135426577862, + "acc_stderr,none": 0.13904081800567097 + }, + "mmlu_abstract_algebra": { + "alias": " - abstract_algebra", + "acc,none": 0.34, + "acc_stderr,none": 0.04760952285695235 + }, + "mmlu_anatomy": { + "alias": " - anatomy", + "acc,none": 0.6296296296296297, + "acc_stderr,none": 0.041716541613545426 + }, + "mmlu_astronomy": { + "alias": " - astronomy", + "acc,none": 0.7105263157894737, + "acc_stderr,none": 0.036906779861372814 + }, + "mmlu_college_biology": { + "alias": " - college_biology", + "acc,none": 0.7361111111111112, + "acc_stderr,none": 0.03685651095897532 + }, + "mmlu_college_chemistry": { + "alias": " - college_chemistry", + "acc,none": 0.42, + "acc_stderr,none": 0.049604496374885836 + }, + "mmlu_college_computer_science": { + "alias": " - college_computer_science", + "acc,none": 0.54, + "acc_stderr,none": 0.05009082659620332 + }, + "mmlu_college_mathematics": { + "alias": " - college_mathematics", + "acc,none": 0.39, + "acc_stderr,none": 0.04902071300001975 + }, + "mmlu_college_physics": { + "alias": " - college_physics", + "acc,none": 0.3431372549019608, + "acc_stderr,none": 0.04724007352383888 + }, + "mmlu_computer_security": { + "alias": " - computer_security", + "acc,none": 0.76, + "acc_stderr,none": 0.042923469599092816 + }, + "mmlu_conceptual_physics": { + "alias": " - conceptual_physics", + "acc,none": 0.5276595744680851, + "acc_stderr,none": 0.03263597118409769 + }, + "mmlu_electrical_engineering": { + "alias": " - electrical_engineering", + "acc,none": 0.5793103448275863, + "acc_stderr,none": 0.04113914981189261 + }, + "mmlu_elementary_mathematics": { + "alias": " - elementary_mathematics", + "acc,none": 0.38095238095238093, + "acc_stderr,none": 0.025010749116137595 + }, + "mmlu_high_school_biology": { + "alias": " - high_school_biology", + "acc,none": 0.7612903225806451, + "acc_stderr,none": 0.024251071262208834 + }, + "mmlu_high_school_chemistry": { + "alias": " - high_school_chemistry", + "acc,none": 0.47783251231527096, + "acc_stderr,none": 0.03514528562175007 + }, + "mmlu_high_school_computer_science": { + "alias": " - high_school_computer_science", + "acc,none": 0.71, + "acc_stderr,none": 0.045604802157206845 + }, + "mmlu_high_school_mathematics": { + "alias": " - high_school_mathematics", + "acc,none": 0.3, + "acc_stderr,none": 0.02794045713622842 + }, + "mmlu_high_school_physics": { + "alias": " - high_school_physics", + "acc,none": 0.2913907284768212, + "acc_stderr,none": 0.03710185726119995 + }, + "mmlu_high_school_statistics": { + "alias": " - high_school_statistics", + "acc,none": 0.5, + "acc_stderr,none": 0.034099716973523674 + }, + "mmlu_machine_learning": { + "alias": " - machine_learning", + "acc,none": 0.4732142857142857, + "acc_stderr,none": 0.047389751192741546 + } + }, + "groups": { + "mmlu": { + "acc,none": 0.6147272468309357, + "acc_stderr,none": 0.1279904236098431, + "alias": "mmlu" + }, + "mmlu_humanities": { + "alias": " - humanities", + "acc,none": 0.5636556854410202, + "acc_stderr,none": 0.11857886643596054 + }, + "mmlu_other": { + "alias": " - other", + "acc,none": 0.683617637592533, + "acc_stderr,none": 0.10929719513421464 + }, + "mmlu_social_sciences": { + "alias": " - social_sciences", + "acc,none": 0.726356841078973, + "acc_stderr,none": 0.07276844314615243 + }, + "mmlu_stem": { + "alias": " - stem", + "acc,none": 0.5141135426577862, + "acc_stderr,none": 0.13904081800567097 + } + }, + "configs": { + "mmlu_abstract_algebra": { + "task": "mmlu_abstract_algebra", + "task_alias": "abstract_algebra", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "abstract_algebra", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_anatomy": { + "task": "mmlu_anatomy", + "task_alias": "anatomy", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "anatomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_astronomy": { + "task": "mmlu_astronomy", + "task_alias": "astronomy", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "astronomy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_business_ethics": { + "task": "mmlu_business_ethics", + "task_alias": "business_ethics", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "business_ethics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_clinical_knowledge": { + "task": "mmlu_clinical_knowledge", + "task_alias": "clinical_knowledge", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "clinical_knowledge", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_college_biology": { + "task": "mmlu_college_biology", + "task_alias": "college_biology", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "college_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_college_chemistry": { + "task": "mmlu_college_chemistry", + "task_alias": "college_chemistry", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "college_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_college_computer_science": { + "task": "mmlu_college_computer_science", + "task_alias": "college_computer_science", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "college_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_college_mathematics": { + "task": "mmlu_college_mathematics", + "task_alias": "college_mathematics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "college_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_college_medicine": { + "task": "mmlu_college_medicine", + "task_alias": "college_medicine", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "college_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_college_physics": { + "task": "mmlu_college_physics", + "task_alias": "college_physics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "college_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about college physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_computer_security": { + "task": "mmlu_computer_security", + "task_alias": "computer_security", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "computer_security", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about computer security.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_conceptual_physics": { + "task": "mmlu_conceptual_physics", + "task_alias": "conceptual_physics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "conceptual_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_econometrics": { + "task": "mmlu_econometrics", + "task_alias": "econometrics", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "econometrics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_electrical_engineering": { + "task": "mmlu_electrical_engineering", + "task_alias": "electrical_engineering", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "electrical_engineering", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_elementary_mathematics": { + "task": "mmlu_elementary_mathematics", + "task_alias": "elementary_mathematics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "elementary_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_formal_logic": { + "task": "mmlu_formal_logic", + "task_alias": "formal_logic", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "formal_logic", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_global_facts": { + "task": "mmlu_global_facts", + "task_alias": "global_facts", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "global_facts", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about global facts.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_biology": { + "task": "mmlu_high_school_biology", + "task_alias": "high_school_biology", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_biology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_chemistry": { + "task": "mmlu_high_school_chemistry", + "task_alias": "high_school_chemistry", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_chemistry", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_computer_science": { + "task": "mmlu_high_school_computer_science", + "task_alias": "high_school_computer_science", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_computer_science", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_european_history": { + "task": "mmlu_high_school_european_history", + "task_alias": "high_school_european_history", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_european_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_geography": { + "task": "mmlu_high_school_geography", + "task_alias": "high_school_geography", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_geography", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_government_and_politics": { + "task": "mmlu_high_school_government_and_politics", + "task_alias": "high_school_government_and_politics", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_government_and_politics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_macroeconomics": { + "task": "mmlu_high_school_macroeconomics", + "task_alias": "high_school_macroeconomics", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_macroeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_mathematics": { + "task": "mmlu_high_school_mathematics", + "task_alias": "high_school_mathematics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_mathematics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_microeconomics": { + "task": "mmlu_high_school_microeconomics", + "task_alias": "high_school_microeconomics", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_microeconomics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_physics": { + "task": "mmlu_high_school_physics", + "task_alias": "high_school_physics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_physics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_psychology": { + "task": "mmlu_high_school_psychology", + "task_alias": "high_school_psychology", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_statistics": { + "task": "mmlu_high_school_statistics", + "task_alias": "high_school_statistics", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_statistics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_us_history": { + "task": "mmlu_high_school_us_history", + "task_alias": "high_school_us_history", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_us_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_high_school_world_history": { + "task": "mmlu_high_school_world_history", + "task_alias": "high_school_world_history", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "high_school_world_history", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_human_aging": { + "task": "mmlu_human_aging", + "task_alias": "human_aging", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "human_aging", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human aging.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_human_sexuality": { + "task": "mmlu_human_sexuality", + "task_alias": "human_sexuality", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "human_sexuality", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_international_law": { + "task": "mmlu_international_law", + "task_alias": "international_law", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "international_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about international law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_jurisprudence": { + "task": "mmlu_jurisprudence", + "task_alias": "jurisprudence", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "jurisprudence", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_logical_fallacies": { + "task": "mmlu_logical_fallacies", + "task_alias": "logical_fallacies", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "logical_fallacies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_machine_learning": { + "task": "mmlu_machine_learning", + "task_alias": "machine_learning", + "group": "mmlu_stem", + "group_alias": "stem", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "machine_learning", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_management": { + "task": "mmlu_management", + "task_alias": "management", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "management", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about management.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_marketing": { + "task": "mmlu_marketing", + "task_alias": "marketing", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "marketing", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about marketing.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_medical_genetics": { + "task": "mmlu_medical_genetics", + "task_alias": "medical_genetics", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "medical_genetics", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_miscellaneous": { + "task": "mmlu_miscellaneous", + "task_alias": "miscellaneous", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "miscellaneous", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_moral_disputes": { + "task": "mmlu_moral_disputes", + "task_alias": "moral_disputes", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "moral_disputes", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_moral_scenarios": { + "task": "mmlu_moral_scenarios", + "task_alias": "moral_scenarios", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "moral_scenarios", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_nutrition": { + "task": "mmlu_nutrition", + "task_alias": "nutrition", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "nutrition", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_philosophy": { + "task": "mmlu_philosophy", + "task_alias": "philosophy", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "philosophy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_prehistory": { + "task": "mmlu_prehistory", + "task_alias": "prehistory", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "prehistory", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_professional_accounting": { + "task": "mmlu_professional_accounting", + "task_alias": "professional_accounting", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "professional_accounting", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_professional_law": { + "task": "mmlu_professional_law", + "task_alias": "professional_law", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "professional_law", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional law.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_professional_medicine": { + "task": "mmlu_professional_medicine", + "task_alias": "professional_medicine", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "professional_medicine", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_professional_psychology": { + "task": "mmlu_professional_psychology", + "task_alias": "professional_psychology", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "professional_psychology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_public_relations": { + "task": "mmlu_public_relations", + "task_alias": "public_relations", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "public_relations", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about public relations.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_security_studies": { + "task": "mmlu_security_studies", + "task_alias": "security_studies", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "security_studies", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about security studies.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_sociology": { + "task": "mmlu_sociology", + "task_alias": "sociology", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "sociology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about sociology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_us_foreign_policy": { + "task": "mmlu_us_foreign_policy", + "task_alias": "us_foreign_policy", + "group": "mmlu_social_sciences", + "group_alias": "social_sciences", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "us_foreign_policy", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_virology": { + "task": "mmlu_virology", + "task_alias": "virology", + "group": "mmlu_other", + "group_alias": "other", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "virology", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about virology.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + }, + "mmlu_world_religions": { + "task": "mmlu_world_religions", + "task_alias": "world_religions", + "group": "mmlu_humanities", + "group_alias": "humanities", + "dataset_path": "hails/mmlu_no_train", + "dataset_name": "world_religions", + "test_split": "test", + "fewshot_split": "dev", + "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", + "doc_to_target": "answer", + "doc_to_choice": [ + "A", + "B", + "C", + "D" + ], + "description": "The following are multiple choice questions (with answers) about world religions.\n\n", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "fewshot_config": { + "sampler": "first_n" + }, + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": false, + "metadata": { + "version": 0 + } + } + }, + "versions": { + "mmlu": "N/A", + "mmlu_abstract_algebra": "Yaml", + "mmlu_anatomy": "Yaml", + "mmlu_astronomy": "Yaml", + "mmlu_business_ethics": "Yaml", + "mmlu_clinical_knowledge": "Yaml", + "mmlu_college_biology": "Yaml", + "mmlu_college_chemistry": "Yaml", + "mmlu_college_computer_science": "Yaml", + "mmlu_college_mathematics": "Yaml", + "mmlu_college_medicine": "Yaml", + "mmlu_college_physics": "Yaml", + "mmlu_computer_security": "Yaml", + "mmlu_conceptual_physics": "Yaml", + "mmlu_econometrics": "Yaml", + "mmlu_electrical_engineering": "Yaml", + "mmlu_elementary_mathematics": "Yaml", + "mmlu_formal_logic": "Yaml", + "mmlu_global_facts": "Yaml", + "mmlu_high_school_biology": "Yaml", + "mmlu_high_school_chemistry": "Yaml", + "mmlu_high_school_computer_science": "Yaml", + "mmlu_high_school_european_history": "Yaml", + "mmlu_high_school_geography": "Yaml", + "mmlu_high_school_government_and_politics": "Yaml", + "mmlu_high_school_macroeconomics": "Yaml", + "mmlu_high_school_mathematics": "Yaml", + "mmlu_high_school_microeconomics": "Yaml", + "mmlu_high_school_physics": "Yaml", + "mmlu_high_school_psychology": "Yaml", + "mmlu_high_school_statistics": "Yaml", + "mmlu_high_school_us_history": "Yaml", + "mmlu_high_school_world_history": "Yaml", + "mmlu_human_aging": "Yaml", + "mmlu_human_sexuality": "Yaml", + "mmlu_humanities": "N/A", + "mmlu_international_law": "Yaml", + "mmlu_jurisprudence": "Yaml", + "mmlu_logical_fallacies": "Yaml", + "mmlu_machine_learning": "Yaml", + "mmlu_management": "Yaml", + "mmlu_marketing": "Yaml", + "mmlu_medical_genetics": "Yaml", + "mmlu_miscellaneous": "Yaml", + "mmlu_moral_disputes": "Yaml", + "mmlu_moral_scenarios": "Yaml", + "mmlu_nutrition": "Yaml", + "mmlu_other": "N/A", + "mmlu_philosophy": "Yaml", + "mmlu_prehistory": "Yaml", + "mmlu_professional_accounting": "Yaml", + "mmlu_professional_law": "Yaml", + "mmlu_professional_medicine": "Yaml", + "mmlu_professional_psychology": "Yaml", + "mmlu_public_relations": "Yaml", + "mmlu_security_studies": "Yaml", + "mmlu_social_sciences": "N/A", + "mmlu_sociology": "Yaml", + "mmlu_stem": "N/A", + "mmlu_us_foreign_policy": "Yaml", + "mmlu_virology": "Yaml", + "mmlu_world_religions": "Yaml" + }, + "n-shot": { + "mmlu": 0, + "mmlu_abstract_algebra": 5, + "mmlu_anatomy": 5, + "mmlu_astronomy": 5, + "mmlu_business_ethics": 5, + "mmlu_clinical_knowledge": 5, + "mmlu_college_biology": 5, + "mmlu_college_chemistry": 5, + "mmlu_college_computer_science": 5, + "mmlu_college_mathematics": 5, + "mmlu_college_medicine": 5, + "mmlu_college_physics": 5, + "mmlu_computer_security": 5, + "mmlu_conceptual_physics": 5, + "mmlu_econometrics": 5, + "mmlu_electrical_engineering": 5, + "mmlu_elementary_mathematics": 5, + "mmlu_formal_logic": 5, + "mmlu_global_facts": 5, + "mmlu_high_school_biology": 5, + "mmlu_high_school_chemistry": 5, + "mmlu_high_school_computer_science": 5, + "mmlu_high_school_european_history": 5, + "mmlu_high_school_geography": 5, + "mmlu_high_school_government_and_politics": 5, + "mmlu_high_school_macroeconomics": 5, + "mmlu_high_school_mathematics": 5, + "mmlu_high_school_microeconomics": 5, + "mmlu_high_school_physics": 5, + "mmlu_high_school_psychology": 5, + "mmlu_high_school_statistics": 5, + "mmlu_high_school_us_history": 5, + "mmlu_high_school_world_history": 5, + "mmlu_human_aging": 5, + "mmlu_human_sexuality": 5, + "mmlu_humanities": 5, + "mmlu_international_law": 5, + "mmlu_jurisprudence": 5, + "mmlu_logical_fallacies": 5, + "mmlu_machine_learning": 5, + "mmlu_management": 5, + "mmlu_marketing": 5, + "mmlu_medical_genetics": 5, + "mmlu_miscellaneous": 5, + "mmlu_moral_disputes": 5, + "mmlu_moral_scenarios": 5, + "mmlu_nutrition": 5, + "mmlu_other": 5, + "mmlu_philosophy": 5, + "mmlu_prehistory": 5, + "mmlu_professional_accounting": 5, + "mmlu_professional_law": 5, + "mmlu_professional_medicine": 5, + "mmlu_professional_psychology": 5, + "mmlu_public_relations": 5, + "mmlu_security_studies": 5, + "mmlu_social_sciences": 5, + "mmlu_sociology": 5, + "mmlu_stem": 5, + "mmlu_us_foreign_policy": 5, + "mmlu_virology": 5, + "mmlu_world_religions": 5 + }, + "config": { + "model": "vllm", + "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8", + "batch_size": "8", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "46c79664" + }, + { + "results": { + "truthfulqa": { + "bleu_max,none": 14.638837106843457, + "bleu_max_stderr,none": 0.34362387539987554, + "bleu_acc,none": 0.4394124847001224, + "bleu_acc_stderr,none": 0.00030187396199728844, + "bleu_diff,none": 0.24542274719797535, + "bleu_diff_stderr,none": 0.2063430019358257, + "rouge1_max,none": 38.65517994793549, + "rouge1_max_stderr,none": 0.5858540895997846, + "rouge1_acc,none": 0.46511627906976744, + "rouge1_acc_stderr,none": 0.0003048812818799781, + "rouge1_diff,none": -0.2338327660242177, + "rouge1_diff_stderr,none": 0.3738211153435194, + "rouge2_max,none": 24.05741215628304, + "rouge2_max_stderr,none": 0.6481300865775006, + "rouge2_acc,none": 0.34149326805385555, + "rouge2_acc_stderr,none": 0.0002755828626565652, + "rouge2_diff,none": -0.26964949017748546, + "rouge2_diff_stderr,none": 0.46049881176190666, + "rougeL_max,none": 34.96464478044315, + "rougeL_max_stderr,none": 0.5902919473860816, + "rougeL_acc,none": 0.4418604651162791, + "rougeL_acc_stderr,none": 0.00030223014029841656, + "rougeL_diff,none": -0.4989046166400256, + "rougeL_diff_stderr,none": 0.38675419798108396, + "acc,none": 0.4369849253215213, + "acc_stderr,none": 0.05670747643267544, + "alias": "truthfulqa" + }, + "truthfulqa_gen": { + "bleu_max,none": 14.638837106843457, + "bleu_max_stderr,none": 0.5861944006896309, + "bleu_acc,none": 0.4394124847001224, + "bleu_acc_stderr,none": 0.01737452048251371, + "bleu_diff,none": 0.24542274719797535, + "bleu_diff_stderr,none": 0.45424993333607183, + "rouge1_max,none": 38.65517994793549, + "rouge1_max_stderr,none": 0.7654110592353527, + "rouge1_acc,none": 0.46511627906976744, + "rouge1_acc_stderr,none": 0.017460849975873972, + "rouge1_diff,none": -0.2338327660242177, + "rouge1_diff_stderr,none": 0.6114091227185929, + "rouge2_max,none": 24.05741215628304, + "rouge2_max_stderr,none": 0.8050652685201993, + "rouge2_acc,none": 0.34149326805385555, + "rouge2_acc_stderr,none": 0.016600688619950836, + "rouge2_diff,none": -0.26964949017748546, + "rouge2_diff_stderr,none": 0.6786006275873215, + "rougeL_max,none": 34.96464478044315, + "rougeL_max_stderr,none": 0.7683045928445837, + "rougeL_acc,none": 0.4418604651162791, + "rougeL_acc_stderr,none": 0.01738476747898621, + "rougeL_diff,none": -0.4989046166400256, + "rougeL_diff_stderr,none": 0.6218956487877078, + "alias": " - truthfulqa_gen" + }, + "truthfulqa_mc1": { + "acc,none": 0.379436964504284, + "acc_stderr,none": 0.01698703926614297, + "alias": " - truthfulqa_mc1" + }, + "truthfulqa_mc2": { + "acc,none": 0.552080846955996, + "acc_stderr,none": 0.015343375525702328, + "alias": " - truthfulqa_mc2" + } + }, + "groups": { + "truthfulqa": { + "bleu_max,none": 14.638837106843457, + "bleu_max_stderr,none": 0.34362387539987554, + "bleu_acc,none": 0.4394124847001224, + "bleu_acc_stderr,none": 0.00030187396199728844, + "bleu_diff,none": 0.24542274719797535, + "bleu_diff_stderr,none": 0.2063430019358257, + "rouge1_max,none": 38.65517994793549, + "rouge1_max_stderr,none": 0.5858540895997846, + "rouge1_acc,none": 0.46511627906976744, + "rouge1_acc_stderr,none": 0.0003048812818799781, + "rouge1_diff,none": -0.2338327660242177, + "rouge1_diff_stderr,none": 0.3738211153435194, + "rouge2_max,none": 24.05741215628304, + "rouge2_max_stderr,none": 0.6481300865775006, + "rouge2_acc,none": 0.34149326805385555, + "rouge2_acc_stderr,none": 0.0002755828626565652, + "rouge2_diff,none": -0.26964949017748546, + "rouge2_diff_stderr,none": 0.46049881176190666, + "rougeL_max,none": 34.96464478044315, + "rougeL_max_stderr,none": 0.5902919473860816, + "rougeL_acc,none": 0.4418604651162791, + "rougeL_acc_stderr,none": 0.00030223014029841656, + "rougeL_diff,none": -0.4989046166400256, + "rougeL_diff_stderr,none": 0.38675419798108396, + "acc,none": 0.4369849253215213, + "acc_stderr,none": 0.05670747643267544, + "alias": "truthfulqa" + } + }, + "configs": { + "truthfulqa_gen": { + "task": "truthfulqa_gen", + "group": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "generation", + "validation_split": "validation", + "process_docs": "", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}", + "doc_to_target": " ", + "process_results": "", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "bleu_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "bleu_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge1_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rouge2_diff", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_max", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_acc", + "aggregation": "mean", + "higher_is_better": true + }, + { + "metric": "rougeL_diff", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "generate_until", + "generation_kwargs": { + "until": [ + "\n\n" + ], + "do_sample": false + }, + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2 + } + }, + "truthfulqa_mc1": { + "task": "truthfulqa_mc1", + "group": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "doc_to_choice": "{{mc1_targets.choices}}", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2 + } + }, + "truthfulqa_mc2": { + "task": "truthfulqa_mc2", + "group": [ + "truthfulqa" + ], + "dataset_path": "truthful_qa", + "dataset_name": "multiple_choice", + "validation_split": "validation", + "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", + "doc_to_target": 0, + "doc_to_choice": "{{mc2_targets.choices}}", + "process_results": "", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 0, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "question", + "metadata": { + "version": 2 + } + } + }, + "versions": { + "truthfulqa": "N/A", + "truthfulqa_gen": "Yaml", + "truthfulqa_mc1": "Yaml", + "truthfulqa_mc2": "Yaml" + }, + "n-shot": { + "truthfulqa": 0, + "truthfulqa_gen": 0, + "truthfulqa_mc1": 0, + "truthfulqa_mc2": 0 + }, + "config": { + "model": "vllm", + "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8", + "batch_size": "8", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "46c79664" + }, + { + "results": { + "winogrande": { + "acc,none": 0.739542225730071, + "acc_stderr,none": 0.012334833671998292, + "alias": "winogrande" + } + }, + "configs": { + "winogrande": { + "task": "winogrande", + "dataset_path": "winogrande", + "dataset_name": "winogrande_xl", + "training_split": "train", + "validation_split": "validation", + "doc_to_text": "", + "doc_to_target": "", + "doc_to_choice": "", + "description": "", + "target_delimiter": " ", + "fewshot_delimiter": "\n\n", + "num_fewshot": 5, + "metric_list": [ + { + "metric": "acc", + "aggregation": "mean", + "higher_is_better": true + } + ], + "output_type": "multiple_choice", + "repeats": 1, + "should_decontaminate": true, + "doc_to_decontamination_query": "sentence", + "metadata": { + "version": 1 + } + } + }, + "versions": { + "winogrande": "Yaml" + }, + "n-shot": { + "winogrande": 5 + }, + "config": { + "model": "vllm", + "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8", + "batch_size": "8", + "batch_sizes": [], + "device": null, + "use_cache": null, + "limit": null, + "bootstrap_iters": 100000, + "gen_kwargs": null + }, + "git_hash": "46c79664" + } +]