diff --git "a/eval_results.json" "b/eval_results.json"
new file mode 100644--- /dev/null
+++ "b/eval_results.json"
@@ -0,0 +1,3210 @@
+[
+  {
+    "results": {
+      "arc_challenge": {
+        "acc,none": 0.5981228668941979,
+        "acc_stderr,none": 0.014327268614578274,
+        "acc_norm,none": 0.6348122866894198,
+        "acc_norm_stderr,none": 0.0140702655192688,
+        "alias": "arc_challenge"
+      }
+    },
+    "configs": {
+      "arc_challenge": {
+        "task": "arc_challenge",
+        "group": [
+          "ai2_arc"
+        ],
+        "dataset_path": "ai2_arc",
+        "dataset_name": "ARC-Challenge",
+        "training_split": "train",
+        "validation_split": "validation",
+        "test_split": "test",
+        "doc_to_text": "Question: {{question}}\nAnswer:",
+        "doc_to_target": "{{choices.label.index(answerKey)}}",
+        "doc_to_choice": "{{choices.text}}",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 25,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "acc_norm",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": true,
+        "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
+        "metadata": {
+          "version": 1
+        }
+      }
+    },
+    "versions": {
+      "arc_challenge": "Yaml"
+    },
+    "n-shot": {
+      "arc_challenge": 25
+    },
+    "config": {
+      "model": "vllm",
+      "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
+      "batch_size": "8",
+      "batch_sizes": [],
+      "device": null,
+      "use_cache": null,
+      "limit": null,
+      "bootstrap_iters": 100000,
+      "gen_kwargs": null
+    },
+    "git_hash": "46c79664"
+  },
+  {
+    "results": {
+      "gsm8k": {
+        "exact_match,get-answer": 0.5428354814253222,
+        "exact_match_stderr,get-answer": 0.01372184996870972,
+        "alias": "gsm8k"
+      }
+    },
+    "configs": {
+      "gsm8k": {
+        "task": "gsm8k",
+        "group": [
+          "math_word_problems"
+        ],
+        "dataset_path": "gsm8k",
+        "dataset_name": "main",
+        "training_split": "train",
+        "test_split": "test",
+        "fewshot_split": "train",
+        "doc_to_text": "Question: {{question}}\nAnswer:",
+        "doc_to_target": "{{answer}}",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "exact_match",
+            "aggregation": "mean",
+            "higher_is_better": true,
+            "ignore_case": true,
+            "ignore_punctuation": false,
+            "regexes_to_ignore": [
+              ",",
+              "\\$",
+              "(?s).*#### "
+            ]
+          }
+        ],
+        "output_type": "generate_until",
+        "generation_kwargs": {
+          "until": [
+            "\n\n",
+            "Question:"
+          ],
+          "do_sample": false,
+          "temperature": 0
+        },
+        "repeats": 1,
+        "filter_list": [
+          {
+            "name": "get-answer",
+            "filter": [
+              {
+                "function": "regex",
+                "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
+              },
+              {
+                "function": "take_first"
+              }
+            ]
+          }
+        ],
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 1
+        }
+      }
+    },
+    "versions": {
+      "gsm8k": "Yaml"
+    },
+    "n-shot": {
+      "gsm8k": 5
+    },
+    "config": {
+      "model": "vllm",
+      "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
+      "batch_size": "8",
+      "batch_sizes": [],
+      "device": null,
+      "use_cache": null,
+      "limit": null,
+      "bootstrap_iters": 100000,
+      "gen_kwargs": null
+    },
+    "git_hash": "46c79664"
+  },
+  {
+    "results": {
+      "hellaswag": {
+        "acc,none": 0.6453893646683927,
+        "acc_stderr,none": 0.0047741745902051365,
+        "acc_norm,none": 0.8346942840071699,
+        "acc_norm_stderr,none": 0.0037069708564110683,
+        "alias": "hellaswag"
+      }
+    },
+    "configs": {
+      "hellaswag": {
+        "task": "hellaswag",
+        "group": [
+          "multiple_choice"
+        ],
+        "dataset_path": "hellaswag",
+        "training_split": "train",
+        "validation_split": "validation",
+        "process_docs": "<function process_docs at 0x7f1c8e64af20>",
+        "doc_to_text": "{{query}}",
+        "doc_to_target": "{{label}}",
+        "doc_to_choice": "choices",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 10,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "acc_norm",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 1
+        }
+      }
+    },
+    "versions": {
+      "hellaswag": "Yaml"
+    },
+    "n-shot": {
+      "hellaswag": 10
+    },
+    "config": {
+      "model": "vllm",
+      "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
+      "batch_size": "8",
+      "batch_sizes": [],
+      "device": null,
+      "use_cache": null,
+      "limit": null,
+      "bootstrap_iters": 100000,
+      "gen_kwargs": null
+    },
+    "git_hash": "46c79664"
+  },
+  {
+    "results": {
+      "mmlu": {
+        "acc,none": 0.6147272468309357,
+        "acc_stderr,none": 0.1279904236098431,
+        "alias": "mmlu"
+      },
+      "mmlu_humanities": {
+        "alias": " - humanities",
+        "acc,none": 0.5636556854410202,
+        "acc_stderr,none": 0.11857886643596054
+      },
+      "mmlu_formal_logic": {
+        "alias": "  - formal_logic",
+        "acc,none": 0.3968253968253968,
+        "acc_stderr,none": 0.04375888492727061
+      },
+      "mmlu_high_school_european_history": {
+        "alias": "  - high_school_european_history",
+        "acc,none": 0.7636363636363637,
+        "acc_stderr,none": 0.033175059300091805
+      },
+      "mmlu_high_school_us_history": {
+        "alias": "  - high_school_us_history",
+        "acc,none": 0.8137254901960784,
+        "acc_stderr,none": 0.027325470966716312
+      },
+      "mmlu_high_school_world_history": {
+        "alias": "  - high_school_world_history",
+        "acc,none": 0.7721518987341772,
+        "acc_stderr,none": 0.02730348459906943
+      },
+      "mmlu_international_law": {
+        "alias": "  - international_law",
+        "acc,none": 0.7768595041322314,
+        "acc_stderr,none": 0.03800754475228733
+      },
+      "mmlu_jurisprudence": {
+        "alias": "  - jurisprudence",
+        "acc,none": 0.7962962962962963,
+        "acc_stderr,none": 0.03893542518824847
+      },
+      "mmlu_logical_fallacies": {
+        "alias": "  - logical_fallacies",
+        "acc,none": 0.7055214723926381,
+        "acc_stderr,none": 0.03581165790474082
+      },
+      "mmlu_moral_disputes": {
+        "alias": "  - moral_disputes",
+        "acc,none": 0.6994219653179191,
+        "acc_stderr,none": 0.024685316867257806
+      },
+      "mmlu_moral_scenarios": {
+        "alias": "  - moral_scenarios",
+        "acc,none": 0.36089385474860336,
+        "acc_stderr,none": 0.01606229067111046
+      },
+      "mmlu_philosophy": {
+        "alias": "  - philosophy",
+        "acc,none": 0.7041800643086816,
+        "acc_stderr,none": 0.025922371788818788
+      },
+      "mmlu_prehistory": {
+        "alias": "  - prehistory",
+        "acc,none": 0.6975308641975309,
+        "acc_stderr,none": 0.02555765398186805
+      },
+      "mmlu_professional_law": {
+        "alias": "  - professional_law",
+        "acc,none": 0.44198174706649285,
+        "acc_stderr,none": 0.012683972513598806
+      },
+      "mmlu_world_religions": {
+        "alias": "  - world_religions",
+        "acc,none": 0.8421052631578947,
+        "acc_stderr,none": 0.02796678585916087
+      },
+      "mmlu_other": {
+        "alias": " - other",
+        "acc,none": 0.683617637592533,
+        "acc_stderr,none": 0.10929719513421464
+      },
+      "mmlu_business_ethics": {
+        "alias": "  - business_ethics",
+        "acc,none": 0.58,
+        "acc_stderr,none": 0.049604496374885836
+      },
+      "mmlu_clinical_knowledge": {
+        "alias": "  - clinical_knowledge",
+        "acc,none": 0.6867924528301886,
+        "acc_stderr,none": 0.028544793319055326
+      },
+      "mmlu_college_medicine": {
+        "alias": "  - college_medicine",
+        "acc,none": 0.5953757225433526,
+        "acc_stderr,none": 0.03742461193887249
+      },
+      "mmlu_global_facts": {
+        "alias": "  - global_facts",
+        "acc,none": 0.38,
+        "acc_stderr,none": 0.04878317312145632
+      },
+      "mmlu_human_aging": {
+        "alias": "  - human_aging",
+        "acc,none": 0.6771300448430493,
+        "acc_stderr,none": 0.031381476375755
+      },
+      "mmlu_management": {
+        "alias": "  - management",
+        "acc,none": 0.7961165048543689,
+        "acc_stderr,none": 0.0398913985953177
+      },
+      "mmlu_marketing": {
+        "alias": "  - marketing",
+        "acc,none": 0.8547008547008547,
+        "acc_stderr,none": 0.023086635086841403
+      },
+      "mmlu_medical_genetics": {
+        "alias": "  - medical_genetics",
+        "acc,none": 0.68,
+        "acc_stderr,none": 0.04688261722621504
+      },
+      "mmlu_miscellaneous": {
+        "alias": "  - miscellaneous",
+        "acc,none": 0.8084291187739464,
+        "acc_stderr,none": 0.014072859310451945
+      },
+      "mmlu_nutrition": {
+        "alias": "  - nutrition",
+        "acc,none": 0.7156862745098039,
+        "acc_stderr,none": 0.025829163272757465
+      },
+      "mmlu_professional_accounting": {
+        "alias": "  - professional_accounting",
+        "acc,none": 0.4716312056737589,
+        "acc_stderr,none": 0.029779450957303055
+      },
+      "mmlu_professional_medicine": {
+        "alias": "  - professional_medicine",
+        "acc,none": 0.625,
+        "acc_stderr,none": 0.029408372932278746
+      },
+      "mmlu_virology": {
+        "alias": "  - virology",
+        "acc,none": 0.5240963855421686,
+        "acc_stderr,none": 0.038879718495972646
+      },
+      "mmlu_social_sciences": {
+        "alias": " - social_sciences",
+        "acc,none": 0.726356841078973,
+        "acc_stderr,none": 0.07276844314615243
+      },
+      "mmlu_econometrics": {
+        "alias": "  - econometrics",
+        "acc,none": 0.45614035087719296,
+        "acc_stderr,none": 0.04685473041907789
+      },
+      "mmlu_high_school_geography": {
+        "alias": "  - high_school_geography",
+        "acc,none": 0.7727272727272727,
+        "acc_stderr,none": 0.029857515673386396
+      },
+      "mmlu_high_school_government_and_politics": {
+        "alias": "  - high_school_government_and_politics",
+        "acc,none": 0.8808290155440415,
+        "acc_stderr,none": 0.023381935348121417
+      },
+      "mmlu_high_school_macroeconomics": {
+        "alias": "  - high_school_macroeconomics",
+        "acc,none": 0.6307692307692307,
+        "acc_stderr,none": 0.024468615241478926
+      },
+      "mmlu_high_school_microeconomics": {
+        "alias": "  - high_school_microeconomics",
+        "acc,none": 0.6512605042016807,
+        "acc_stderr,none": 0.030956636328566548
+      },
+      "mmlu_high_school_psychology": {
+        "alias": "  - high_school_psychology",
+        "acc,none": 0.8293577981651377,
+        "acc_stderr,none": 0.016129271025099888
+      },
+      "mmlu_human_sexuality": {
+        "alias": "  - human_sexuality",
+        "acc,none": 0.7480916030534351,
+        "acc_stderr,none": 0.038073871163060866
+      },
+      "mmlu_professional_psychology": {
+        "alias": "  - professional_psychology",
+        "acc,none": 0.6535947712418301,
+        "acc_stderr,none": 0.01924978569171721
+      },
+      "mmlu_public_relations": {
+        "alias": "  - public_relations",
+        "acc,none": 0.6818181818181818,
+        "acc_stderr,none": 0.044612721759105085
+      },
+      "mmlu_security_studies": {
+        "alias": "  - security_studies",
+        "acc,none": 0.7306122448979592,
+        "acc_stderr,none": 0.02840125202902294
+      },
+      "mmlu_sociology": {
+        "alias": "  - sociology",
+        "acc,none": 0.835820895522388,
+        "acc_stderr,none": 0.026193923544454156
+      },
+      "mmlu_us_foreign_policy": {
+        "alias": "  - us_foreign_policy",
+        "acc,none": 0.87,
+        "acc_stderr,none": 0.033799766898963086
+      },
+      "mmlu_stem": {
+        "alias": " - stem",
+        "acc,none": 0.5141135426577862,
+        "acc_stderr,none": 0.13904081800567097
+      },
+      "mmlu_abstract_algebra": {
+        "alias": "  - abstract_algebra",
+        "acc,none": 0.34,
+        "acc_stderr,none": 0.04760952285695235
+      },
+      "mmlu_anatomy": {
+        "alias": "  - anatomy",
+        "acc,none": 0.6296296296296297,
+        "acc_stderr,none": 0.041716541613545426
+      },
+      "mmlu_astronomy": {
+        "alias": "  - astronomy",
+        "acc,none": 0.7105263157894737,
+        "acc_stderr,none": 0.036906779861372814
+      },
+      "mmlu_college_biology": {
+        "alias": "  - college_biology",
+        "acc,none": 0.7361111111111112,
+        "acc_stderr,none": 0.03685651095897532
+      },
+      "mmlu_college_chemistry": {
+        "alias": "  - college_chemistry",
+        "acc,none": 0.42,
+        "acc_stderr,none": 0.049604496374885836
+      },
+      "mmlu_college_computer_science": {
+        "alias": "  - college_computer_science",
+        "acc,none": 0.54,
+        "acc_stderr,none": 0.05009082659620332
+      },
+      "mmlu_college_mathematics": {
+        "alias": "  - college_mathematics",
+        "acc,none": 0.39,
+        "acc_stderr,none": 0.04902071300001975
+      },
+      "mmlu_college_physics": {
+        "alias": "  - college_physics",
+        "acc,none": 0.3431372549019608,
+        "acc_stderr,none": 0.04724007352383888
+      },
+      "mmlu_computer_security": {
+        "alias": "  - computer_security",
+        "acc,none": 0.76,
+        "acc_stderr,none": 0.042923469599092816
+      },
+      "mmlu_conceptual_physics": {
+        "alias": "  - conceptual_physics",
+        "acc,none": 0.5276595744680851,
+        "acc_stderr,none": 0.03263597118409769
+      },
+      "mmlu_electrical_engineering": {
+        "alias": "  - electrical_engineering",
+        "acc,none": 0.5793103448275863,
+        "acc_stderr,none": 0.04113914981189261
+      },
+      "mmlu_elementary_mathematics": {
+        "alias": "  - elementary_mathematics",
+        "acc,none": 0.38095238095238093,
+        "acc_stderr,none": 0.025010749116137595
+      },
+      "mmlu_high_school_biology": {
+        "alias": "  - high_school_biology",
+        "acc,none": 0.7612903225806451,
+        "acc_stderr,none": 0.024251071262208834
+      },
+      "mmlu_high_school_chemistry": {
+        "alias": "  - high_school_chemistry",
+        "acc,none": 0.47783251231527096,
+        "acc_stderr,none": 0.03514528562175007
+      },
+      "mmlu_high_school_computer_science": {
+        "alias": "  - high_school_computer_science",
+        "acc,none": 0.71,
+        "acc_stderr,none": 0.045604802157206845
+      },
+      "mmlu_high_school_mathematics": {
+        "alias": "  - high_school_mathematics",
+        "acc,none": 0.3,
+        "acc_stderr,none": 0.02794045713622842
+      },
+      "mmlu_high_school_physics": {
+        "alias": "  - high_school_physics",
+        "acc,none": 0.2913907284768212,
+        "acc_stderr,none": 0.03710185726119995
+      },
+      "mmlu_high_school_statistics": {
+        "alias": "  - high_school_statistics",
+        "acc,none": 0.5,
+        "acc_stderr,none": 0.034099716973523674
+      },
+      "mmlu_machine_learning": {
+        "alias": "  - machine_learning",
+        "acc,none": 0.4732142857142857,
+        "acc_stderr,none": 0.047389751192741546
+      }
+    },
+    "groups": {
+      "mmlu": {
+        "acc,none": 0.6147272468309357,
+        "acc_stderr,none": 0.1279904236098431,
+        "alias": "mmlu"
+      },
+      "mmlu_humanities": {
+        "alias": " - humanities",
+        "acc,none": 0.5636556854410202,
+        "acc_stderr,none": 0.11857886643596054
+      },
+      "mmlu_other": {
+        "alias": " - other",
+        "acc,none": 0.683617637592533,
+        "acc_stderr,none": 0.10929719513421464
+      },
+      "mmlu_social_sciences": {
+        "alias": " - social_sciences",
+        "acc,none": 0.726356841078973,
+        "acc_stderr,none": 0.07276844314615243
+      },
+      "mmlu_stem": {
+        "alias": " - stem",
+        "acc,none": 0.5141135426577862,
+        "acc_stderr,none": 0.13904081800567097
+      }
+    },
+    "configs": {
+      "mmlu_abstract_algebra": {
+        "task": "mmlu_abstract_algebra",
+        "task_alias": "abstract_algebra",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "abstract_algebra",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_anatomy": {
+        "task": "mmlu_anatomy",
+        "task_alias": "anatomy",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "anatomy",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about anatomy.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_astronomy": {
+        "task": "mmlu_astronomy",
+        "task_alias": "astronomy",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "astronomy",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about astronomy.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_business_ethics": {
+        "task": "mmlu_business_ethics",
+        "task_alias": "business_ethics",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "business_ethics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about business ethics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_clinical_knowledge": {
+        "task": "mmlu_clinical_knowledge",
+        "task_alias": "clinical_knowledge",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "clinical_knowledge",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_college_biology": {
+        "task": "mmlu_college_biology",
+        "task_alias": "college_biology",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "college_biology",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about college biology.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_college_chemistry": {
+        "task": "mmlu_college_chemistry",
+        "task_alias": "college_chemistry",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "college_chemistry",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_college_computer_science": {
+        "task": "mmlu_college_computer_science",
+        "task_alias": "college_computer_science",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "college_computer_science",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about college computer science.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_college_mathematics": {
+        "task": "mmlu_college_mathematics",
+        "task_alias": "college_mathematics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "college_mathematics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_college_medicine": {
+        "task": "mmlu_college_medicine",
+        "task_alias": "college_medicine",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "college_medicine",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about college medicine.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_college_physics": {
+        "task": "mmlu_college_physics",
+        "task_alias": "college_physics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "college_physics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about college physics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_computer_security": {
+        "task": "mmlu_computer_security",
+        "task_alias": "computer_security",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "computer_security",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about computer security.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_conceptual_physics": {
+        "task": "mmlu_conceptual_physics",
+        "task_alias": "conceptual_physics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "conceptual_physics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_econometrics": {
+        "task": "mmlu_econometrics",
+        "task_alias": "econometrics",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "econometrics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about econometrics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_electrical_engineering": {
+        "task": "mmlu_electrical_engineering",
+        "task_alias": "electrical_engineering",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "electrical_engineering",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_elementary_mathematics": {
+        "task": "mmlu_elementary_mathematics",
+        "task_alias": "elementary_mathematics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "elementary_mathematics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_formal_logic": {
+        "task": "mmlu_formal_logic",
+        "task_alias": "formal_logic",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "formal_logic",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about formal logic.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_global_facts": {
+        "task": "mmlu_global_facts",
+        "task_alias": "global_facts",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "global_facts",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about global facts.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_biology": {
+        "task": "mmlu_high_school_biology",
+        "task_alias": "high_school_biology",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_biology",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school biology.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_chemistry": {
+        "task": "mmlu_high_school_chemistry",
+        "task_alias": "high_school_chemistry",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_chemistry",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_computer_science": {
+        "task": "mmlu_high_school_computer_science",
+        "task_alias": "high_school_computer_science",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_computer_science",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_european_history": {
+        "task": "mmlu_high_school_european_history",
+        "task_alias": "high_school_european_history",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_european_history",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school european history.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_geography": {
+        "task": "mmlu_high_school_geography",
+        "task_alias": "high_school_geography",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_geography",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school geography.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_government_and_politics": {
+        "task": "mmlu_high_school_government_and_politics",
+        "task_alias": "high_school_government_and_politics",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_government_and_politics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_macroeconomics": {
+        "task": "mmlu_high_school_macroeconomics",
+        "task_alias": "high_school_macroeconomics",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_macroeconomics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_mathematics": {
+        "task": "mmlu_high_school_mathematics",
+        "task_alias": "high_school_mathematics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_mathematics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_microeconomics": {
+        "task": "mmlu_high_school_microeconomics",
+        "task_alias": "high_school_microeconomics",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_microeconomics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_physics": {
+        "task": "mmlu_high_school_physics",
+        "task_alias": "high_school_physics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_physics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school physics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_psychology": {
+        "task": "mmlu_high_school_psychology",
+        "task_alias": "high_school_psychology",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_psychology",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_statistics": {
+        "task": "mmlu_high_school_statistics",
+        "task_alias": "high_school_statistics",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_statistics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_us_history": {
+        "task": "mmlu_high_school_us_history",
+        "task_alias": "high_school_us_history",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_us_history",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school us history.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_high_school_world_history": {
+        "task": "mmlu_high_school_world_history",
+        "task_alias": "high_school_world_history",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "high_school_world_history",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about high school world history.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_human_aging": {
+        "task": "mmlu_human_aging",
+        "task_alias": "human_aging",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "human_aging",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about human aging.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_human_sexuality": {
+        "task": "mmlu_human_sexuality",
+        "task_alias": "human_sexuality",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "human_sexuality",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_international_law": {
+        "task": "mmlu_international_law",
+        "task_alias": "international_law",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "international_law",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about international law.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_jurisprudence": {
+        "task": "mmlu_jurisprudence",
+        "task_alias": "jurisprudence",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "jurisprudence",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_logical_fallacies": {
+        "task": "mmlu_logical_fallacies",
+        "task_alias": "logical_fallacies",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "logical_fallacies",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_machine_learning": {
+        "task": "mmlu_machine_learning",
+        "task_alias": "machine_learning",
+        "group": "mmlu_stem",
+        "group_alias": "stem",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "machine_learning",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about machine learning.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_management": {
+        "task": "mmlu_management",
+        "task_alias": "management",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "management",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about management.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_marketing": {
+        "task": "mmlu_marketing",
+        "task_alias": "marketing",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "marketing",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about marketing.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_medical_genetics": {
+        "task": "mmlu_medical_genetics",
+        "task_alias": "medical_genetics",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "medical_genetics",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_miscellaneous": {
+        "task": "mmlu_miscellaneous",
+        "task_alias": "miscellaneous",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "miscellaneous",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_moral_disputes": {
+        "task": "mmlu_moral_disputes",
+        "task_alias": "moral_disputes",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "moral_disputes",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_moral_scenarios": {
+        "task": "mmlu_moral_scenarios",
+        "task_alias": "moral_scenarios",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "moral_scenarios",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_nutrition": {
+        "task": "mmlu_nutrition",
+        "task_alias": "nutrition",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "nutrition",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about nutrition.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_philosophy": {
+        "task": "mmlu_philosophy",
+        "task_alias": "philosophy",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "philosophy",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about philosophy.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_prehistory": {
+        "task": "mmlu_prehistory",
+        "task_alias": "prehistory",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "prehistory",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about prehistory.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_professional_accounting": {
+        "task": "mmlu_professional_accounting",
+        "task_alias": "professional_accounting",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "professional_accounting",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_professional_law": {
+        "task": "mmlu_professional_law",
+        "task_alias": "professional_law",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "professional_law",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about professional law.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_professional_medicine": {
+        "task": "mmlu_professional_medicine",
+        "task_alias": "professional_medicine",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "professional_medicine",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_professional_psychology": {
+        "task": "mmlu_professional_psychology",
+        "task_alias": "professional_psychology",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "professional_psychology",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_public_relations": {
+        "task": "mmlu_public_relations",
+        "task_alias": "public_relations",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "public_relations",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about public relations.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_security_studies": {
+        "task": "mmlu_security_studies",
+        "task_alias": "security_studies",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "security_studies",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about security studies.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_sociology": {
+        "task": "mmlu_sociology",
+        "task_alias": "sociology",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "sociology",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about sociology.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_us_foreign_policy": {
+        "task": "mmlu_us_foreign_policy",
+        "task_alias": "us_foreign_policy",
+        "group": "mmlu_social_sciences",
+        "group_alias": "social_sciences",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "us_foreign_policy",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_virology": {
+        "task": "mmlu_virology",
+        "task_alias": "virology",
+        "group": "mmlu_other",
+        "group_alias": "other",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "virology",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about virology.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      },
+      "mmlu_world_religions": {
+        "task": "mmlu_world_religions",
+        "task_alias": "world_religions",
+        "group": "mmlu_humanities",
+        "group_alias": "humanities",
+        "dataset_path": "hails/mmlu_no_train",
+        "dataset_name": "world_religions",
+        "test_split": "test",
+        "fewshot_split": "dev",
+        "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:",
+        "doc_to_target": "answer",
+        "doc_to_choice": [
+          "A",
+          "B",
+          "C",
+          "D"
+        ],
+        "description": "The following are multiple choice questions (with answers) about world religions.\n\n",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "fewshot_config": {
+          "sampler": "first_n"
+        },
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": false,
+        "metadata": {
+          "version": 0
+        }
+      }
+    },
+    "versions": {
+      "mmlu": "N/A",
+      "mmlu_abstract_algebra": "Yaml",
+      "mmlu_anatomy": "Yaml",
+      "mmlu_astronomy": "Yaml",
+      "mmlu_business_ethics": "Yaml",
+      "mmlu_clinical_knowledge": "Yaml",
+      "mmlu_college_biology": "Yaml",
+      "mmlu_college_chemistry": "Yaml",
+      "mmlu_college_computer_science": "Yaml",
+      "mmlu_college_mathematics": "Yaml",
+      "mmlu_college_medicine": "Yaml",
+      "mmlu_college_physics": "Yaml",
+      "mmlu_computer_security": "Yaml",
+      "mmlu_conceptual_physics": "Yaml",
+      "mmlu_econometrics": "Yaml",
+      "mmlu_electrical_engineering": "Yaml",
+      "mmlu_elementary_mathematics": "Yaml",
+      "mmlu_formal_logic": "Yaml",
+      "mmlu_global_facts": "Yaml",
+      "mmlu_high_school_biology": "Yaml",
+      "mmlu_high_school_chemistry": "Yaml",
+      "mmlu_high_school_computer_science": "Yaml",
+      "mmlu_high_school_european_history": "Yaml",
+      "mmlu_high_school_geography": "Yaml",
+      "mmlu_high_school_government_and_politics": "Yaml",
+      "mmlu_high_school_macroeconomics": "Yaml",
+      "mmlu_high_school_mathematics": "Yaml",
+      "mmlu_high_school_microeconomics": "Yaml",
+      "mmlu_high_school_physics": "Yaml",
+      "mmlu_high_school_psychology": "Yaml",
+      "mmlu_high_school_statistics": "Yaml",
+      "mmlu_high_school_us_history": "Yaml",
+      "mmlu_high_school_world_history": "Yaml",
+      "mmlu_human_aging": "Yaml",
+      "mmlu_human_sexuality": "Yaml",
+      "mmlu_humanities": "N/A",
+      "mmlu_international_law": "Yaml",
+      "mmlu_jurisprudence": "Yaml",
+      "mmlu_logical_fallacies": "Yaml",
+      "mmlu_machine_learning": "Yaml",
+      "mmlu_management": "Yaml",
+      "mmlu_marketing": "Yaml",
+      "mmlu_medical_genetics": "Yaml",
+      "mmlu_miscellaneous": "Yaml",
+      "mmlu_moral_disputes": "Yaml",
+      "mmlu_moral_scenarios": "Yaml",
+      "mmlu_nutrition": "Yaml",
+      "mmlu_other": "N/A",
+      "mmlu_philosophy": "Yaml",
+      "mmlu_prehistory": "Yaml",
+      "mmlu_professional_accounting": "Yaml",
+      "mmlu_professional_law": "Yaml",
+      "mmlu_professional_medicine": "Yaml",
+      "mmlu_professional_psychology": "Yaml",
+      "mmlu_public_relations": "Yaml",
+      "mmlu_security_studies": "Yaml",
+      "mmlu_social_sciences": "N/A",
+      "mmlu_sociology": "Yaml",
+      "mmlu_stem": "N/A",
+      "mmlu_us_foreign_policy": "Yaml",
+      "mmlu_virology": "Yaml",
+      "mmlu_world_religions": "Yaml"
+    },
+    "n-shot": {
+      "mmlu": 0,
+      "mmlu_abstract_algebra": 5,
+      "mmlu_anatomy": 5,
+      "mmlu_astronomy": 5,
+      "mmlu_business_ethics": 5,
+      "mmlu_clinical_knowledge": 5,
+      "mmlu_college_biology": 5,
+      "mmlu_college_chemistry": 5,
+      "mmlu_college_computer_science": 5,
+      "mmlu_college_mathematics": 5,
+      "mmlu_college_medicine": 5,
+      "mmlu_college_physics": 5,
+      "mmlu_computer_security": 5,
+      "mmlu_conceptual_physics": 5,
+      "mmlu_econometrics": 5,
+      "mmlu_electrical_engineering": 5,
+      "mmlu_elementary_mathematics": 5,
+      "mmlu_formal_logic": 5,
+      "mmlu_global_facts": 5,
+      "mmlu_high_school_biology": 5,
+      "mmlu_high_school_chemistry": 5,
+      "mmlu_high_school_computer_science": 5,
+      "mmlu_high_school_european_history": 5,
+      "mmlu_high_school_geography": 5,
+      "mmlu_high_school_government_and_politics": 5,
+      "mmlu_high_school_macroeconomics": 5,
+      "mmlu_high_school_mathematics": 5,
+      "mmlu_high_school_microeconomics": 5,
+      "mmlu_high_school_physics": 5,
+      "mmlu_high_school_psychology": 5,
+      "mmlu_high_school_statistics": 5,
+      "mmlu_high_school_us_history": 5,
+      "mmlu_high_school_world_history": 5,
+      "mmlu_human_aging": 5,
+      "mmlu_human_sexuality": 5,
+      "mmlu_humanities": 5,
+      "mmlu_international_law": 5,
+      "mmlu_jurisprudence": 5,
+      "mmlu_logical_fallacies": 5,
+      "mmlu_machine_learning": 5,
+      "mmlu_management": 5,
+      "mmlu_marketing": 5,
+      "mmlu_medical_genetics": 5,
+      "mmlu_miscellaneous": 5,
+      "mmlu_moral_disputes": 5,
+      "mmlu_moral_scenarios": 5,
+      "mmlu_nutrition": 5,
+      "mmlu_other": 5,
+      "mmlu_philosophy": 5,
+      "mmlu_prehistory": 5,
+      "mmlu_professional_accounting": 5,
+      "mmlu_professional_law": 5,
+      "mmlu_professional_medicine": 5,
+      "mmlu_professional_psychology": 5,
+      "mmlu_public_relations": 5,
+      "mmlu_security_studies": 5,
+      "mmlu_social_sciences": 5,
+      "mmlu_sociology": 5,
+      "mmlu_stem": 5,
+      "mmlu_us_foreign_policy": 5,
+      "mmlu_virology": 5,
+      "mmlu_world_religions": 5
+    },
+    "config": {
+      "model": "vllm",
+      "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
+      "batch_size": "8",
+      "batch_sizes": [],
+      "device": null,
+      "use_cache": null,
+      "limit": null,
+      "bootstrap_iters": 100000,
+      "gen_kwargs": null
+    },
+    "git_hash": "46c79664"
+  },
+  {
+    "results": {
+      "truthfulqa": {
+        "bleu_max,none": 14.638837106843457,
+        "bleu_max_stderr,none": 0.34362387539987554,
+        "bleu_acc,none": 0.4394124847001224,
+        "bleu_acc_stderr,none": 0.00030187396199728844,
+        "bleu_diff,none": 0.24542274719797535,
+        "bleu_diff_stderr,none": 0.2063430019358257,
+        "rouge1_max,none": 38.65517994793549,
+        "rouge1_max_stderr,none": 0.5858540895997846,
+        "rouge1_acc,none": 0.46511627906976744,
+        "rouge1_acc_stderr,none": 0.0003048812818799781,
+        "rouge1_diff,none": -0.2338327660242177,
+        "rouge1_diff_stderr,none": 0.3738211153435194,
+        "rouge2_max,none": 24.05741215628304,
+        "rouge2_max_stderr,none": 0.6481300865775006,
+        "rouge2_acc,none": 0.34149326805385555,
+        "rouge2_acc_stderr,none": 0.0002755828626565652,
+        "rouge2_diff,none": -0.26964949017748546,
+        "rouge2_diff_stderr,none": 0.46049881176190666,
+        "rougeL_max,none": 34.96464478044315,
+        "rougeL_max_stderr,none": 0.5902919473860816,
+        "rougeL_acc,none": 0.4418604651162791,
+        "rougeL_acc_stderr,none": 0.00030223014029841656,
+        "rougeL_diff,none": -0.4989046166400256,
+        "rougeL_diff_stderr,none": 0.38675419798108396,
+        "acc,none": 0.4369849253215213,
+        "acc_stderr,none": 0.05670747643267544,
+        "alias": "truthfulqa"
+      },
+      "truthfulqa_gen": {
+        "bleu_max,none": 14.638837106843457,
+        "bleu_max_stderr,none": 0.5861944006896309,
+        "bleu_acc,none": 0.4394124847001224,
+        "bleu_acc_stderr,none": 0.01737452048251371,
+        "bleu_diff,none": 0.24542274719797535,
+        "bleu_diff_stderr,none": 0.45424993333607183,
+        "rouge1_max,none": 38.65517994793549,
+        "rouge1_max_stderr,none": 0.7654110592353527,
+        "rouge1_acc,none": 0.46511627906976744,
+        "rouge1_acc_stderr,none": 0.017460849975873972,
+        "rouge1_diff,none": -0.2338327660242177,
+        "rouge1_diff_stderr,none": 0.6114091227185929,
+        "rouge2_max,none": 24.05741215628304,
+        "rouge2_max_stderr,none": 0.8050652685201993,
+        "rouge2_acc,none": 0.34149326805385555,
+        "rouge2_acc_stderr,none": 0.016600688619950836,
+        "rouge2_diff,none": -0.26964949017748546,
+        "rouge2_diff_stderr,none": 0.6786006275873215,
+        "rougeL_max,none": 34.96464478044315,
+        "rougeL_max_stderr,none": 0.7683045928445837,
+        "rougeL_acc,none": 0.4418604651162791,
+        "rougeL_acc_stderr,none": 0.01738476747898621,
+        "rougeL_diff,none": -0.4989046166400256,
+        "rougeL_diff_stderr,none": 0.6218956487877078,
+        "alias": " - truthfulqa_gen"
+      },
+      "truthfulqa_mc1": {
+        "acc,none": 0.379436964504284,
+        "acc_stderr,none": 0.01698703926614297,
+        "alias": " - truthfulqa_mc1"
+      },
+      "truthfulqa_mc2": {
+        "acc,none": 0.552080846955996,
+        "acc_stderr,none": 0.015343375525702328,
+        "alias": " - truthfulqa_mc2"
+      }
+    },
+    "groups": {
+      "truthfulqa": {
+        "bleu_max,none": 14.638837106843457,
+        "bleu_max_stderr,none": 0.34362387539987554,
+        "bleu_acc,none": 0.4394124847001224,
+        "bleu_acc_stderr,none": 0.00030187396199728844,
+        "bleu_diff,none": 0.24542274719797535,
+        "bleu_diff_stderr,none": 0.2063430019358257,
+        "rouge1_max,none": 38.65517994793549,
+        "rouge1_max_stderr,none": 0.5858540895997846,
+        "rouge1_acc,none": 0.46511627906976744,
+        "rouge1_acc_stderr,none": 0.0003048812818799781,
+        "rouge1_diff,none": -0.2338327660242177,
+        "rouge1_diff_stderr,none": 0.3738211153435194,
+        "rouge2_max,none": 24.05741215628304,
+        "rouge2_max_stderr,none": 0.6481300865775006,
+        "rouge2_acc,none": 0.34149326805385555,
+        "rouge2_acc_stderr,none": 0.0002755828626565652,
+        "rouge2_diff,none": -0.26964949017748546,
+        "rouge2_diff_stderr,none": 0.46049881176190666,
+        "rougeL_max,none": 34.96464478044315,
+        "rougeL_max_stderr,none": 0.5902919473860816,
+        "rougeL_acc,none": 0.4418604651162791,
+        "rougeL_acc_stderr,none": 0.00030223014029841656,
+        "rougeL_diff,none": -0.4989046166400256,
+        "rougeL_diff_stderr,none": 0.38675419798108396,
+        "acc,none": 0.4369849253215213,
+        "acc_stderr,none": 0.05670747643267544,
+        "alias": "truthfulqa"
+      }
+    },
+    "configs": {
+      "truthfulqa_gen": {
+        "task": "truthfulqa_gen",
+        "group": [
+          "truthfulqa"
+        ],
+        "dataset_path": "truthful_qa",
+        "dataset_name": "generation",
+        "validation_split": "validation",
+        "process_docs": "<function process_docs_gen at 0x7f493b3a5260>",
+        "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question}}",
+        "doc_to_target": " ",
+        "process_results": "<function process_results_gen at 0x7f493b3a58a0>",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 0,
+        "metric_list": [
+          {
+            "metric": "bleu_max",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "bleu_acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "bleu_diff",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rouge1_max",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rouge1_acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rouge1_diff",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rouge2_max",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rouge2_acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rouge2_diff",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rougeL_max",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rougeL_acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          },
+          {
+            "metric": "rougeL_diff",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "generate_until",
+        "generation_kwargs": {
+          "until": [
+            "\n\n"
+          ],
+          "do_sample": false
+        },
+        "repeats": 1,
+        "should_decontaminate": true,
+        "doc_to_decontamination_query": "question",
+        "metadata": {
+          "version": 2
+        }
+      },
+      "truthfulqa_mc1": {
+        "task": "truthfulqa_mc1",
+        "group": [
+          "truthfulqa"
+        ],
+        "dataset_path": "truthful_qa",
+        "dataset_name": "multiple_choice",
+        "validation_split": "validation",
+        "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
+        "doc_to_target": 0,
+        "doc_to_choice": "{{mc1_targets.choices}}",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 0,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": true,
+        "doc_to_decontamination_query": "question",
+        "metadata": {
+          "version": 2
+        }
+      },
+      "truthfulqa_mc2": {
+        "task": "truthfulqa_mc2",
+        "group": [
+          "truthfulqa"
+        ],
+        "dataset_path": "truthful_qa",
+        "dataset_name": "multiple_choice",
+        "validation_split": "validation",
+        "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
+        "doc_to_target": 0,
+        "doc_to_choice": "{{mc2_targets.choices}}",
+        "process_results": "<function process_results_mc2 at 0x7f493b3a5b20>",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 0,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": true,
+        "doc_to_decontamination_query": "question",
+        "metadata": {
+          "version": 2
+        }
+      }
+    },
+    "versions": {
+      "truthfulqa": "N/A",
+      "truthfulqa_gen": "Yaml",
+      "truthfulqa_mc1": "Yaml",
+      "truthfulqa_mc2": "Yaml"
+    },
+    "n-shot": {
+      "truthfulqa": 0,
+      "truthfulqa_gen": 0,
+      "truthfulqa_mc1": 0,
+      "truthfulqa_mc2": 0
+    },
+    "config": {
+      "model": "vllm",
+      "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
+      "batch_size": "8",
+      "batch_sizes": [],
+      "device": null,
+      "use_cache": null,
+      "limit": null,
+      "bootstrap_iters": 100000,
+      "gen_kwargs": null
+    },
+    "git_hash": "46c79664"
+  },
+  {
+    "results": {
+      "winogrande": {
+        "acc,none": 0.739542225730071,
+        "acc_stderr,none": 0.012334833671998292,
+        "alias": "winogrande"
+      }
+    },
+    "configs": {
+      "winogrande": {
+        "task": "winogrande",
+        "dataset_path": "winogrande",
+        "dataset_name": "winogrande_xl",
+        "training_split": "train",
+        "validation_split": "validation",
+        "doc_to_text": "<function doc_to_text at 0x7fb7afbfe700>",
+        "doc_to_target": "<function doc_to_target at 0x7fb7afbfea20>",
+        "doc_to_choice": "<function doc_to_choice at 0x7fb7afbfed40>",
+        "description": "",
+        "target_delimiter": " ",
+        "fewshot_delimiter": "\n\n",
+        "num_fewshot": 5,
+        "metric_list": [
+          {
+            "metric": "acc",
+            "aggregation": "mean",
+            "higher_is_better": true
+          }
+        ],
+        "output_type": "multiple_choice",
+        "repeats": 1,
+        "should_decontaminate": true,
+        "doc_to_decontamination_query": "sentence",
+        "metadata": {
+          "version": 1
+        }
+      }
+    },
+    "versions": {
+      "winogrande": "Yaml"
+    },
+    "n-shot": {
+      "winogrande": 5
+    },
+    "config": {
+      "model": "vllm",
+      "model_args": "pretrained=/workspace/dolphin-2.6-mistral-7b-hf,tensor_parallel_size=4,dtype=auto,trust_remote_code=True,gpu_memory_utilization=0.8",
+      "batch_size": "8",
+      "batch_sizes": [],
+      "device": null,
+      "use_cache": null,
+      "limit": null,
+      "bootstrap_iters": 100000,
+      "gen_kwargs": null
+    },
+    "git_hash": "46c79664"
+  }
+]