{ "results": { "b4b": { "acc,none": 0.7475339087546239, "acc_stderr,none": 0.11087824048509952, "acc_norm,none": 0.7475339087546239, "acc_norm_stderr,none": 0.11087824048509952, "alias": "b4b" }, "b4bqa": { "acc,none": 0.8610491071428571, "acc_stderr,none": 0.008173288677884256, "acc_norm,none": 0.8610491071428571, "acc_norm_stderr,none": 0.008173288677884256, "alias": " - b4bqa" }, "medmcqa_g2b": { "acc,none": 0.5545977011494253, "acc_stderr,none": 0.026680902895795475, "acc_norm,none": 0.5545977011494253, "acc_norm_stderr,none": 0.026680902895795475, "alias": " - medmcqa_g2b" }, "medmcqa_orig_filtered": { "acc,none": 0.6494252873563219, "acc_stderr,none": 0.025614751890362768, "acc_norm,none": 0.6494252873563219, "acc_norm_stderr,none": 0.025614751890362768, "alias": " - medmcqa_orig_filtered" }, "medqa_4options_g2b": { "acc,none": 0.6005291005291006, "acc_stderr,none": 0.025225450284067932, "acc_norm,none": 0.6005291005291006, "acc_norm_stderr,none": 0.025225450284067932, "alias": " - medqa_4options_g2b" }, "medqa_4options_orig_filtered": { "acc,none": 0.6243386243386243, "acc_stderr,none": 0.02494236893115979, "acc_norm,none": 0.6243386243386243, "acc_norm_stderr,none": 0.02494236893115979, "alias": " - medqa_4options_orig_filtered" } }, "groups": { "b4b": { "acc,none": 0.7475339087546239, "acc_stderr,none": 0.11087824048509952, "acc_norm,none": 0.7475339087546239, "acc_norm_stderr,none": 0.11087824048509952, "alias": "b4b" } }, "configs": { "b4bqa": { "task": "b4bqa", "dataset_path": "AIM-Harvard/b4b_drug_qa", "test_split": "test", "doc_to_text": "", "doc_to_target": "correct_choice", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medmcqa_g2b": { "task": "medmcqa_g2b", "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medmcqa_orig_filtered": { "task": "medmcqa_orig_filtered", "dataset_path": "AIM-Harvard/medmcqa_original", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medqa_4options_g2b": { "task": "medqa_4options_g2b", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medqa_4options_orig_filtered": { "task": "medqa_4options_orig_filtered", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false } }, "versions": { "b4b": "N/A", "b4bqa": "Yaml", "medmcqa_g2b": "Yaml", "medmcqa_orig_filtered": "Yaml", "medqa_4options_g2b": "Yaml", "medqa_4options_orig_filtered": "Yaml" }, "n-shot": { "b4b": 0, "b4bqa": 0, "medmcqa_g2b": 0, "medmcqa_orig_filtered": 0, "medqa_4options_g2b": 0, "medqa_4options_orig_filtered": 0 }, "config": { "model": "hf", "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,load_in_4bit=True", "batch_size": "4", "batch_sizes": [], "device": "cuda:0", "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "928c7657" }