{ "results": { "b4b": { "acc,none": 0.7475339087546239, "acc_stderr,none": 0.0611860272880456, "acc_norm,none": 0.7475339087546239, "acc_norm_stderr,none": 0.0611860272880456, "alias": "b4b" }, "b4bqa": { "acc,none": 0.7193080357142857, "acc_stderr,none": 0.01061755826614456, "acc_norm,none": 0.7193080357142857, "acc_norm_stderr,none": 0.01061755826614456, "alias": " - b4bqa" }, "medmcqa_g2b": { "acc,none": 0.6522988505747126, "acc_stderr,none": 0.025565932174194388, "acc_norm,none": 0.6522988505747126, "acc_norm_stderr,none": 0.025565932174194388, "alias": " - medmcqa_g2b" }, "medmcqa_orig_filtered": { "acc,none": 0.8017241379310345, "acc_stderr,none": 0.021403394960161685, "acc_norm,none": 0.8017241379310345, "acc_norm_stderr,none": 0.021403394960161685, "alias": " - medmcqa_orig_filtered" }, "medqa_4options_g2b": { "acc,none": 0.7645502645502645, "acc_stderr,none": 0.021851509822031715, "acc_norm,none": 0.7645502645502645, "acc_norm_stderr,none": 0.021851509822031715, "alias": " - medqa_4options_g2b" }, "medqa_4options_orig_filtered": { "acc,none": 0.9021164021164021, "acc_stderr,none": 0.015304374225091422, "acc_norm,none": 0.9021164021164021, "acc_norm_stderr,none": 0.015304374225091422, "alias": " - medqa_4options_orig_filtered" } }, "groups": { "b4b": { "acc,none": 0.7475339087546239, "acc_stderr,none": 0.0611860272880456, "acc_norm,none": 0.7475339087546239, "acc_norm_stderr,none": 0.0611860272880456, "alias": "b4b" } }, "configs": { "b4bqa": { "task": "b4bqa", "dataset_path": "AIM-Harvard/b4b_drug_qa", "test_split": "test", "doc_to_text": "", "doc_to_target": "correct_choice", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medmcqa_g2b": { "task": "medmcqa_g2b", "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medmcqa_orig_filtered": { "task": "medmcqa_orig_filtered", "dataset_path": "AIM-Harvard/medmcqa_original", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medqa_4options_g2b": { "task": "medqa_4options_g2b", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medqa_4options_orig_filtered": { "task": "medqa_4options_orig_filtered", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false } }, "versions": { "b4b": "N/A", "b4bqa": "Yaml", "medmcqa_g2b": "Yaml", "medmcqa_orig_filtered": "Yaml", "medqa_4options_g2b": "Yaml", "medqa_4options_orig_filtered": "Yaml" }, "n-shot": { "b4b": 0, "b4bqa": 0, "medmcqa_g2b": 0, "medmcqa_orig_filtered": 0, "medqa_4options_g2b": 0, "medqa_4options_orig_filtered": 0 }, "config": { "model": "hf", "model_args": "pretrained=ProbeMedicalYonseiMAILab/medllama3-v20,parallelize=True,load_in_4bit=True", "batch_size": "auto", "batch_sizes": [ 32 ], "device": null, "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "928c7657" }