{ "results": { "b4b": { "acc,none": 0.7876078914919852, "acc_stderr,none": 0.06728010300021042, "acc_norm,none": 0.7876078914919852, "acc_norm_stderr,none": 0.06728010300021042, "alias": "b4b" }, "b4bqa": { "acc,none": 0.8510044642857143, "acc_stderr,none": 0.008414043525477657, "acc_norm,none": 0.8510044642857143, "acc_norm_stderr,none": 0.008414043525477657, "alias": " - b4bqa" }, "medmcqa_g2b": { "acc,none": 0.632183908045977, "acc_stderr,none": 0.025886440903166212, "acc_norm,none": 0.632183908045977, "acc_norm_stderr,none": 0.025886440903166212, "alias": " - medmcqa_g2b" }, "medmcqa_orig_filtered": { "acc,none": 0.7385057471264368, "acc_stderr,none": 0.023590833013480327, "acc_norm,none": 0.7385057471264368, "acc_norm_stderr,none": 0.023590833013480327, "alias": " - medmcqa_orig_filtered" }, "medqa_4options_g2b": { "acc,none": 0.708994708994709, "acc_stderr,none": 0.02339382650048486, "acc_norm,none": 0.708994708994709, "acc_norm_stderr,none": 0.02339382650048486, "alias": " - medqa_4options_g2b" }, "medqa_4options_orig_filtered": { "acc,none": 0.753968253968254, "acc_stderr,none": 0.022182037202948368, "acc_norm,none": 0.753968253968254, "acc_norm_stderr,none": 0.022182037202948368, "alias": " - medqa_4options_orig_filtered" } }, "groups": { "b4b": { "acc,none": 0.7876078914919852, "acc_stderr,none": 0.06728010300021042, "acc_norm,none": 0.7876078914919852, "acc_norm_stderr,none": 0.06728010300021042, "alias": "b4b" } }, "configs": { "b4bqa": { "task": "b4bqa", "dataset_path": "AIM-Harvard/b4b_drug_qa", "test_split": "test", "doc_to_text": "", "doc_to_target": "correct_choice", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medmcqa_g2b": { "task": "medmcqa_g2b", "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medmcqa_orig_filtered": { "task": "medmcqa_orig_filtered", "dataset_path": "AIM-Harvard/medmcqa_original", "training_split": "train", "validation_split": "validation", "test_split": "validation", "doc_to_text": "", "doc_to_target": "cop", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": true, "doc_to_decontamination_query": "{{question}}" }, "medqa_4options_g2b": { "task": "medqa_4options_g2b", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false }, "medqa_4options_orig_filtered": { "task": "medqa_4options_orig_filtered", "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original", "training_split": "train", "validation_split": "validation", "test_split": "test", "doc_to_text": "", "doc_to_target": "", "doc_to_choice": [ "A", "B", "C", "D" ], "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "acc_norm", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false } }, "versions": { "b4b": "N/A", "b4bqa": "Yaml", "medmcqa_g2b": "Yaml", "medmcqa_orig_filtered": "Yaml", "medqa_4options_g2b": "Yaml", "medqa_4options_orig_filtered": "Yaml" }, "n-shot": { "b4b": 0, "b4bqa": 0, "medmcqa_g2b": 0, "medmcqa_orig_filtered": 0, "medqa_4options_g2b": 0, "medqa_4options_orig_filtered": 0 }, "config": { "model": "hf", "model_args": "pretrained=aaditya/Llama3-OpenBioLLM-70B,parallelize=True,load_in_4bit=True", "batch_size": "auto", "batch_sizes": [ 32 ], "device": null, "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": "928c7657" }