magilogi committed
Commit
4c59875
•
1 Parent(s): cfb403c

rabbits-leaderboard-v0.1

Files changed (26)
  1. app.py +163 -0
  2. custom.css +62 -0
  3. data/csv/models_data.csv +20 -0
  4. data/raw-eval-outputs/01-ai-Yi-1.5-34B_results.json +252 -0
  5. data/raw-eval-outputs/CohereForAI-aya-23-35B_results.json +250 -0
  6. data/raw-eval-outputs/CohereForAI-c4ai-command-r-plus_results.json +250 -0
  7. data/raw-eval-outputs/ProbeMedicalYonseiMAILab-medllama3-v20_results.json +252 -0
  8. data/raw-eval-outputs/Qwen-Qwen2-72B_results.json +250 -0
  9. data/raw-eval-outputs/Qwen-Qwen2-7B_results.json +316 -0
  10. data/raw-eval-outputs/aaditya-Llama3-OpenBioLLM-70B_results.json +252 -0
  11. data/raw-eval-outputs/johnsnowlabs-JSL-MedLlama-3-8B-v9_results.json +252 -0
  12. data/raw-eval-outputs/meta-llama-Llama-2-70B-hf_results.json +250 -0
  13. data/raw-eval-outputs/meta-llama-Llama-2-7b-hf_results.json +250 -0
  14. data/raw-eval-outputs/meta-llama-Meta-Llama-3-70B_results.json +250 -0
  15. data/raw-eval-outputs/meta-llama-Meta-Llama-3-8B_results.json +250 -0
  16. data/raw-eval-outputs/microsoft-Phi-3-medium-4k-instruct_results.json +316 -0
  17. data/raw-eval-outputs/microsoft-phi-1_5_results.json +316 -0
  18. data/raw-eval-outputs/microsoft-phi-1_results.json +316 -0
  19. data/raw-eval-outputs/microsoft-phi-2_results.json +316 -0
  20. data/raw-eval-outputs/mistralai-Mistral-7B-v0.3_results.json +316 -0
  21. data/raw-eval-outputs/mistralai-Mixtral-8x22B-v0.1_results.json +252 -0
  22. data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json +250 -0
  23. src/__pycache__/model_links.cpython-311.pyc +0 -0
  24. src/__pycache__/models_info.cpython-311.pyc +0 -0
  25. src/json2df.py +67 -0
  26. src/models_info.py +79 -0
app.py ADDED
@@ -0,0 +1,163 @@
+ import pandas as pd
+ import gradio as gr
+ import plotly.express as px
+ import plotly.graph_objects as go
+
+ df = pd.read_csv("data/csv/models_data.csv")
+
+
+ filter_mapping = {
+     "all": "all",
+     "🟢 Pre-trained": "🟢",
+     "🟩 Continuously pre-trained": "🟩",
+     "🔶 Fine-tuned on domain-specific data": "🔶",
+     "💬 Chat-models (RLHF, DPO, IFT, ...)": "💬"
+ }
+
+
+ def filter_items(df, query):
+     if query == "all":
+         return df
+     filter_value = filter_mapping[query]
+     return df[df["T"].str.contains(filter_value, na=False)]
+
+
+ def create_scatter_plot(df, x_col, y_col, title, x_title, y_title):
+     fig = px.scatter(df, x=x_col, y=y_col, color='Model', title=title)
+     fig.add_trace(
+         go.Scatter(
+             x=[0, 100],
+             y=[0, 100],
+             mode="lines",
+             name="y=x line",
+             line=dict(color='black', dash='dash')
+         )
+     )
+
+     fig.update_layout(
+         xaxis_title=x_title,
+         yaxis_title=y_title,
+         xaxis=dict(range=[0, 100]),
+         yaxis=dict(range=[0, 100]),
+         legend_title_text='Model'
+     )
+     fig.update_traces(marker=dict(size=10), selector=dict(mode='markers'))
+     return fig
+
+
+ with gr.Blocks(css="custom.css") as demo:
+     with gr.Row():
+         gr.Markdown(
+             """<div style="text-align: center;"><h1> <span style='color: #6aa84f;'>🐰 RABBITS:</span> <span style='color: #6aa84f;'>R</span>obust <span style='color: #6aa84f;'>A</span>ssessment of <span style='color: #6aa84f;'>B</span>iomedical <span style='color: #6aa84f;'>B</span>enchmarks <span style='color: #6aa84f;'>I</span>nvolving drug
+             <span style='color: #6aa84f;'>T</span>erm <span style='color: #6aa84f;'>S</span>ubstitutions for Language Models</h1></div>\
+             <br>\
+             <p class='markdown-text'>Robust language models are crucial in the medical domain. The RABBITS project tests the robustness of LLMs by evaluating how they handle synonyms, specifically brand and generic drug names. We assessed 16 open-source language models from Hugging Face using systematic synonym substitution on the MedQA and MedMCQA tasks. Our results show a consistent decline in performance across all model sizes, highlighting challenges in synonym comprehension. Additionally, we discovered significant dataset contamination by identifying overlaps between the MedQA and MedMCQA test sets and the Dolma 1.6 dataset using an 8-gram analysis. This highlights the need to improve model robustness and to address contamination in open-source datasets.</p>"""
+         )
+
+     with gr.Tabs(elem_classes="tab-buttons"):
+         with gr.TabItem("🔍 Evaluation table"):
+             with gr.Column():
+                 with gr.Accordion("➡️ Filter by Column", open=False):
+                     shown_columns = gr.CheckboxGroup(
+                         choices=df.columns.tolist(),
+                         value=df.columns.tolist(),
+                         label="Select Columns",
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     search_bar = gr.Textbox(
+                         placeholder="🔍 Search for your model and press ENTER...",
+                         show_label=False,
+                         elem_id="search-bar"
+                     )
+                     filter_columns = gr.Radio(
+                         label="⏚ Filter model types",
+                         choices=[
+                             "all",
+                             "🟢 Pre-trained",
+                             "🟩 Continuously pre-trained",
+                             "🔶 Fine-tuned on domain-specific data",
+                             "💬 Chat-models (RLHF, DPO, IFT, ...)"
+                         ],
+                         value="all",
+                         elem_id="filter-columns",
+                     )
+                 leaderboard_df = gr.Dataframe(
+                     value=df,
+                     headers="keys",
+                     datatype=["html" if col == "Model" else "str" for col in df.columns],
+                     interactive=False,
+                     elem_id="leaderboard-table"
+                 )
+
+                 def update_leaderboard(search_query):
+                     filtered_df = df[df["Model"].str.contains(search_query, case=False)]
+                     return filtered_df
+
+                 search_bar.submit(
+                     update_leaderboard,
+                     inputs=search_bar,
+                     outputs=leaderboard_df
+                 )
+
+                 def filter_update(query):
+                     filtered_df = filter_items(df, query)
+                     return filtered_df
+
+                 filter_columns.change(
+                     filter_update,
+                     inputs=filter_columns,
+                     outputs=leaderboard_df
+                 )
+
+                 shown_columns.change(
+                     lambda cols: df[cols],
+                     inputs=shown_columns,
+                     outputs=leaderboard_df
+                 )
+
+         with gr.TabItem("📊 Evaluation Plots"):
+             with gr.Column():
+                 with gr.Row():
+                     scatter1 = gr.Plot(
+                         value=create_scatter_plot(df, "medmcqa_orig_filtered", "medmcqa_g2b",
+                                                   "MedMCQA: Orig vs G2B", "medmcqa_orig_filtered", "medmcqa_g2b"),
+                         elem_id="scatter1"
+                     )
+                     scatter2 = gr.Plot(
+                         value=create_scatter_plot(df, "medqa_4options_orig_filtered", "medqa_4options_g2b",
+                                                   "MedQA: Orig vs G2B", "medqa_4options_orig_filtered", "medqa_4options_g2b"),
+                         elem_id="scatter2"
+                     )
+                 with gr.Row():
+                     scatter3 = gr.Plot(
+                         value=create_scatter_plot(df, "b4bqa", "b4b",
+                                                   "b4bqa vs b4b", "b4bqa", "b4b"),
+                         elem_id="scatter3"
+                     )
+
+         with gr.TabItem("📝 About"):
+             gr.Markdown(
+                 """<div style="text-align: center;">
+                 <h2>About RABBITS LLM Leaderboard</h2>
+                 <p>This leaderboard ...</p>
+                 <p>It is designed to ...</p>
+                 </div>""",
+                 elem_classes="markdown-text"
+             )
+
+         with gr.TabItem("🚀 Submit Here!"):
+             gr.Markdown(
+                 """<div style="text-align: center;">
+                 <h2>Submit Your Model Results</h2>
+                 <p>If you have new model results that you would like to add to the leaderboard, please follow the submission guidelines below:</p>
+                 <ul>
+                 <li>COMING SOON</li>
+                 </ul>
+                 <p>COMING SOON</p>
+                 </div>""",
+                 elem_classes="markdown-text"
+             )
+
+ if __name__ == "__main__":
+     demo.launch()
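A quick way to preview the three Evaluation Plots panels without launching the Space is to call create_scatter_plot directly. A minimal sketch, assuming the repo root as working directory and the dependencies above installed (the output filename is just an example):

    import pandas as pd
    from app import create_scatter_plot  # importing app.py builds the Blocks but does not launch them

    df = pd.read_csv("data/csv/models_data.csv")

    # Same panel as "MedMCQA: Orig vs G2B"; points below the dashed y=x line are
    # models that lose accuracy once generic drug names are swapped for brand names.
    fig = create_scatter_plot(df, "medmcqa_orig_filtered", "medmcqa_g2b",
                              "MedMCQA: Orig vs G2B", "medmcqa_orig_filtered", "medmcqa_g2b")
    fig.write_html("medmcqa_orig_vs_g2b.html")  # open in a browser to inspect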
custom.css ADDED
@@ -0,0 +1,62 @@
+ #changelog-text {
+     font-size: 16px !important;
+ }
+ #changelog-text h2 {
+     font-size: 18px !important;
+ }
+ .markdown-text {
+     font-size: 16px !important;
+ }
+ #models-to-add-text {
+     font-size: 18px !important;
+ }
+ #citation-button span {
+     font-size: 16px !important;
+ }
+ #citation-button textarea {
+     font-size: 16px !important;
+ }
+ #citation-button > label > button {
+     margin: 6px;
+     transform: scale(1.3);
+ }
+ #leaderboard-table {
+     margin-top: 15px
+ }
+ #leaderboard-table-lite {
+     margin-top: 15px
+ }
+ #search-bar-table-box > div:first-child {
+     background: none;
+     border: none;
+ }
+
+ #search-bar {
+     padding: 0px;
+ }
+ /* Hides the final AutoEvalColumn */
+ #llm-benchmark-tab-table table td:last-child,
+ #llm-benchmark-tab-table table th:last-child {
+     display: none;
+ }
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ table td:first-child,
+ table th:first-child {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+ .tab-buttons button {
+     font-size: 20px;
+ }
+ #scale-logo {
+     border-style: none !important;
+     box-shadow: none;
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+     max-width: 600px;
+ }
+ #scale-logo .download {
+     display: none;
+ }
data/csv/models_data.csv ADDED
@@ -0,0 +1,20 @@
+ T,Model,b4bqa,b4b,medmcqa_g2b,medmcqa_orig_filtered,medmcqa_diff,medqa_4options_g2b,medqa_4options_orig_filtered,medqa_diff
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/01-ai/Yi-1.5-34B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">01-ai-Yi-1.5-34B</a>",85.16,75.37,59.77,69.25,-9.48,59.79,64.55,-4.76
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">aaditya-Llama3-OpenBioLLM-70B</a>",85.1,78.76,63.22,73.85,-10.63,70.9,75.4,-4.5
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/aya-23-35B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI-aya-23-35B</a>",78.4,65.72,48.56,52.87,-4.31,47.88,51.06,-3.18
+ 💬,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/c4ai-command-r-plus"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI-c4ai-command-r-plus</a>",84.93,72.41,49.14,61.49,-12.35,56.61,60.32,-3.71
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/johnsnowlabs/JSL-MedLlama-3-8B-v9"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">johnsnowlabs-JSL-MedLlama-3-8B-v9</a>",75.17,74.45,64.08,77.01,-12.93,70.63,82.01,-11.38
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Llama-2-70B-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Llama-2-70B-hf</a>",77.01,65.63,45.98,52.3,-6.32,52.65,55.03,-2.38
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Llama-2-7b-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Llama-2-7b-hf</a>",36.83,36.0,33.91,34.2,-0.29,34.39,37.3,-2.91
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Meta-Llama-3-70B</a>",90.12,82.55,66.67,78.16,-11.49,72.75,75.13,-2.38
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Meta-Llama-3-8B</a>",82.7,71.21,52.87,59.2,-6.33,55.03,60.85,-5.82
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1_5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1_5</a>",28.01,30.24,31.61,30.46,1.15,34.92,34.66,0.26
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1</a>",19.64,21.18,24.14,25.86,-1.72,21.69,20.9,0.79
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-2</a>",47.49,44.79,37.64,42.24,-4.6,41.8,43.92,-2.12
+ 💬,"<a target=""_blank"" href=""https://huggingface.co/microsoft/Phi-3-medium-4k-instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-Phi-3-medium-4k-instruct</a>",69.98,65.94,60.34,72.41,-12.07,53.44,58.47,-5.03
+ 🟩,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mistral-7B-v0.3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mistral-7B-v0.3</a>",70.31,61.99,48.28,56.9,-8.62,48.68,53.17,-4.49
+ 🟩,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x22B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x22B-v0.1</a>",87.72,78.82,61.78,70.4,-8.62,67.46,71.43,-3.97
+ 🟩,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x7B-v0.1</a>",86.1,74.75,55.46,64.94,-9.48,60.05,62.43,-2.38
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ProbeMedicalYonseiMAILab-medllama3-v20</a>",71.93,74.75,65.23,80.17,-14.94,76.46,90.21,-13.75
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-72B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-72B</a>",91.02,83.72,71.55,77.87,-6.32,74.07,75.4,-1.33
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-7B</a>",80.41,70.28,55.17,63.51,-8.34,53.7,58.99,-5.29
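The two *_diff columns appear to be the generic-to-brand score minus the corresponding original filtered score (for example, 59.77 - 69.25 = -9.48 for Yi-1.5-34B). A short pandas check of that relationship, assuming the header row above:

    import pandas as pd

    df = pd.read_csv("data/csv/models_data.csv")

    # Recompute the drop columns and compare against the stored values.
    medmcqa_drop = (df["medmcqa_g2b"] - df["medmcqa_orig_filtered"]).round(2)
    medqa_drop = (df["medqa_4options_g2b"] - df["medqa_4options_orig_filtered"]).round(2)

    print((medmcqa_drop - df["medmcqa_diff"]).abs().max())  # expected to be ~0
    print((medqa_drop - df["medqa_diff"]).abs().max())      # expected to be ~0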
data/raw-eval-outputs/01-ai-Yi-1.5-34B_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7536991368680641,
5
+ "acc_stderr,none": 0.09728135187806679,
6
+ "acc_norm,none": 0.7536991368680641,
7
+ "acc_norm_stderr,none": 0.09728135187806679,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8515625,
12
+ "acc_stderr,none": 0.008401025189152976,
13
+ "acc_norm,none": 0.8515625,
14
+ "acc_norm_stderr,none": 0.008401025189152976,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5977011494252874,
19
+ "acc_stderr,none": 0.026323989201783506,
20
+ "acc_norm,none": 0.5977011494252874,
21
+ "acc_norm_stderr,none": 0.026323989201783506,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6925287356321839,
26
+ "acc_stderr,none": 0.024771735192072118,
27
+ "acc_norm,none": 0.6925287356321839,
28
+ "acc_norm_stderr,none": 0.024771735192072118,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5978835978835979,
33
+ "acc_stderr,none": 0.025253032554997695,
34
+ "acc_norm,none": 0.5978835978835979,
35
+ "acc_norm_stderr,none": 0.025253032554997695,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6455026455026455,
40
+ "acc_stderr,none": 0.024636830602842,
41
+ "acc_norm,none": 0.6455026455026455,
42
+ "acc_norm_stderr,none": 0.024636830602842,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7536991368680641,
49
+ "acc_stderr,none": 0.09728135187806679,
50
+ "acc_norm,none": 0.7536991368680641,
51
+ "acc_norm_stderr,none": 0.09728135187806679,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f7ab0e88700>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f7ab0ed1f30>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f7aa1ac9120>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f7ab0d90700>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f7ab0d90a60>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f7ab289e560>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f7ab0e6d000>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=01-ai/Yi-1.5-34B,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 64
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
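Each *_results.json above follows the lm-eval-harness-style layout, and the leaderboard numbers in models_data.csv are the "acc,none" values scaled to percentages (0.7536991… becomes 75.37 for b4b). src/json2df.py presumably performs this flattening; since its body is not shown in this view, the sketch below is only an illustrative stand-in:

    import glob
    import json

    import pandas as pd

    # Illustrative flattening of the raw eval outputs into one leaderboard row per model;
    # the committed src/json2df.py may differ in details (column order, model links, typing).
    rows = []
    for path in glob.glob("data/raw-eval-outputs/*_results.json"):
        with open(path) as f:
            data = json.load(f)
        row = {"Model": path.split("/")[-1].removesuffix("_results.json")}
        for task, metrics in data["results"].items():
            row[task] = round(metrics["acc,none"] * 100, 2)
        rows.append(row)

    print(pd.DataFrame(rows).head())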
data/raw-eval-outputs/CohereForAI-aya-23-35B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.657213316892725,
5
+ "acc_stderr,none": 0.12271990860540663,
6
+ "acc_norm,none": 0.657213316892725,
7
+ "acc_norm_stderr,none": 0.12271990860540663,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7840401785714286,
12
+ "acc_stderr,none": 0.009723169269065642,
13
+ "acc_norm,none": 0.7840401785714286,
14
+ "acc_norm_stderr,none": 0.009723169269065642,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.48563218390804597,
19
+ "acc_stderr,none": 0.026830322100875627,
20
+ "acc_norm,none": 0.48563218390804597,
21
+ "acc_norm_stderr,none": 0.026830322100875627,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5287356321839081,
26
+ "acc_stderr,none": 0.026797041830104146,
27
+ "acc_norm,none": 0.5287356321839081,
28
+ "acc_norm_stderr,none": 0.026797041830104146,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.47883597883597884,
33
+ "acc_stderr,none": 0.025728230952130723,
34
+ "acc_norm,none": 0.47883597883597884,
35
+ "acc_norm_stderr,none": 0.025728230952130723,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5105820105820106,
40
+ "acc_stderr,none": 0.02574554227604548,
41
+ "acc_norm,none": 0.5105820105820106,
42
+ "acc_norm_stderr,none": 0.02574554227604548,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.657213316892725,
49
+ "acc_stderr,none": 0.12271990860540663,
50
+ "acc_norm,none": 0.657213316892725,
51
+ "acc_norm_stderr,none": 0.12271990860540663,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f77e7a6d090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f77e770c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f77e770c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f77e770f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f77e770fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f77e770fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f77e770feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=CohereForAI/aya-23-35B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/CohereForAI-c4ai-command-r-plus_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7241060419235512,
5
+ "acc_stderr,none": 0.12287593035527263,
6
+ "acc_norm,none": 0.7241060419235512,
7
+ "acc_norm_stderr,none": 0.12287593035527263,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8493303571428571,
12
+ "acc_stderr,none": 0.00845285482249418,
13
+ "acc_norm,none": 0.8493303571428571,
14
+ "acc_norm_stderr,none": 0.00845285482249418,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.49137931034482757,
19
+ "acc_stderr,none": 0.026837416550737143,
20
+ "acc_norm,none": 0.49137931034482757,
21
+ "acc_norm_stderr,none": 0.026837416550737143,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6149425287356322,
26
+ "acc_stderr,none": 0.026122534084516178,
27
+ "acc_norm,none": 0.6149425287356322,
28
+ "acc_norm_stderr,none": 0.026122534084516178,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5661375661375662,
33
+ "acc_stderr,none": 0.0255250343824749,
34
+ "acc_norm,none": 0.5661375661375662,
35
+ "acc_norm_stderr,none": 0.0255250343824749,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6031746031746031,
40
+ "acc_stderr,none": 0.025197101074246483,
41
+ "acc_norm,none": 0.6031746031746031,
42
+ "acc_norm_stderr,none": 0.025197101074246483,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7241060419235512,
49
+ "acc_stderr,none": 0.12287593035527263,
50
+ "acc_norm,none": 0.7241060419235512,
51
+ "acc_norm_stderr,none": 0.12287593035527263,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f6d9dc51090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f6d9d85fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f6d9d85feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=CohereForAI/c4ai-command-r-plus,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/ProbeMedicalYonseiMAILab-medllama3-v20_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7475339087546239,
5
+ "acc_stderr,none": 0.0611860272880456,
6
+ "acc_norm,none": 0.7475339087546239,
7
+ "acc_norm_stderr,none": 0.0611860272880456,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7193080357142857,
12
+ "acc_stderr,none": 0.01061755826614456,
13
+ "acc_norm,none": 0.7193080357142857,
14
+ "acc_norm_stderr,none": 0.01061755826614456,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.6522988505747126,
19
+ "acc_stderr,none": 0.025565932174194388,
20
+ "acc_norm,none": 0.6522988505747126,
21
+ "acc_norm_stderr,none": 0.025565932174194388,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.8017241379310345,
26
+ "acc_stderr,none": 0.021403394960161685,
27
+ "acc_norm,none": 0.8017241379310345,
28
+ "acc_norm_stderr,none": 0.021403394960161685,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7645502645502645,
33
+ "acc_stderr,none": 0.021851509822031715,
34
+ "acc_norm,none": 0.7645502645502645,
35
+ "acc_norm_stderr,none": 0.021851509822031715,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.9021164021164021,
40
+ "acc_stderr,none": 0.015304374225091422,
41
+ "acc_norm,none": 0.9021164021164021,
42
+ "acc_norm_stderr,none": 0.015304374225091422,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7475339087546239,
49
+ "acc_stderr,none": 0.0611860272880456,
50
+ "acc_norm,none": 0.7475339087546239,
51
+ "acc_norm_stderr,none": 0.0611860272880456,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f59e4b48820>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f59e4b92050>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f59d579d240>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f59e4a54820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f59e4a54b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f59e4b0e680>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f59e4b2d120>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=ProbeMedicalYonseiMAILab/medllama3-v20,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 32
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
data/raw-eval-outputs/Qwen-Qwen2-72B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.8372379778051788,
5
+ "acc_stderr,none": 0.07216098703042964,
6
+ "acc_norm,none": 0.8372379778051788,
7
+ "acc_norm_stderr,none": 0.07216098703042964,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.91015625,
12
+ "acc_stderr,none": 0.006757003132881115,
13
+ "acc_norm,none": 0.91015625,
14
+ "acc_norm_stderr,none": 0.006757003132881115,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.7155172413793104,
19
+ "acc_stderr,none": 0.024219952635630794,
20
+ "acc_norm,none": 0.7155172413793104,
21
+ "acc_norm_stderr,none": 0.024219952635630794,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7787356321839081,
26
+ "acc_stderr,none": 0.02228363451068677,
27
+ "acc_norm,none": 0.7787356321839081,
28
+ "acc_norm_stderr,none": 0.02228363451068677,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7407407407407407,
33
+ "acc_stderr,none": 0.022569897074918417,
34
+ "acc_norm,none": 0.7407407407407407,
35
+ "acc_norm_stderr,none": 0.022569897074918417,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.753968253968254,
40
+ "acc_stderr,none": 0.022182037202948368,
41
+ "acc_norm,none": 0.753968253968254,
42
+ "acc_norm_stderr,none": 0.022182037202948368,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.8372379778051788,
49
+ "acc_stderr,none": 0.07216098703042964,
50
+ "acc_norm,none": 0.8372379778051788,
51
+ "acc_norm_stderr,none": 0.07216098703042964,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fe2a537cf70>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa0430>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa05e0>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa35b0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fe2a4fa3910>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa3b50>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fe2a4fa3d90>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=Qwen/Qwen2-72B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/Qwen-Qwen2-7B_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7028360049321827,
5
+ "acc_stderr,none": 0.1004832322485701,
6
+ "acc_norm,none": 0.7028360049321827,
7
+ "acc_norm_stderr,none": 0.1004832322485701,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8041294642857143,
12
+ "acc_stderr,none": 0.009377773744245437,
13
+ "acc_norm,none": 0.8041294642857143,
14
+ "acc_norm_stderr,none": 0.009377773744245437,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5517241379310345,
19
+ "acc_stderr,none": 0.02669739777037782,
20
+ "acc_norm,none": 0.5517241379310345,
21
+ "acc_norm_stderr,none": 0.02669739777037782,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6350574712643678,
26
+ "acc_stderr,none": 0.025843659831273274,
27
+ "acc_norm,none": 0.6350574712643678,
28
+ "acc_norm_stderr,none": 0.025843659831273274,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5370370370370371,
33
+ "acc_stderr,none": 0.025680564640056882,
34
+ "acc_norm,none": 0.5370370370370371,
35
+ "acc_norm_stderr,none": 0.025680564640056882,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.58994708994709,
40
+ "acc_stderr,none": 0.025331202438944444,
41
+ "acc_norm,none": 0.58994708994709,
42
+ "acc_norm_stderr,none": 0.025331202438944444,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7028360049321827,
49
+ "acc_stderr,none": 0.1004832322485701,
50
+ "acc_norm,none": 0.7028360049321827,
51
+ "acc_norm_stderr,none": 0.1004832322485701,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f8319c60ee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f831a19e3a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f8319ac8310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f831a19e820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f831a19eb80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f8319c844c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f8319c30ee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=Qwen/Qwen2-7B,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 8,
244
+ 8,
245
+ 16,
246
+ 16,
247
+ 16,
248
+ 16,
249
+ 16,
250
+ 16,
251
+ 16,
252
+ 16,
253
+ 16,
254
+ 16,
255
+ 32,
256
+ 32,
257
+ 32,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/aaditya-Llama3-OpenBioLLM-70B_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7876078914919852,
5
+ "acc_stderr,none": 0.06728010300021042,
6
+ "acc_norm,none": 0.7876078914919852,
7
+ "acc_norm_stderr,none": 0.06728010300021042,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8510044642857143,
12
+ "acc_stderr,none": 0.008414043525477657,
13
+ "acc_norm,none": 0.8510044642857143,
14
+ "acc_norm_stderr,none": 0.008414043525477657,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.632183908045977,
19
+ "acc_stderr,none": 0.025886440903166212,
20
+ "acc_norm,none": 0.632183908045977,
21
+ "acc_norm_stderr,none": 0.025886440903166212,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7385057471264368,
26
+ "acc_stderr,none": 0.023590833013480327,
27
+ "acc_norm,none": 0.7385057471264368,
28
+ "acc_norm_stderr,none": 0.023590833013480327,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.708994708994709,
33
+ "acc_stderr,none": 0.02339382650048486,
34
+ "acc_norm,none": 0.708994708994709,
35
+ "acc_norm_stderr,none": 0.02339382650048486,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.753968253968254,
40
+ "acc_stderr,none": 0.022182037202948368,
41
+ "acc_norm,none": 0.753968253968254,
42
+ "acc_norm_stderr,none": 0.022182037202948368,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7876078914919852,
49
+ "acc_stderr,none": 0.06728010300021042,
50
+ "acc_norm,none": 0.7876078914919852,
51
+ "acc_norm_stderr,none": 0.06728010300021042,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f1d7499c820>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f1d749e6050>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f1d66619240>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f1d748a8820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f1d748a8b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f1d74962680>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f1d74981120>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=aaditya/Llama3-OpenBioLLM-70B,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 32
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
data/raw-eval-outputs/johnsnowlabs-JSL-MedLlama-3-8B-v9_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7444512946979038,
5
+ "acc_stderr,none": 0.04274747119698657,
6
+ "acc_norm,none": 0.7444512946979038,
7
+ "acc_norm_stderr,none": 0.04274747119698657,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7516741071428571,
12
+ "acc_stderr,none": 0.010208877794084196,
13
+ "acc_norm,none": 0.7516741071428571,
14
+ "acc_norm_stderr,none": 0.010208877794084196,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.6408045977011494,
19
+ "acc_stderr,none": 0.025755112822545917,
20
+ "acc_norm,none": 0.6408045977011494,
21
+ "acc_norm_stderr,none": 0.025755112822545917,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7701149425287356,
26
+ "acc_stderr,none": 0.022587512669518847,
27
+ "acc_norm,none": 0.7701149425287356,
28
+ "acc_norm_stderr,none": 0.022587512669518847,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7063492063492064,
33
+ "acc_stderr,none": 0.023456037383982033,
34
+ "acc_norm,none": 0.7063492063492064,
35
+ "acc_norm_stderr,none": 0.023456037383982033,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.8201058201058201,
40
+ "acc_stderr,none": 0.01978211983276641,
41
+ "acc_norm,none": 0.8201058201058201,
42
+ "acc_norm_stderr,none": 0.01978211983276641,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7444512946979038,
49
+ "acc_stderr,none": 0.04274747119698657,
50
+ "acc_norm,none": 0.7444512946979038,
51
+ "acc_norm_stderr,none": 0.04274747119698657,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x78639f2e7040>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x78639ef36280>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x78639f2d9e50>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x78639f24a0d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x78639f24a550>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x78639f24a670>",
195
+ "doc_to_target": "<function doc_to_target at 0x78639f24a8b0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=johnsnowlabs/JSL-MedLlama-3-8B-v9,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 8
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "a6ca0b90"
252
+ }
data/raw-eval-outputs/meta-llama-Llama-2-70B-hf_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.656288532675709,
5
+ "acc_stderr,none": 0.11099422321488661,
6
+ "acc_norm,none": 0.656288532675709,
7
+ "acc_norm_stderr,none": 0.11099422321488661,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7700892857142857,
12
+ "acc_stderr,none": 0.009942654607749084,
13
+ "acc_norm,none": 0.7700892857142857,
14
+ "acc_norm_stderr,none": 0.009942654607749084,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.45977011494252873,
19
+ "acc_stderr,none": 0.026754382675705738,
20
+ "acc_norm,none": 0.45977011494252873,
21
+ "acc_norm_stderr,none": 0.026754382675705738,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5229885057471264,
26
+ "acc_stderr,none": 0.026813021515239517,
27
+ "acc_norm,none": 0.5229885057471264,
28
+ "acc_norm_stderr,none": 0.026813021515239517,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5264550264550265,
33
+ "acc_stderr,none": 0.025715239811346758,
34
+ "acc_norm,none": 0.5264550264550265,
35
+ "acc_norm_stderr,none": 0.025715239811346758,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5502645502645502,
40
+ "acc_stderr,none": 0.02562085704293665,
41
+ "acc_norm,none": 0.5502645502645502,
42
+ "acc_norm_stderr,none": 0.02562085704293665,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.656288532675709,
49
+ "acc_stderr,none": 0.11099422321488661,
50
+ "acc_norm,none": 0.656288532675709,
51
+ "acc_norm_stderr,none": 0.11099422321488661,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fc9f1d3d090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fc9f0108550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fc9f0108700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fc9f010b6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fc9f010ba30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fc9f010bc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fc9f010beb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Llama-2-70B-hf,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/meta-llama-Llama-2-7b-hf_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.3600493218249075,
5
+ "acc_stderr,none": 0.021816304388272503,
6
+ "acc_norm,none": 0.3600493218249075,
7
+ "acc_norm_stderr,none": 0.021816304388272503,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.36830357142857145,
12
+ "acc_stderr,none": 0.011397494280772988,
13
+ "acc_norm,none": 0.36830357142857145,
14
+ "acc_norm_stderr,none": 0.011397494280772988,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.3390804597701149,
19
+ "acc_stderr,none": 0.02541329280547327,
20
+ "acc_norm,none": 0.3390804597701149,
21
+ "acc_norm_stderr,none": 0.02541329280547327,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.34195402298850575,
26
+ "acc_stderr,none": 0.025465208743331563,
27
+ "acc_norm,none": 0.34195402298850575,
28
+ "acc_norm_stderr,none": 0.025465208743331563,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.3439153439153439,
33
+ "acc_stderr,none": 0.024464426625596437,
34
+ "acc_norm,none": 0.3439153439153439,
35
+ "acc_norm_stderr,none": 0.024464426625596437,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.373015873015873,
40
+ "acc_stderr,none": 0.02490699045899257,
41
+ "acc_norm,none": 0.373015873015873,
42
+ "acc_norm_stderr,none": 0.02490699045899257,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.3600493218249075,
49
+ "acc_stderr,none": 0.021816304388272503,
50
+ "acc_norm,none": 0.3600493218249075,
51
+ "acc_norm_stderr,none": 0.021816304388272503,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f9fc69011b0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d94670>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d94820>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d977f0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f9fc4d97b50>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d97d90>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f9fc4db8040>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/meta-llama-Meta-Llama-3-70B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.8255240443896424,
5
+ "acc_stderr,none": 0.07700722588574725,
6
+ "acc_norm,none": 0.8255240443896424,
7
+ "acc_norm_stderr,none": 0.07700722588574725,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.9012276785714286,
12
+ "acc_stderr,none": 0.007049967229617683,
13
+ "acc_norm,none": 0.9012276785714286,
14
+ "acc_norm_stderr,none": 0.007049967229617683,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.6666666666666666,
19
+ "acc_stderr,none": 0.025306320600037485,
20
+ "acc_norm,none": 0.6666666666666666,
21
+ "acc_norm_stderr,none": 0.025306320600037485,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7816091954022989,
26
+ "acc_stderr,none": 0.02217927096875997,
27
+ "acc_norm,none": 0.7816091954022989,
28
+ "acc_norm_stderr,none": 0.02217927096875997,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7275132275132276,
33
+ "acc_stderr,none": 0.022930973071633363,
34
+ "acc_norm,none": 0.7275132275132276,
35
+ "acc_norm_stderr,none": 0.022930973071633363,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.7513227513227513,
40
+ "acc_stderr,none": 0.022261817692400168,
41
+ "acc_norm,none": 0.7513227513227513,
42
+ "acc_norm_stderr,none": 0.022261817692400168,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.8255240443896424,
49
+ "acc_stderr,none": 0.07700722588574725,
50
+ "acc_norm,none": 0.8255240443896424,
51
+ "acc_norm_stderr,none": 0.07700722588574725,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f572baed090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f5729f00550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f5729f00700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f5729f036d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f5729f03a30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f5729f03c70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f5729f03eb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Meta-Llama-3-70B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/meta-llama-Meta-Llama-3-8B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7120838471023427,
5
+ "acc_stderr,none": 0.11202233860795015,
6
+ "acc_norm,none": 0.7120838471023427,
7
+ "acc_norm_stderr,none": 0.11202233860795015,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8270089285714286,
12
+ "acc_stderr,none": 0.00893756370730241,
13
+ "acc_norm,none": 0.8270089285714286,
14
+ "acc_norm_stderr,none": 0.00893756370730241,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5287356321839081,
19
+ "acc_stderr,none": 0.02679704183010415,
20
+ "acc_norm,none": 0.5287356321839081,
21
+ "acc_norm_stderr,none": 0.02679704183010415,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5919540229885057,
26
+ "acc_stderr,none": 0.026383584629731508,
27
+ "acc_norm,none": 0.5919540229885057,
28
+ "acc_norm_stderr,none": 0.026383584629731508,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5502645502645502,
33
+ "acc_stderr,none": 0.025620857042936655,
34
+ "acc_norm,none": 0.5502645502645502,
35
+ "acc_norm_stderr,none": 0.025620857042936655,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6084656084656085,
40
+ "acc_stderr,none": 0.025138091388851102,
41
+ "acc_norm,none": 0.6084656084656085,
42
+ "acc_norm_stderr,none": 0.025138091388851102,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7120838471023427,
49
+ "acc_stderr,none": 0.11202233860795015,
50
+ "acc_norm,none": 0.7120838471023427,
51
+ "acc_norm_stderr,none": 0.11202233860795015,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7ff55118d090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7ff55058c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7ff55058c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7ff55058f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7ff55058fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7ff55058fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7ff55058feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Meta-Llama-3-8B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/microsoft-Phi-3-medium-4k-instruct_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.6593711467324291,
5
+ "acc_stderr,none": 0.05882406104148581,
6
+ "acc_norm,none": 0.6593711467324291,
7
+ "acc_norm_stderr,none": 0.05882406104148581,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.6997767857142857,
12
+ "acc_stderr,none": 0.010830639682891873,
13
+ "acc_norm,none": 0.6997767857142857,
14
+ "acc_norm_stderr,none": 0.010830639682891873,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.603448275862069,
19
+ "acc_stderr,none": 0.026260634141933786,
20
+ "acc_norm,none": 0.603448275862069,
21
+ "acc_norm_stderr,none": 0.026260634141933786,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7241379310344828,
26
+ "acc_stderr,none": 0.023993406146998367,
27
+ "acc_norm,none": 0.7241379310344828,
28
+ "acc_norm_stderr,none": 0.023993406146998367,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5343915343915344,
33
+ "acc_stderr,none": 0.025690321762493848,
34
+ "acc_norm,none": 0.5343915343915344,
35
+ "acc_norm_stderr,none": 0.025690321762493848,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5846560846560847,
40
+ "acc_stderr,none": 0.025379524910778398,
41
+ "acc_norm,none": 0.5846560846560847,
42
+ "acc_norm_stderr,none": 0.025379524910778398,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.6593711467324291,
49
+ "acc_stderr,none": 0.05882406104148581,
50
+ "acc_norm,none": 0.6593711467324291,
51
+ "acc_norm_stderr,none": 0.05882406104148581,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f872445dee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f87249823a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f87242cb310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f8724982820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f8724982b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f872447f4c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f872442cee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/Phi-3-medium-4k-instruct,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 8,
244
+ 16,
245
+ 32,
246
+ 32,
247
+ 32,
248
+ 32,
249
+ 32,
250
+ 32,
251
+ 32,
252
+ 32,
253
+ 32,
254
+ 32,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/microsoft-phi-1_5_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.3024044389642417,
5
+ "acc_stderr,none": 0.030335029823792846,
6
+ "acc_norm,none": 0.3024044389642417,
7
+ "acc_norm_stderr,none": 0.030335029823792846,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.28013392857142855,
12
+ "acc_stderr,none": 0.010611112414051155,
13
+ "acc_norm,none": 0.28013392857142855,
14
+ "acc_norm_stderr,none": 0.010611112414051155,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.3160919540229885,
19
+ "acc_stderr,none": 0.024959784982131285,
20
+ "acc_norm,none": 0.3160919540229885,
21
+ "acc_norm_stderr,none": 0.024959784982131285,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.3045977011494253,
26
+ "acc_stderr,none": 0.024706807658616183,
27
+ "acc_norm,none": 0.3045977011494253,
28
+ "acc_norm_stderr,none": 0.024706807658616183,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.3492063492063492,
33
+ "acc_stderr,none": 0.024552292209342654,
34
+ "acc_norm,none": 0.3492063492063492,
35
+ "acc_norm_stderr,none": 0.024552292209342654,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.34656084656084657,
40
+ "acc_stderr,none": 0.024508777521028435,
41
+ "acc_norm,none": 0.34656084656084657,
42
+ "acc_norm_stderr,none": 0.024508777521028435,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.3024044389642417,
49
+ "acc_stderr,none": 0.030335029823792846,
50
+ "acc_norm,none": 0.3024044389642417,
51
+ "acc_norm_stderr,none": 0.030335029823792846,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f202b05ff70>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f202b59e430>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f202aecb3a0>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f202b59e8b0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f202b59ec10>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f202b084550>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f202b030f70>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/phi-1_5,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 64,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/microsoft-phi-1_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.21177558569667077,
5
+ "acc_stderr,none": 0.024570863489409633,
6
+ "acc_norm,none": 0.21177558569667077,
7
+ "acc_norm_stderr,none": 0.024570863489409633,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.19642857142857142,
12
+ "acc_stderr,none": 0.009387863785916705,
13
+ "acc_norm,none": 0.19642857142857142,
14
+ "acc_norm_stderr,none": 0.009387863785916705,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.2413793103448276,
19
+ "acc_stderr,none": 0.02297193745254371,
20
+ "acc_norm,none": 0.2413793103448276,
21
+ "acc_norm_stderr,none": 0.02297193745254371,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.25862068965517243,
26
+ "acc_stderr,none": 0.023506454355379604,
27
+ "acc_norm,none": 0.25862068965517243,
28
+ "acc_norm_stderr,none": 0.023506454355379604,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.21693121693121692,
33
+ "acc_stderr,none": 0.02122708244944506,
34
+ "acc_norm,none": 0.21693121693121692,
35
+ "acc_norm_stderr,none": 0.02122708244944506,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.20899470899470898,
40
+ "acc_stderr,none": 0.02094048156533485,
41
+ "acc_norm,none": 0.20899470899470898,
42
+ "acc_norm_stderr,none": 0.02094048156533485,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.21177558569667077,
49
+ "acc_stderr,none": 0.024570863489409633,
50
+ "acc_norm,none": 0.21177558569667077,
51
+ "acc_norm_stderr,none": 0.024570863489409633,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f3613c5fee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f36141953a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f3613acb310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f3614195820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f3614195b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f3613c814c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f3613c30ee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/phi-1,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 64,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/microsoft-phi-2_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.44790382244143034,
5
+ "acc_stderr,none": 0.0343882858973779,
6
+ "acc_norm,none": 0.44790382244143034,
7
+ "acc_norm_stderr,none": 0.0343882858973779,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.47488839285714285,
12
+ "acc_stderr,none": 0.01179977682900124,
13
+ "acc_norm,none": 0.47488839285714285,
14
+ "acc_norm_stderr,none": 0.01179977682900124,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.3764367816091954,
19
+ "acc_stderr,none": 0.02600887296285643,
20
+ "acc_norm,none": 0.3764367816091954,
21
+ "acc_norm_stderr,none": 0.02600887296285643,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.4224137931034483,
26
+ "acc_stderr,none": 0.02651628723013287,
27
+ "acc_norm,none": 0.4224137931034483,
28
+ "acc_norm_stderr,none": 0.02651628723013287,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.41798941798941797,
33
+ "acc_stderr,none": 0.02540255550326091,
34
+ "acc_norm,none": 0.41798941798941797,
35
+ "acc_norm_stderr,none": 0.02540255550326091,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.43915343915343913,
40
+ "acc_stderr,none": 0.025559920550531003,
41
+ "acc_norm,none": 0.43915343915343913,
42
+ "acc_norm_stderr,none": 0.025559920550531003,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.44790382244143034,
49
+ "acc_stderr,none": 0.0343882858973779,
50
+ "acc_norm,none": 0.44790382244143034,
51
+ "acc_norm_stderr,none": 0.0343882858973779,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fe111a60ee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fe111f9e3a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fe1118cb310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fe111f9e820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fe111f9eb80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fe111a834c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fe111a30ee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/phi-2,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 32,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/mistralai-Mistral-7B-v0.3_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.6199136868064118,
5
+ "acc_stderr,none": 0.0837373393352743,
6
+ "acc_norm,none": 0.6199136868064118,
7
+ "acc_norm_stderr,none": 0.0837373393352743,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.703125,
12
+ "acc_stderr,none": 0.010795811437682205,
13
+ "acc_norm,none": 0.703125,
14
+ "acc_norm_stderr,none": 0.010795811437682205,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.4827586206896552,
19
+ "acc_stderr,none": 0.026825443578224806,
20
+ "acc_norm,none": 0.4827586206896552,
21
+ "acc_norm_stderr,none": 0.026825443578224806,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5689655172413793,
26
+ "acc_stderr,none": 0.026584851780353615,
27
+ "acc_norm,none": 0.5689655172413793,
28
+ "acc_norm_stderr,none": 0.026584851780353615,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.48677248677248675,
33
+ "acc_stderr,none": 0.025742297289575142,
34
+ "acc_norm,none": 0.48677248677248675,
35
+ "acc_norm_stderr,none": 0.025742297289575142,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5317460317460317,
40
+ "acc_stderr,none": 0.0256993528321318,
41
+ "acc_norm,none": 0.5317460317460317,
42
+ "acc_norm_stderr,none": 0.0256993528321318,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.6199136868064118,
49
+ "acc_stderr,none": 0.0837373393352743,
50
+ "acc_norm,none": 0.6199136868064118,
51
+ "acc_norm_stderr,none": 0.0837373393352743,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f6e18c5ff70>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f6e1919e430>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f6e18acb3a0>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f6e1919e8b0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f6e1919ec10>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f6e18c80550>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f6e18c2ff70>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.3,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 64,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/mistralai-Mixtral-8x22B-v0.1_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7882244143033293,
5
+ "acc_stderr,none": 0.08841138813945006,
6
+ "acc_norm,none": 0.7882244143033293,
7
+ "acc_norm_stderr,none": 0.08841138813945006,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8772321428571429,
12
+ "acc_stderr,none": 0.007754464516034243,
13
+ "acc_norm,none": 0.8772321428571429,
14
+ "acc_norm_stderr,none": 0.007754464516034243,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.617816091954023,
19
+ "acc_stderr,none": 0.026085614333362674,
20
+ "acc_norm,none": 0.617816091954023,
21
+ "acc_norm_stderr,none": 0.026085614333362674,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7040229885057471,
26
+ "acc_stderr,none": 0.024505167376090542,
27
+ "acc_norm,none": 0.7040229885057471,
28
+ "acc_norm_stderr,none": 0.024505167376090542,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.6746031746031746,
33
+ "acc_stderr,none": 0.024130158299762613,
34
+ "acc_norm,none": 0.6746031746031746,
35
+ "acc_norm_stderr,none": 0.024130158299762613,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.7142857142857143,
40
+ "acc_stderr,none": 0.023266512213730585,
41
+ "acc_norm,none": 0.7142857142857143,
42
+ "acc_norm_stderr,none": 0.023266512213730585,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7882244143033293,
49
+ "acc_stderr,none": 0.08841138813945006,
50
+ "acc_norm,none": 0.7882244143033293,
51
+ "acc_norm_stderr,none": 0.08841138813945006,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7ff2d0094820>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7ff2d00de050>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7ff2c2d29240>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7ff2c4f34820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7ff2c4f34b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7ff2d005a680>",
195
+ "doc_to_target": "<function doc_to_target at 0x7ff2d0079120>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=mistralai/Mixtral-8x22B-v0.1,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 32
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7475339087546239,
5
+ "acc_stderr,none": 0.11087824048509952,
6
+ "acc_norm,none": 0.7475339087546239,
7
+ "acc_norm_stderr,none": 0.11087824048509952,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8610491071428571,
12
+ "acc_stderr,none": 0.008173288677884256,
13
+ "acc_norm,none": 0.8610491071428571,
14
+ "acc_norm_stderr,none": 0.008173288677884256,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5545977011494253,
19
+ "acc_stderr,none": 0.026680902895795475,
20
+ "acc_norm,none": 0.5545977011494253,
21
+ "acc_norm_stderr,none": 0.026680902895795475,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6494252873563219,
26
+ "acc_stderr,none": 0.025614751890362768,
27
+ "acc_norm,none": 0.6494252873563219,
28
+ "acc_norm_stderr,none": 0.025614751890362768,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.6005291005291006,
33
+ "acc_stderr,none": 0.025225450284067932,
34
+ "acc_norm,none": 0.6005291005291006,
35
+ "acc_norm_stderr,none": 0.025225450284067932,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6243386243386243,
40
+ "acc_stderr,none": 0.02494236893115979,
41
+ "acc_norm,none": 0.6243386243386243,
42
+ "acc_norm_stderr,none": 0.02494236893115979,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7475339087546239,
49
+ "acc_stderr,none": 0.11087824048509952,
50
+ "acc_norm,none": 0.7475339087546239,
51
+ "acc_norm_stderr,none": 0.11087824048509952,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fb0afadd090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fb0adf3fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fb0adf3feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
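
The raw result files above all appear to follow the output schema of EleutherAI's lm-evaluation-harness ("results", "groups", "configs", "versions", "n-shot", "config", "git_hash"). As a hypothetical sketch only, not part of this commit, a file like the Mixtral-8x7B one could be regenerated roughly as follows, assuming the custom RABBITS task YAMLs (b4b, b4bqa, medmcqa_g2b, medmcqa_orig_filtered, medqa_4options_g2b, medqa_4options_orig_filtered) are registered with the harness; the API call mirrors the "config" block above, and everything else is an assumption.

import json

import lm_eval  # EleutherAI lm-evaluation-harness

# Sketch: evaluate the "b4b" group (which expands to the b4bqa/medmcqa/medqa subtasks)
# with the same model_args, batch size, and device recorded in the config block above.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=mistralai/Mixtral-8x7B-v0.1,load_in_4bit=True",
    tasks=["b4b"],
    num_fewshot=0,
    batch_size=4,
    device="cuda:0",
)

# The "configs" section holds Python callables (hence the "<function ...>" strings
# in the files above), so fall back to str() when dumping to JSON.
with open("data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json", "w") as f:
    json.dump(results, f, indent=2, default=str)
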
src/__pycache__/model_links.cpython-311.pyc ADDED
Binary file (1.97 kB).
 
src/__pycache__/models_info.cpython-311.pyc ADDED
Binary file (2.43 kB).
 
src/json2df.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ from models_info import model_info
5
+
6
+ directory = 'data/raw-eval-outputs'
7
+ data = []
8
+
9
+ def model_hyperlink(link, model_name):
10
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
11
+
12
+ def make_clickable_names(df):
13
+ df["Model"] = df.apply(
14
+ lambda row: model_hyperlink(row["Link"], row["Model"]), axis=1
15
+ )
16
+ return df
17
+
18
+ # Iterate over all the files in the directory
19
+ for filename in os.listdir(directory):
20
+ if filename.endswith(".json"):
21
+ filepath = os.path.join(directory, filename)
22
+ with open(filepath, 'r') as f:
23
+ json_data = json.load(f)
24
+ model_name = filename.replace("_results.json", "")
25
+
26
+ # Extract the accuracy values
27
+ results = json_data['results']
28
+ row = {'Model': model_name}
29
+ for key, value in results.items():
30
+ row[key] = round(value['acc,none'] * 100, 2)
31
+
32
+ # Add the tuning type and link to the row
33
+ row['T'] = model_info[model_name]['tuning']
34
+ row['Link'] = model_info[model_name]['link']
35
+
36
+ data.append(row)
37
+
38
+
39
+ df = pd.DataFrame(data)
40
+ df = make_clickable_names(df)
41
+ df.drop(columns=["Link"], inplace=True)
42
+
43
+ df['medmcqa_diff'] = (df['medmcqa_g2b'] - df['medmcqa_orig_filtered']).round(2)
44
+ df['medqa_diff'] = (df['medqa_4options_g2b'] - df['medqa_4options_orig_filtered']).round(2)
45
+
46
+ # Reorder columns
47
+ cols = [
48
+ "T",
49
+ "Model",
50
+ "b4bqa",
51
+ "b4b",
52
+ "medmcqa_g2b",
53
+ "medmcqa_orig_filtered",
54
+ "medmcqa_diff",
55
+ "medqa_4options_g2b",
56
+ "medqa_4options_orig_filtered",
57
+ "medqa_diff"
58
+ ] + [col for col in df.columns if col not in [
59
+ "T", "Model", "b4bqa", "b4b", "medmcqa_g2b", "medmcqa_orig_filtered", "medmcqa_diff", "medqa_4options_g2b", "medqa_4options_orig_filtered", "medqa_diff"
60
+ ]]
61
+ df = df[cols]
62
+
63
+
64
+ output_csv = 'data/csv/models_data.csv'
65
+ df.to_csv(output_csv, index=False)
66
+
67
+ print(f"DataFrame saved to {output_csv}")
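
src/json2df.py reads each *_results.json from data/raw-eval-outputs, joins it with the metadata in src/models_info.py, and writes data/csv/models_data.csv with per-task accuracies plus the generic-to-brand deltas (medmcqa_diff, medqa_diff); given the relative paths, it appears intended to be run from the repository root (e.g. python src/json2df.py). Below is a minimal, hypothetical sketch of how the regenerated CSV could be sanity-checked; the column names come from the script above, everything else is an assumption.

import pandas as pd

# Leaderboard table written by src/json2df.py.
df = pd.read_csv("data/csv/models_data.csv")

# Mean accuracy change when generic drug names are swapped for brand names;
# negative values mean the model scores lower on the swapped (g2b) versions.
print("mean MedMCQA delta:", round(df["medmcqa_diff"].mean(), 2))
print("mean MedQA delta:", round(df["medqa_diff"].mean(), 2))

# Models ranked by the combined b4b group score.
print(df.sort_values("b4b", ascending=False)[["Model", "b4b", "b4bqa"]].head())
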
src/models_info.py ADDED
@@ -0,0 +1,79 @@
1
+ # Feel free to correct these categories; model size should probably also be added.
2
+ model_info = {
3
+ "meta-llama-Meta-Llama-3-70B": {
4
+ "link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B",
5
+ "tuning": "🟒" # Pre-trained
6
+ },
7
+ "meta-llama-Meta-Llama-3-8B": {
8
+ "link": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",
9
+ "tuning": "🟒" # Pre-trained
10
+ },
11
+ "01-ai-Yi-1.5-34B": {
12
+ "link": "https://huggingface.co/01-ai/Yi-1.5-34B",
13
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
14
+ },
15
+ "aaditya-Llama3-OpenBioLLM-70B": {
16
+ "link": "https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B",
17
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
18
+ },
19
+ "CohereForAI-aya-23-35B": {
20
+ "link": "https://huggingface.co/CohereForAI/aya-23-35B",
21
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
22
+ },
23
+ "CohereForAI-c4ai-command-r-plus": {
24
+ "link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
25
+ "tuning": "πŸ’¬" # Chat-model (RLHF, DPO, IFT, etc.)
26
+ },
27
+ "johnsnowlabs-JSL-MedLlama-3-8B-v9": {
28
+ "link": "https://huggingface.co/johnsnowlabs/JSL-MedLlama-3-8B-v9",
29
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
30
+ },
31
+ "meta-llama-Llama-2-70B-hf": {
32
+ "link": "https://huggingface.co/meta-llama/Llama-2-70B-hf",
33
+ "tuning": "🟒" # Pre-trained
34
+ },
35
+ "meta-llama-Llama-2-7b-hf": {
36
+ "link": "https://huggingface.co/meta-llama/Llama-2-7b-hf",
37
+ "tuning": "🟒" # Pre-trained
38
+ },
39
+ "microsoft-phi-1_5": {
40
+ "link": "https://huggingface.co/microsoft/phi-1_5",
41
+ "tuning": "🟒" # Pre-trained
42
+ },
43
+ "microsoft-phi-1": {
44
+ "link": "https://huggingface.co/microsoft/phi-1",
45
+ "tuning": "🟒" # Pre-trained
46
+ },
47
+ "microsoft-phi-2": {
48
+ "link": "https://huggingface.co/microsoft/phi-2",
49
+ "tuning": "🟒" # Pre-trained
50
+ },
51
+ "microsoft-Phi-3-medium-4k-instruct": {
52
+ "link": "https://huggingface.co/microsoft/Phi-3-medium-4k-instruct",
53
+ "tuning": "πŸ’¬" # Chat-model (RLHF, DPO, IFT, etc.)
54
+ },
55
+ "mistralai-Mistral-7B-v0.3": {
56
+ "link": "https://huggingface.co/mistralai/Mistral-7B-v0.3",
57
+ "tuning": "🟒" # Pre-trained
58
+ },
59
+ "mistralai-Mixtral-8x22B-v0.1": {
60
+ "link": "https://huggingface.co/mistralai/Mixtral-8x22B-v0.1",
61
+ "tuning": "🟒" # Pre-trained
62
+ },
63
+ "mistralai-Mixtral-8x7B-v0.1": {
64
+ "link": "https://huggingface.co/mistralai/Mixtral-8x7B-v0.1",
65
+ "tuning": "🟒" # Pre-trained
66
+ },
67
+ "ProbeMedicalYonseiMAILab-medllama3-v20": {
68
+ "link": "https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20",
69
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
70
+ },
71
+ "Qwen-Qwen2-72B": {
72
+ "link": "https://huggingface.co/Qwen/Qwen2-72B",
73
+ "tuning": "🟒" # Pre-trained
74
+ },
75
+ "Qwen-Qwen2-7B": {
76
+ "link": "https://huggingface.co/Qwen/Qwen2-7B",
77
+ "tuning": "🟒" # Pre-trained
78
+ },
79
+ }