magilogi committed
Commit
4c59875
•
1 Parent(s): cfb403c

rabbits-leaderboard-v0.1

Files changed (26)
  1. app.py +163 -0
  2. custom.css +62 -0
  3. data/csv/models_data.csv +20 -0
  4. data/raw-eval-outputs/01-ai-Yi-1.5-34B_results.json +252 -0
  5. data/raw-eval-outputs/CohereForAI-aya-23-35B_results.json +250 -0
  6. data/raw-eval-outputs/CohereForAI-c4ai-command-r-plus_results.json +250 -0
  7. data/raw-eval-outputs/ProbeMedicalYonseiMAILab-medllama3-v20_results.json +252 -0
  8. data/raw-eval-outputs/Qwen-Qwen2-72B_results.json +250 -0
  9. data/raw-eval-outputs/Qwen-Qwen2-7B_results.json +316 -0
  10. data/raw-eval-outputs/aaditya-Llama3-OpenBioLLM-70B_results.json +252 -0
  11. data/raw-eval-outputs/johnsnowlabs-JSL-MedLlama-3-8B-v9_results.json +252 -0
  12. data/raw-eval-outputs/meta-llama-Llama-2-70B-hf_results.json +250 -0
  13. data/raw-eval-outputs/meta-llama-Llama-2-7b-hf_results.json +250 -0
  14. data/raw-eval-outputs/meta-llama-Meta-Llama-3-70B_results.json +250 -0
  15. data/raw-eval-outputs/meta-llama-Meta-Llama-3-8B_results.json +250 -0
  16. data/raw-eval-outputs/microsoft-Phi-3-medium-4k-instruct_results.json +316 -0
  17. data/raw-eval-outputs/microsoft-phi-1_5_results.json +316 -0
  18. data/raw-eval-outputs/microsoft-phi-1_results.json +316 -0
  19. data/raw-eval-outputs/microsoft-phi-2_results.json +316 -0
  20. data/raw-eval-outputs/mistralai-Mistral-7B-v0.3_results.json +316 -0
  21. data/raw-eval-outputs/mistralai-Mixtral-8x22B-v0.1_results.json +252 -0
  22. data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json +250 -0
  23. src/__pycache__/model_links.cpython-311.pyc +0 -0
  24. src/__pycache__/models_info.cpython-311.pyc +0 -0
  25. src/json2df.py +67 -0
  26. src/models_info.py +79 -0
app.py ADDED
@@ -0,0 +1,163 @@
+ import pandas as pd
+ import gradio as gr
+ import plotly.express as px
+ import plotly.graph_objects as go
+
+ df = pd.read_csv("data/csv/models_data.csv")
+
+
+ filter_mapping = {
+     "all": "all",
+     "🟢 Pre-trained": "🟢",
+     "🟩 Continuously pre-trained": "🟩",
+     "🔶 Fine-tuned on domain-specific data": "🔶",
+     "💬 Chat-models (RLHF, DPO, IFT, ...)": "💬"
+ }
+
+
+ def filter_items(df, query):
+     if query == "all":
+         return df
+     filter_value = filter_mapping[query]
+     return df[df["T"].str.contains(filter_value, na=False)]
+
+
+ def create_scatter_plot(df, x_col, y_col, title, x_title, y_title):
+     fig = px.scatter(df, x=x_col, y=y_col, color='Model', title=title)
+     fig.add_trace(
+         go.Scatter(
+             x=[0, 100],
+             y=[0, 100],
+             mode="lines",
+             name="y=x line",
+             line=dict(color='black', dash='dash')
+         )
+     )
+
+     fig.update_layout(
+         xaxis_title=x_title,
+         yaxis_title=y_title,
+         xaxis=dict(range=[0, 100]),
+         yaxis=dict(range=[0, 100]),
+         legend_title_text='Model'
+     )
+     fig.update_traces(marker=dict(size=10), selector=dict(mode='markers'))
+     return fig
+
+
+ with gr.Blocks(css="custom.css") as demo:
+     with gr.Row():
+         gr.Markdown(
+             """<div style="text-align: center;"><h1> <span style='color: #6aa84f;'>🐰 RABBITS:</span> <span style='color: #6aa84f;'>R</span>obust <span style='color: #6aa84f;'>A</span>ssessment of <span style='color: #6aa84f;'>B</span>iomedical <span style='color: #6aa84f;'>B</span>enchmarks <span style='color: #6aa84f;'>I</span>nvolving drug
+             <span style='color: #6aa84f;'>T</span>erm <span style='color: #6aa84f;'>S</span>ubstitutions for Language Models</h1></div>\
+             <br>\
+             <p class='markdown-text'>Robust language models are crucial in the medical domain. The RABBITS project tests the robustness of LLMs by evaluating how they handle synonyms, specifically brand and generic drug names. We assessed 16 open-source language models from Hugging Face using systematic synonym substitution on the MedQA and MedMCQA tasks. Our results show a consistent decline in performance across all model sizes, highlighting challenges in synonym comprehension. Additionally, we discovered significant dataset contamination by identifying overlaps between the MedQA and MedMCQA test sets and the Dolma 1.6 dataset using an 8-gram analysis. This highlights the need to improve model robustness and to address contamination in open-source datasets.</p>"""
+         )
+
+     with gr.Tabs(elem_classes="tab-buttons"):
+         with gr.TabItem("🔍 Evaluation table"):
+             with gr.Column():
+                 with gr.Accordion("➡️ Filter by Column", open=False):
+                     shown_columns = gr.CheckboxGroup(
+                         choices=df.columns.tolist(),
+                         value=df.columns.tolist(),
+                         label="Select Columns",
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     search_bar = gr.Textbox(
+                         placeholder="🔍 Search for your model and press ENTER...",
+                         show_label=False,
+                         elem_id="search-bar"
+                     )
+                     filter_columns = gr.Radio(
+                         label="⏚ Filter model types",
+                         choices=[
+                             "all",
+                             "🟢 Pre-trained",
+                             "🟩 Continuously pre-trained",
+                             "🔶 Fine-tuned on domain-specific data",
+                             "💬 Chat-models (RLHF, DPO, IFT, ...)"
+                         ],
+                         value="all",
+                         elem_id="filter-columns",
+                     )
+                 leaderboard_df = gr.Dataframe(
+                     value=df,
+                     headers="keys",
+                     datatype=["html" if col == "Model" else "str" for col in df.columns],
+                     interactive=False,
+                     elem_id="leaderboard-table"
+                 )
+
+                 def update_leaderboard(search_query):
+                     filtered_df = df[df["Model"].str.contains(search_query, case=False)]
+                     return filtered_df
+
+                 search_bar.submit(
+                     update_leaderboard,
+                     inputs=search_bar,
+                     outputs=leaderboard_df
+                 )
+
+                 def filter_update(query):
+                     filtered_df = filter_items(df, query)
+                     return filtered_df
+
+                 filter_columns.change(
+                     filter_update,
+                     inputs=filter_columns,
+                     outputs=leaderboard_df
+                 )
+
+                 shown_columns.change(
+                     lambda cols: df[cols],
+                     inputs=shown_columns,
+                     outputs=leaderboard_df
+                 )
+
+         with gr.TabItem("📊 Evaluation Plots"):
+             with gr.Column():
+                 with gr.Row():
+                     scatter1 = gr.Plot(
+                         value=create_scatter_plot(df, "medmcqa_orig_filtered", "medmcqa_g2b",
+                                                   "MedMCQA: Orig vs G2B", "medmcqa_orig_filtered", "medmcqa_g2b"),
+                         elem_id="scatter1"
+                     )
+                     scatter2 = gr.Plot(
+                         value=create_scatter_plot(df, "medqa_4options_orig_filtered", "medqa_4options_g2b",
+                                                   "MedQA: Orig vs G2B", "medqa_4options_orig_filtered", "medqa_4options_g2b"),
+                         elem_id="scatter2"
+                     )
+                 with gr.Row():
+                     scatter3 = gr.Plot(
+                         value=create_scatter_plot(df, "b4bqa", "b4b",
+                                                   "b4bqa vs b4b", "b4bqa", "b4b"),
+                         elem_id="scatter3"
+                     )
+
+         with gr.TabItem("📝 About"):
+             gr.Markdown(
+                 """<div style="text-align: center;">
+                 <h2>About RABBITS LLM Leaderboard</h2>
+                 <p>This leaderboard ...</p>
+                 <p>It is designed to ...</p>
+                 </div>""",
+                 elem_classes="markdown-text"
+             )
+
+         with gr.TabItem("🚀 Submit Here!"):
+             gr.Markdown(
+                 """<div style="text-align: center;">
+                 <h2>Submit Your Model Results</h2>
+                 <p>If you have new model results that you would like to add to the leaderboard, please follow the submission guidelines below:</p>
+                 <ul>
+                 <li>COMING SOON</li>
+                 </ul>
+                 <p>COMING SOON</p>
+                 </div>""",
+                 elem_classes="markdown-text"
+             )
+
+ if __name__ == "__main__":
+     demo.launch()
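A quick way to preview the three Evaluation Plots panels without launching the Space is to call create_scatter_plot directly. A minimal sketch, assuming the repo root as working directory and the dependencies above installed (the output filename is just an example):

    import pandas as pd
    from app import create_scatter_plot  # importing app.py builds the Blocks but does not launch them

    df = pd.read_csv("data/csv/models_data.csv")

    # Same panel as "MedMCQA: Orig vs G2B"; points below the dashed y=x line are
    # models that lose accuracy once generic drug names are swapped for brand names.
    fig = create_scatter_plot(df, "medmcqa_orig_filtered", "medmcqa_g2b",
                              "MedMCQA: Orig vs G2B", "medmcqa_orig_filtered", "medmcqa_g2b")
    fig.write_html("medmcqa_orig_vs_g2b.html")  # open in a browser to inspect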
custom.css ADDED
@@ -0,0 +1,62 @@
+ #changelog-text {
+     font-size: 16px !important;
+ }
+ #changelog-text h2 {
+     font-size: 18px !important;
+ }
+ .markdown-text {
+     font-size: 16px !important;
+ }
+ #models-to-add-text {
+     font-size: 18px !important;
+ }
+ #citation-button span {
+     font-size: 16px !important;
+ }
+ #citation-button textarea {
+     font-size: 16px !important;
+ }
+ #citation-button > label > button {
+     margin: 6px;
+     transform: scale(1.3);
+ }
+ #leaderboard-table {
+     margin-top: 15px
+ }
+ #leaderboard-table-lite {
+     margin-top: 15px
+ }
+ #search-bar-table-box > div:first-child {
+     background: none;
+     border: none;
+ }
+
+ #search-bar {
+     padding: 0px;
+ }
+ /* Hides the final AutoEvalColumn */
+ #llm-benchmark-tab-table table td:last-child,
+ #llm-benchmark-tab-table table th:last-child {
+     display: none;
+ }
+ /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+ table td:first-child,
+ table th:first-child {
+     max-width: 400px;
+     overflow: auto;
+     white-space: nowrap;
+ }
+ .tab-buttons button {
+     font-size: 20px;
+ }
+ #scale-logo {
+     border-style: none !important;
+     box-shadow: none;
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+     max-width: 600px;
+ }
+ #scale-logo .download {
+     display: none;
+ }
data/csv/models_data.csv ADDED
@@ -0,0 +1,20 @@
+ T,Model,b4bqa,b4b,medmcqa_g2b,medmcqa_orig_filtered,medmcqa_diff,medqa_4options_g2b,medqa_4options_orig_filtered,medqa_diff
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/01-ai/Yi-1.5-34B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">01-ai-Yi-1.5-34B</a>",85.16,75.37,59.77,69.25,-9.48,59.79,64.55,-4.76
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">aaditya-Llama3-OpenBioLLM-70B</a>",85.1,78.76,63.22,73.85,-10.63,70.9,75.4,-4.5
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/aya-23-35B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI-aya-23-35B</a>",78.4,65.72,48.56,52.87,-4.31,47.88,51.06,-3.18
+ 💬,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/c4ai-command-r-plus"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI-c4ai-command-r-plus</a>",84.93,72.41,49.14,61.49,-12.35,56.61,60.32,-3.71
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/johnsnowlabs/JSL-MedLlama-3-8B-v9"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">johnsnowlabs-JSL-MedLlama-3-8B-v9</a>",75.17,74.45,64.08,77.01,-12.93,70.63,82.01,-11.38
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Llama-2-70B-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Llama-2-70B-hf</a>",77.01,65.63,45.98,52.3,-6.32,52.65,55.03,-2.38
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Llama-2-7b-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Llama-2-7b-hf</a>",36.83,36.0,33.91,34.2,-0.29,34.39,37.3,-2.91
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Meta-Llama-3-70B</a>",90.12,82.55,66.67,78.16,-11.49,72.75,75.13,-2.38
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama-Meta-Llama-3-8B</a>",82.7,71.21,52.87,59.2,-6.33,55.03,60.85,-5.82
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1_5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1_5</a>",28.01,30.24,31.61,30.46,1.15,34.92,34.66,0.26
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-1</a>",19.64,21.18,24.14,25.86,-1.72,21.69,20.9,0.79
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/microsoft/phi-2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-phi-2</a>",47.49,44.79,37.64,42.24,-4.6,41.8,43.92,-2.12
+ 💬,"<a target=""_blank"" href=""https://huggingface.co/microsoft/Phi-3-medium-4k-instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">microsoft-Phi-3-medium-4k-instruct</a>",69.98,65.94,60.34,72.41,-12.07,53.44,58.47,-5.03
+ 🟩,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mistral-7B-v0.3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mistral-7B-v0.3</a>",70.31,61.99,48.28,56.9,-8.62,48.68,53.17,-4.49
+ 🟩,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x22B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x22B-v0.1</a>",87.72,78.82,61.78,70.4,-8.62,67.46,71.43,-3.97
+ 🟩,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai-Mixtral-8x7B-v0.1</a>",86.1,74.75,55.46,64.94,-9.48,60.05,62.43,-2.38
+ 🔶,"<a target=""_blank"" href=""https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ProbeMedicalYonseiMAILab-medllama3-v20</a>",71.93,74.75,65.23,80.17,-14.94,76.46,90.21,-13.75
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-72B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-72B</a>",91.02,83.72,71.55,77.87,-6.32,74.07,75.4,-1.33
+ 🟢,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen-Qwen2-7B</a>",80.41,70.28,55.17,63.51,-8.34,53.7,58.99,-5.29
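The two *_diff columns appear to be the generic-to-brand score minus the corresponding original filtered score (for example, 59.77 - 69.25 = -9.48 for Yi-1.5-34B). A short pandas check of that relationship, assuming the header row above:

    import pandas as pd

    df = pd.read_csv("data/csv/models_data.csv")

    # Recompute the drop columns and compare against the stored values.
    medmcqa_drop = (df["medmcqa_g2b"] - df["medmcqa_orig_filtered"]).round(2)
    medqa_drop = (df["medqa_4options_g2b"] - df["medqa_4options_orig_filtered"]).round(2)

    print((medmcqa_drop - df["medmcqa_diff"]).abs().max())  # expected to be ~0
    print((medqa_drop - df["medqa_diff"]).abs().max())      # expected to be ~0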
data/raw-eval-outputs/01-ai-Yi-1.5-34B_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7536991368680641,
5
+ "acc_stderr,none": 0.09728135187806679,
6
+ "acc_norm,none": 0.7536991368680641,
7
+ "acc_norm_stderr,none": 0.09728135187806679,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8515625,
12
+ "acc_stderr,none": 0.008401025189152976,
13
+ "acc_norm,none": 0.8515625,
14
+ "acc_norm_stderr,none": 0.008401025189152976,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5977011494252874,
19
+ "acc_stderr,none": 0.026323989201783506,
20
+ "acc_norm,none": 0.5977011494252874,
21
+ "acc_norm_stderr,none": 0.026323989201783506,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6925287356321839,
26
+ "acc_stderr,none": 0.024771735192072118,
27
+ "acc_norm,none": 0.6925287356321839,
28
+ "acc_norm_stderr,none": 0.024771735192072118,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5978835978835979,
33
+ "acc_stderr,none": 0.025253032554997695,
34
+ "acc_norm,none": 0.5978835978835979,
35
+ "acc_norm_stderr,none": 0.025253032554997695,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6455026455026455,
40
+ "acc_stderr,none": 0.024636830602842,
41
+ "acc_norm,none": 0.6455026455026455,
42
+ "acc_norm_stderr,none": 0.024636830602842,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7536991368680641,
49
+ "acc_stderr,none": 0.09728135187806679,
50
+ "acc_norm,none": 0.7536991368680641,
51
+ "acc_norm_stderr,none": 0.09728135187806679,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f7ab0e88700>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f7ab0ed1f30>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f7aa1ac9120>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f7ab0d90700>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f7ab0d90a60>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f7ab289e560>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f7ab0e6d000>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=01-ai/Yi-1.5-34B,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 64
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
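Each *_results.json above follows the lm-eval-harness-style layout, and the leaderboard numbers in models_data.csv are the "acc,none" values scaled to percentages (0.7536991… becomes 75.37 for b4b). src/json2df.py presumably performs this flattening; since its body is not shown in this view, the sketch below is only an illustrative stand-in:

    import glob
    import json

    import pandas as pd

    # Illustrative flattening of the raw eval outputs into one leaderboard row per model;
    # the committed src/json2df.py may differ in details (column order, model links, typing).
    rows = []
    for path in glob.glob("data/raw-eval-outputs/*_results.json"):
        with open(path) as f:
            data = json.load(f)
        row = {"Model": path.split("/")[-1].removesuffix("_results.json")}
        for task, metrics in data["results"].items():
            row[task] = round(metrics["acc,none"] * 100, 2)
        rows.append(row)

    print(pd.DataFrame(rows).head())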
data/raw-eval-outputs/CohereForAI-aya-23-35B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.657213316892725,
5
+ "acc_stderr,none": 0.12271990860540663,
6
+ "acc_norm,none": 0.657213316892725,
7
+ "acc_norm_stderr,none": 0.12271990860540663,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7840401785714286,
12
+ "acc_stderr,none": 0.009723169269065642,
13
+ "acc_norm,none": 0.7840401785714286,
14
+ "acc_norm_stderr,none": 0.009723169269065642,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.48563218390804597,
19
+ "acc_stderr,none": 0.026830322100875627,
20
+ "acc_norm,none": 0.48563218390804597,
21
+ "acc_norm_stderr,none": 0.026830322100875627,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5287356321839081,
26
+ "acc_stderr,none": 0.026797041830104146,
27
+ "acc_norm,none": 0.5287356321839081,
28
+ "acc_norm_stderr,none": 0.026797041830104146,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.47883597883597884,
33
+ "acc_stderr,none": 0.025728230952130723,
34
+ "acc_norm,none": 0.47883597883597884,
35
+ "acc_norm_stderr,none": 0.025728230952130723,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5105820105820106,
40
+ "acc_stderr,none": 0.02574554227604548,
41
+ "acc_norm,none": 0.5105820105820106,
42
+ "acc_norm_stderr,none": 0.02574554227604548,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.657213316892725,
49
+ "acc_stderr,none": 0.12271990860540663,
50
+ "acc_norm,none": 0.657213316892725,
51
+ "acc_norm_stderr,none": 0.12271990860540663,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f77e7a6d090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f77e770c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f77e770c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f77e770f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f77e770fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f77e770fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f77e770feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=CohereForAI/aya-23-35B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/CohereForAI-c4ai-command-r-plus_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7241060419235512,
5
+ "acc_stderr,none": 0.12287593035527263,
6
+ "acc_norm,none": 0.7241060419235512,
7
+ "acc_norm_stderr,none": 0.12287593035527263,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8493303571428571,
12
+ "acc_stderr,none": 0.00845285482249418,
13
+ "acc_norm,none": 0.8493303571428571,
14
+ "acc_norm_stderr,none": 0.00845285482249418,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.49137931034482757,
19
+ "acc_stderr,none": 0.026837416550737143,
20
+ "acc_norm,none": 0.49137931034482757,
21
+ "acc_norm_stderr,none": 0.026837416550737143,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6149425287356322,
26
+ "acc_stderr,none": 0.026122534084516178,
27
+ "acc_norm,none": 0.6149425287356322,
28
+ "acc_norm_stderr,none": 0.026122534084516178,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5661375661375662,
33
+ "acc_stderr,none": 0.0255250343824749,
34
+ "acc_norm,none": 0.5661375661375662,
35
+ "acc_norm_stderr,none": 0.0255250343824749,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6031746031746031,
40
+ "acc_stderr,none": 0.025197101074246483,
41
+ "acc_norm,none": 0.6031746031746031,
42
+ "acc_norm_stderr,none": 0.025197101074246483,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7241060419235512,
49
+ "acc_stderr,none": 0.12287593035527263,
50
+ "acc_norm,none": 0.7241060419235512,
51
+ "acc_norm_stderr,none": 0.12287593035527263,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f6d9dc51090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f6d9d85fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f6d9d85fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f6d9d85feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=CohereForAI/c4ai-command-r-plus,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/ProbeMedicalYonseiMAILab-medllama3-v20_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7475339087546239,
5
+ "acc_stderr,none": 0.0611860272880456,
6
+ "acc_norm,none": 0.7475339087546239,
7
+ "acc_norm_stderr,none": 0.0611860272880456,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7193080357142857,
12
+ "acc_stderr,none": 0.01061755826614456,
13
+ "acc_norm,none": 0.7193080357142857,
14
+ "acc_norm_stderr,none": 0.01061755826614456,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.6522988505747126,
19
+ "acc_stderr,none": 0.025565932174194388,
20
+ "acc_norm,none": 0.6522988505747126,
21
+ "acc_norm_stderr,none": 0.025565932174194388,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.8017241379310345,
26
+ "acc_stderr,none": 0.021403394960161685,
27
+ "acc_norm,none": 0.8017241379310345,
28
+ "acc_norm_stderr,none": 0.021403394960161685,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7645502645502645,
33
+ "acc_stderr,none": 0.021851509822031715,
34
+ "acc_norm,none": 0.7645502645502645,
35
+ "acc_norm_stderr,none": 0.021851509822031715,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.9021164021164021,
40
+ "acc_stderr,none": 0.015304374225091422,
41
+ "acc_norm,none": 0.9021164021164021,
42
+ "acc_norm_stderr,none": 0.015304374225091422,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7475339087546239,
49
+ "acc_stderr,none": 0.0611860272880456,
50
+ "acc_norm,none": 0.7475339087546239,
51
+ "acc_norm_stderr,none": 0.0611860272880456,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f59e4b48820>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f59e4b92050>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f59d579d240>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f59e4a54820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f59e4a54b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f59e4b0e680>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f59e4b2d120>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=ProbeMedicalYonseiMAILab/medllama3-v20,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 32
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
data/raw-eval-outputs/Qwen-Qwen2-72B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.8372379778051788,
5
+ "acc_stderr,none": 0.07216098703042964,
6
+ "acc_norm,none": 0.8372379778051788,
7
+ "acc_norm_stderr,none": 0.07216098703042964,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.91015625,
12
+ "acc_stderr,none": 0.006757003132881115,
13
+ "acc_norm,none": 0.91015625,
14
+ "acc_norm_stderr,none": 0.006757003132881115,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.7155172413793104,
19
+ "acc_stderr,none": 0.024219952635630794,
20
+ "acc_norm,none": 0.7155172413793104,
21
+ "acc_norm_stderr,none": 0.024219952635630794,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7787356321839081,
26
+ "acc_stderr,none": 0.02228363451068677,
27
+ "acc_norm,none": 0.7787356321839081,
28
+ "acc_norm_stderr,none": 0.02228363451068677,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7407407407407407,
33
+ "acc_stderr,none": 0.022569897074918417,
34
+ "acc_norm,none": 0.7407407407407407,
35
+ "acc_norm_stderr,none": 0.022569897074918417,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.753968253968254,
40
+ "acc_stderr,none": 0.022182037202948368,
41
+ "acc_norm,none": 0.753968253968254,
42
+ "acc_norm_stderr,none": 0.022182037202948368,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.8372379778051788,
49
+ "acc_stderr,none": 0.07216098703042964,
50
+ "acc_norm,none": 0.8372379778051788,
51
+ "acc_norm_stderr,none": 0.07216098703042964,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fe2a537cf70>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa0430>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa05e0>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa35b0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fe2a4fa3910>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fe2a4fa3b50>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fe2a4fa3d90>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=Qwen/Qwen2-72B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/Qwen-Qwen2-7B_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7028360049321827,
5
+ "acc_stderr,none": 0.1004832322485701,
6
+ "acc_norm,none": 0.7028360049321827,
7
+ "acc_norm_stderr,none": 0.1004832322485701,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8041294642857143,
12
+ "acc_stderr,none": 0.009377773744245437,
13
+ "acc_norm,none": 0.8041294642857143,
14
+ "acc_norm_stderr,none": 0.009377773744245437,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5517241379310345,
19
+ "acc_stderr,none": 0.02669739777037782,
20
+ "acc_norm,none": 0.5517241379310345,
21
+ "acc_norm_stderr,none": 0.02669739777037782,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6350574712643678,
26
+ "acc_stderr,none": 0.025843659831273274,
27
+ "acc_norm,none": 0.6350574712643678,
28
+ "acc_norm_stderr,none": 0.025843659831273274,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5370370370370371,
33
+ "acc_stderr,none": 0.025680564640056882,
34
+ "acc_norm,none": 0.5370370370370371,
35
+ "acc_norm_stderr,none": 0.025680564640056882,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.58994708994709,
40
+ "acc_stderr,none": 0.025331202438944444,
41
+ "acc_norm,none": 0.58994708994709,
42
+ "acc_norm_stderr,none": 0.025331202438944444,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7028360049321827,
49
+ "acc_stderr,none": 0.1004832322485701,
50
+ "acc_norm,none": 0.7028360049321827,
51
+ "acc_norm_stderr,none": 0.1004832322485701,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f8319c60ee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f831a19e3a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f8319ac8310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f831a19e820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f831a19eb80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f8319c844c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f8319c30ee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=Qwen/Qwen2-7B,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 8,
244
+ 8,
245
+ 16,
246
+ 16,
247
+ 16,
248
+ 16,
249
+ 16,
250
+ 16,
251
+ 16,
252
+ 16,
253
+ 16,
254
+ 16,
255
+ 32,
256
+ 32,
257
+ 32,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/aaditya-Llama3-OpenBioLLM-70B_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7876078914919852,
5
+ "acc_stderr,none": 0.06728010300021042,
6
+ "acc_norm,none": 0.7876078914919852,
7
+ "acc_norm_stderr,none": 0.06728010300021042,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8510044642857143,
12
+ "acc_stderr,none": 0.008414043525477657,
13
+ "acc_norm,none": 0.8510044642857143,
14
+ "acc_norm_stderr,none": 0.008414043525477657,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.632183908045977,
19
+ "acc_stderr,none": 0.025886440903166212,
20
+ "acc_norm,none": 0.632183908045977,
21
+ "acc_norm_stderr,none": 0.025886440903166212,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7385057471264368,
26
+ "acc_stderr,none": 0.023590833013480327,
27
+ "acc_norm,none": 0.7385057471264368,
28
+ "acc_norm_stderr,none": 0.023590833013480327,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.708994708994709,
33
+ "acc_stderr,none": 0.02339382650048486,
34
+ "acc_norm,none": 0.708994708994709,
35
+ "acc_norm_stderr,none": 0.02339382650048486,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.753968253968254,
40
+ "acc_stderr,none": 0.022182037202948368,
41
+ "acc_norm,none": 0.753968253968254,
42
+ "acc_norm_stderr,none": 0.022182037202948368,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7876078914919852,
49
+ "acc_stderr,none": 0.06728010300021042,
50
+ "acc_norm,none": 0.7876078914919852,
51
+ "acc_norm_stderr,none": 0.06728010300021042,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f1d7499c820>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f1d749e6050>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f1d66619240>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f1d748a8820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f1d748a8b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f1d74962680>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f1d74981120>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=aaditya/Llama3-OpenBioLLM-70B,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 32
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
data/raw-eval-outputs/johnsnowlabs-JSL-MedLlama-3-8B-v9_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7444512946979038,
5
+ "acc_stderr,none": 0.04274747119698657,
6
+ "acc_norm,none": 0.7444512946979038,
7
+ "acc_norm_stderr,none": 0.04274747119698657,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7516741071428571,
12
+ "acc_stderr,none": 0.010208877794084196,
13
+ "acc_norm,none": 0.7516741071428571,
14
+ "acc_norm_stderr,none": 0.010208877794084196,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.6408045977011494,
19
+ "acc_stderr,none": 0.025755112822545917,
20
+ "acc_norm,none": 0.6408045977011494,
21
+ "acc_norm_stderr,none": 0.025755112822545917,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7701149425287356,
26
+ "acc_stderr,none": 0.022587512669518847,
27
+ "acc_norm,none": 0.7701149425287356,
28
+ "acc_norm_stderr,none": 0.022587512669518847,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7063492063492064,
33
+ "acc_stderr,none": 0.023456037383982033,
34
+ "acc_norm,none": 0.7063492063492064,
35
+ "acc_norm_stderr,none": 0.023456037383982033,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.8201058201058201,
40
+ "acc_stderr,none": 0.01978211983276641,
41
+ "acc_norm,none": 0.8201058201058201,
42
+ "acc_norm_stderr,none": 0.01978211983276641,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7444512946979038,
49
+ "acc_stderr,none": 0.04274747119698657,
50
+ "acc_norm,none": 0.7444512946979038,
51
+ "acc_norm_stderr,none": 0.04274747119698657,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x78639f2e7040>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x78639ef36280>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x78639f2d9e50>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x78639f24a0d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x78639f24a550>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x78639f24a670>",
195
+ "doc_to_target": "<function doc_to_target at 0x78639f24a8b0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=johnsnowlabs/JSL-MedLlama-3-8B-v9,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 8
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "a6ca0b90"
252
+ }
data/raw-eval-outputs/meta-llama-Llama-2-70B-hf_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.656288532675709,
5
+ "acc_stderr,none": 0.11099422321488661,
6
+ "acc_norm,none": 0.656288532675709,
7
+ "acc_norm_stderr,none": 0.11099422321488661,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.7700892857142857,
12
+ "acc_stderr,none": 0.009942654607749084,
13
+ "acc_norm,none": 0.7700892857142857,
14
+ "acc_norm_stderr,none": 0.009942654607749084,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.45977011494252873,
19
+ "acc_stderr,none": 0.026754382675705738,
20
+ "acc_norm,none": 0.45977011494252873,
21
+ "acc_norm_stderr,none": 0.026754382675705738,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5229885057471264,
26
+ "acc_stderr,none": 0.026813021515239517,
27
+ "acc_norm,none": 0.5229885057471264,
28
+ "acc_norm_stderr,none": 0.026813021515239517,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5264550264550265,
33
+ "acc_stderr,none": 0.025715239811346758,
34
+ "acc_norm,none": 0.5264550264550265,
35
+ "acc_norm_stderr,none": 0.025715239811346758,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5502645502645502,
40
+ "acc_stderr,none": 0.02562085704293665,
41
+ "acc_norm,none": 0.5502645502645502,
42
+ "acc_norm_stderr,none": 0.02562085704293665,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.656288532675709,
49
+ "acc_stderr,none": 0.11099422321488661,
50
+ "acc_norm,none": 0.656288532675709,
51
+ "acc_norm_stderr,none": 0.11099422321488661,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fc9f1d3d090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fc9f0108550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fc9f0108700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fc9f010b6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fc9f010ba30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fc9f010bc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fc9f010beb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Llama-2-70B-hf,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/meta-llama-Llama-2-7b-hf_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.3600493218249075,
5
+ "acc_stderr,none": 0.021816304388272503,
6
+ "acc_norm,none": 0.3600493218249075,
7
+ "acc_norm_stderr,none": 0.021816304388272503,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.36830357142857145,
12
+ "acc_stderr,none": 0.011397494280772988,
13
+ "acc_norm,none": 0.36830357142857145,
14
+ "acc_norm_stderr,none": 0.011397494280772988,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.3390804597701149,
19
+ "acc_stderr,none": 0.02541329280547327,
20
+ "acc_norm,none": 0.3390804597701149,
21
+ "acc_norm_stderr,none": 0.02541329280547327,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.34195402298850575,
26
+ "acc_stderr,none": 0.025465208743331563,
27
+ "acc_norm,none": 0.34195402298850575,
28
+ "acc_norm_stderr,none": 0.025465208743331563,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.3439153439153439,
33
+ "acc_stderr,none": 0.024464426625596437,
34
+ "acc_norm,none": 0.3439153439153439,
35
+ "acc_norm_stderr,none": 0.024464426625596437,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.373015873015873,
40
+ "acc_stderr,none": 0.02490699045899257,
41
+ "acc_norm,none": 0.373015873015873,
42
+ "acc_norm_stderr,none": 0.02490699045899257,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.3600493218249075,
49
+ "acc_stderr,none": 0.021816304388272503,
50
+ "acc_norm,none": 0.3600493218249075,
51
+ "acc_norm_stderr,none": 0.021816304388272503,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f9fc69011b0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d94670>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d94820>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d977f0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f9fc4d97b50>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f9fc4d97d90>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f9fc4db8040>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Llama-2-7b-hf,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/meta-llama-Meta-Llama-3-70B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.8255240443896424,
5
+ "acc_stderr,none": 0.07700722588574725,
6
+ "acc_norm,none": 0.8255240443896424,
7
+ "acc_norm_stderr,none": 0.07700722588574725,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.9012276785714286,
12
+ "acc_stderr,none": 0.007049967229617683,
13
+ "acc_norm,none": 0.9012276785714286,
14
+ "acc_norm_stderr,none": 0.007049967229617683,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.6666666666666666,
19
+ "acc_stderr,none": 0.025306320600037485,
20
+ "acc_norm,none": 0.6666666666666666,
21
+ "acc_norm_stderr,none": 0.025306320600037485,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7816091954022989,
26
+ "acc_stderr,none": 0.02217927096875997,
27
+ "acc_norm,none": 0.7816091954022989,
28
+ "acc_norm_stderr,none": 0.02217927096875997,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.7275132275132276,
33
+ "acc_stderr,none": 0.022930973071633363,
34
+ "acc_norm,none": 0.7275132275132276,
35
+ "acc_norm_stderr,none": 0.022930973071633363,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.7513227513227513,
40
+ "acc_stderr,none": 0.022261817692400168,
41
+ "acc_norm,none": 0.7513227513227513,
42
+ "acc_norm_stderr,none": 0.022261817692400168,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.8255240443896424,
49
+ "acc_stderr,none": 0.07700722588574725,
50
+ "acc_norm,none": 0.8255240443896424,
51
+ "acc_norm_stderr,none": 0.07700722588574725,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f572baed090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f5729f00550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f5729f00700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f5729f036d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f5729f03a30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f5729f03c70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f5729f03eb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Meta-Llama-3-70B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/meta-llama-Meta-Llama-3-8B_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7120838471023427,
5
+ "acc_stderr,none": 0.11202233860795015,
6
+ "acc_norm,none": 0.7120838471023427,
7
+ "acc_norm_stderr,none": 0.11202233860795015,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8270089285714286,
12
+ "acc_stderr,none": 0.00893756370730241,
13
+ "acc_norm,none": 0.8270089285714286,
14
+ "acc_norm_stderr,none": 0.00893756370730241,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5287356321839081,
19
+ "acc_stderr,none": 0.02679704183010415,
20
+ "acc_norm,none": 0.5287356321839081,
21
+ "acc_norm_stderr,none": 0.02679704183010415,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5919540229885057,
26
+ "acc_stderr,none": 0.026383584629731508,
27
+ "acc_norm,none": 0.5919540229885057,
28
+ "acc_norm_stderr,none": 0.026383584629731508,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5502645502645502,
33
+ "acc_stderr,none": 0.025620857042936655,
34
+ "acc_norm,none": 0.5502645502645502,
35
+ "acc_norm_stderr,none": 0.025620857042936655,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6084656084656085,
40
+ "acc_stderr,none": 0.025138091388851102,
41
+ "acc_norm,none": 0.6084656084656085,
42
+ "acc_norm_stderr,none": 0.025138091388851102,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7120838471023427,
49
+ "acc_stderr,none": 0.11202233860795015,
50
+ "acc_norm,none": 0.7120838471023427,
51
+ "acc_norm_stderr,none": 0.11202233860795015,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7ff55118d090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7ff55058c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7ff55058c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7ff55058f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7ff55058fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7ff55058fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7ff55058feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=meta-llama/Meta-Llama-3-8B,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
data/raw-eval-outputs/microsoft-Phi-3-medium-4k-instruct_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.6593711467324291,
5
+ "acc_stderr,none": 0.05882406104148581,
6
+ "acc_norm,none": 0.6593711467324291,
7
+ "acc_norm_stderr,none": 0.05882406104148581,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.6997767857142857,
12
+ "acc_stderr,none": 0.010830639682891873,
13
+ "acc_norm,none": 0.6997767857142857,
14
+ "acc_norm_stderr,none": 0.010830639682891873,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.603448275862069,
19
+ "acc_stderr,none": 0.026260634141933786,
20
+ "acc_norm,none": 0.603448275862069,
21
+ "acc_norm_stderr,none": 0.026260634141933786,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7241379310344828,
26
+ "acc_stderr,none": 0.023993406146998367,
27
+ "acc_norm,none": 0.7241379310344828,
28
+ "acc_norm_stderr,none": 0.023993406146998367,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.5343915343915344,
33
+ "acc_stderr,none": 0.025690321762493848,
34
+ "acc_norm,none": 0.5343915343915344,
35
+ "acc_norm_stderr,none": 0.025690321762493848,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5846560846560847,
40
+ "acc_stderr,none": 0.025379524910778398,
41
+ "acc_norm,none": 0.5846560846560847,
42
+ "acc_norm_stderr,none": 0.025379524910778398,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.6593711467324291,
49
+ "acc_stderr,none": 0.05882406104148581,
50
+ "acc_norm,none": 0.6593711467324291,
51
+ "acc_norm_stderr,none": 0.05882406104148581,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f872445dee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f87249823a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f87242cb310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f8724982820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f8724982b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f872447f4c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f872442cee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/Phi-3-medium-4k-instruct,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 8,
244
+ 16,
245
+ 32,
246
+ 32,
247
+ 32,
248
+ 32,
249
+ 32,
250
+ 32,
251
+ 32,
252
+ 32,
253
+ 32,
254
+ 32,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/microsoft-phi-1_5_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.3024044389642417,
5
+ "acc_stderr,none": 0.030335029823792846,
6
+ "acc_norm,none": 0.3024044389642417,
7
+ "acc_norm_stderr,none": 0.030335029823792846,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.28013392857142855,
12
+ "acc_stderr,none": 0.010611112414051155,
13
+ "acc_norm,none": 0.28013392857142855,
14
+ "acc_norm_stderr,none": 0.010611112414051155,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.3160919540229885,
19
+ "acc_stderr,none": 0.024959784982131285,
20
+ "acc_norm,none": 0.3160919540229885,
21
+ "acc_norm_stderr,none": 0.024959784982131285,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.3045977011494253,
26
+ "acc_stderr,none": 0.024706807658616183,
27
+ "acc_norm,none": 0.3045977011494253,
28
+ "acc_norm_stderr,none": 0.024706807658616183,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.3492063492063492,
33
+ "acc_stderr,none": 0.024552292209342654,
34
+ "acc_norm,none": 0.3492063492063492,
35
+ "acc_norm_stderr,none": 0.024552292209342654,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.34656084656084657,
40
+ "acc_stderr,none": 0.024508777521028435,
41
+ "acc_norm,none": 0.34656084656084657,
42
+ "acc_norm_stderr,none": 0.024508777521028435,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.3024044389642417,
49
+ "acc_stderr,none": 0.030335029823792846,
50
+ "acc_norm,none": 0.3024044389642417,
51
+ "acc_norm_stderr,none": 0.030335029823792846,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f202b05ff70>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f202b59e430>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f202aecb3a0>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f202b59e8b0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f202b59ec10>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f202b084550>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f202b030f70>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/phi-1_5,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 64,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/microsoft-phi-1_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.21177558569667077,
5
+ "acc_stderr,none": 0.024570863489409633,
6
+ "acc_norm,none": 0.21177558569667077,
7
+ "acc_norm_stderr,none": 0.024570863489409633,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.19642857142857142,
12
+ "acc_stderr,none": 0.009387863785916705,
13
+ "acc_norm,none": 0.19642857142857142,
14
+ "acc_norm_stderr,none": 0.009387863785916705,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.2413793103448276,
19
+ "acc_stderr,none": 0.02297193745254371,
20
+ "acc_norm,none": 0.2413793103448276,
21
+ "acc_norm_stderr,none": 0.02297193745254371,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.25862068965517243,
26
+ "acc_stderr,none": 0.023506454355379604,
27
+ "acc_norm,none": 0.25862068965517243,
28
+ "acc_norm_stderr,none": 0.023506454355379604,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.21693121693121692,
33
+ "acc_stderr,none": 0.02122708244944506,
34
+ "acc_norm,none": 0.21693121693121692,
35
+ "acc_norm_stderr,none": 0.02122708244944506,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.20899470899470898,
40
+ "acc_stderr,none": 0.02094048156533485,
41
+ "acc_norm,none": 0.20899470899470898,
42
+ "acc_norm_stderr,none": 0.02094048156533485,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.21177558569667077,
49
+ "acc_stderr,none": 0.024570863489409633,
50
+ "acc_norm,none": 0.21177558569667077,
51
+ "acc_norm_stderr,none": 0.024570863489409633,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f3613c5fee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f36141953a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f3613acb310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f3614195820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f3614195b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f3613c814c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f3613c30ee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/phi-1,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 64,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/microsoft-phi-2_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.44790382244143034,
5
+ "acc_stderr,none": 0.0343882858973779,
6
+ "acc_norm,none": 0.44790382244143034,
7
+ "acc_norm_stderr,none": 0.0343882858973779,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.47488839285714285,
12
+ "acc_stderr,none": 0.01179977682900124,
13
+ "acc_norm,none": 0.47488839285714285,
14
+ "acc_norm_stderr,none": 0.01179977682900124,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.3764367816091954,
19
+ "acc_stderr,none": 0.02600887296285643,
20
+ "acc_norm,none": 0.3764367816091954,
21
+ "acc_norm_stderr,none": 0.02600887296285643,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.4224137931034483,
26
+ "acc_stderr,none": 0.02651628723013287,
27
+ "acc_norm,none": 0.4224137931034483,
28
+ "acc_norm_stderr,none": 0.02651628723013287,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.41798941798941797,
33
+ "acc_stderr,none": 0.02540255550326091,
34
+ "acc_norm,none": 0.41798941798941797,
35
+ "acc_norm_stderr,none": 0.02540255550326091,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.43915343915343913,
40
+ "acc_stderr,none": 0.025559920550531003,
41
+ "acc_norm,none": 0.43915343915343913,
42
+ "acc_norm_stderr,none": 0.025559920550531003,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.44790382244143034,
49
+ "acc_stderr,none": 0.0343882858973779,
50
+ "acc_norm,none": 0.44790382244143034,
51
+ "acc_norm_stderr,none": 0.0343882858973779,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fe111a60ee0>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fe111f9e3a0>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fe1118cb310>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fe111f9e820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fe111f9eb80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fe111a834c0>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fe111a30ee0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=microsoft/phi-2,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 32,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/mistralai-Mistral-7B-v0.3_results.json ADDED
@@ -0,0 +1,316 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.6199136868064118,
5
+ "acc_stderr,none": 0.0837373393352743,
6
+ "acc_norm,none": 0.6199136868064118,
7
+ "acc_norm_stderr,none": 0.0837373393352743,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.703125,
12
+ "acc_stderr,none": 0.010795811437682205,
13
+ "acc_norm,none": 0.703125,
14
+ "acc_norm_stderr,none": 0.010795811437682205,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.4827586206896552,
19
+ "acc_stderr,none": 0.026825443578224806,
20
+ "acc_norm,none": 0.4827586206896552,
21
+ "acc_norm_stderr,none": 0.026825443578224806,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.5689655172413793,
26
+ "acc_stderr,none": 0.026584851780353615,
27
+ "acc_norm,none": 0.5689655172413793,
28
+ "acc_norm_stderr,none": 0.026584851780353615,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.48677248677248675,
33
+ "acc_stderr,none": 0.025742297289575142,
34
+ "acc_norm,none": 0.48677248677248675,
35
+ "acc_norm_stderr,none": 0.025742297289575142,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.5317460317460317,
40
+ "acc_stderr,none": 0.0256993528321318,
41
+ "acc_norm,none": 0.5317460317460317,
42
+ "acc_norm_stderr,none": 0.0256993528321318,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.6199136868064118,
49
+ "acc_stderr,none": 0.0837373393352743,
50
+ "acc_norm,none": 0.6199136868064118,
51
+ "acc_norm_stderr,none": 0.0837373393352743,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7f6e18c5ff70>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7f6e1919e430>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7f6e18acb3a0>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7f6e1919e8b0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7f6e1919ec10>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7f6e18c80550>",
195
+ "doc_to_target": "<function doc_to_target at 0x7f6e18c2ff70>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=mistralai/Mistral-7B-v0.3,load_in_4bit=True",
241
+ "batch_size": "auto:64",
242
+ "batch_sizes": [
243
+ 32,
244
+ 64,
245
+ 64,
246
+ 64,
247
+ 64,
248
+ 64,
249
+ 64,
250
+ 64,
251
+ 64,
252
+ 64,
253
+ 64,
254
+ 64,
255
+ 64,
256
+ 64,
257
+ 64,
258
+ 64,
259
+ 64,
260
+ 64,
261
+ 64,
262
+ 64,
263
+ 64,
264
+ 64,
265
+ 64,
266
+ 64,
267
+ 64,
268
+ 64,
269
+ 64,
270
+ 64,
271
+ 64,
272
+ 64,
273
+ 64,
274
+ 64,
275
+ 64,
276
+ 64,
277
+ 64,
278
+ 64,
279
+ 64,
280
+ 64,
281
+ 64,
282
+ 64,
283
+ 64,
284
+ 64,
285
+ 64,
286
+ 64,
287
+ 64,
288
+ 64,
289
+ 64,
290
+ 64,
291
+ 64,
292
+ 64,
293
+ 64,
294
+ 64,
295
+ 64,
296
+ 64,
297
+ 64,
298
+ 64,
299
+ 64,
300
+ 64,
301
+ 64,
302
+ 64,
303
+ 64,
304
+ 64,
305
+ 64,
306
+ 64,
307
+ 64
308
+ ],
309
+ "device": "cuda:0",
310
+ "use_cache": null,
311
+ "limit": null,
312
+ "bootstrap_iters": 100000,
313
+ "gen_kwargs": null
314
+ },
315
+ "git_hash": "928c7657"
316
+ }
data/raw-eval-outputs/mistralai-Mixtral-8x22B-v0.1_results.json ADDED
@@ -0,0 +1,252 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7882244143033293,
5
+ "acc_stderr,none": 0.08841138813945006,
6
+ "acc_norm,none": 0.7882244143033293,
7
+ "acc_norm_stderr,none": 0.08841138813945006,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8772321428571429,
12
+ "acc_stderr,none": 0.007754464516034243,
13
+ "acc_norm,none": 0.8772321428571429,
14
+ "acc_norm_stderr,none": 0.007754464516034243,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.617816091954023,
19
+ "acc_stderr,none": 0.026085614333362674,
20
+ "acc_norm,none": 0.617816091954023,
21
+ "acc_norm_stderr,none": 0.026085614333362674,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.7040229885057471,
26
+ "acc_stderr,none": 0.024505167376090542,
27
+ "acc_norm,none": 0.7040229885057471,
28
+ "acc_norm_stderr,none": 0.024505167376090542,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.6746031746031746,
33
+ "acc_stderr,none": 0.024130158299762613,
34
+ "acc_norm,none": 0.6746031746031746,
35
+ "acc_norm_stderr,none": 0.024130158299762613,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.7142857142857143,
40
+ "acc_stderr,none": 0.023266512213730585,
41
+ "acc_norm,none": 0.7142857142857143,
42
+ "acc_norm_stderr,none": 0.023266512213730585,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7882244143033293,
49
+ "acc_stderr,none": 0.08841138813945006,
50
+ "acc_norm,none": 0.7882244143033293,
51
+ "acc_norm_stderr,none": 0.08841138813945006,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7ff2d0094820>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7ff2d00de050>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7ff2c2d29240>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7ff2c4f34820>",
162
+ "doc_to_target": "<function doc_to_target at 0x7ff2c4f34b80>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7ff2d005a680>",
195
+ "doc_to_target": "<function doc_to_target at 0x7ff2d0079120>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=mistralai/Mixtral-8x22B-v0.1,parallelize=True,load_in_4bit=True",
241
+ "batch_size": "auto",
242
+ "batch_sizes": [
243
+ 32
244
+ ],
245
+ "device": null,
246
+ "use_cache": null,
247
+ "limit": null,
248
+ "bootstrap_iters": 100000,
249
+ "gen_kwargs": null
250
+ },
251
+ "git_hash": "928c7657"
252
+ }
data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json ADDED
@@ -0,0 +1,250 @@
1
+ {
2
+ "results": {
3
+ "b4b": {
4
+ "acc,none": 0.7475339087546239,
5
+ "acc_stderr,none": 0.11087824048509952,
6
+ "acc_norm,none": 0.7475339087546239,
7
+ "acc_norm_stderr,none": 0.11087824048509952,
8
+ "alias": "b4b"
9
+ },
10
+ "b4bqa": {
11
+ "acc,none": 0.8610491071428571,
12
+ "acc_stderr,none": 0.008173288677884256,
13
+ "acc_norm,none": 0.8610491071428571,
14
+ "acc_norm_stderr,none": 0.008173288677884256,
15
+ "alias": " - b4bqa"
16
+ },
17
+ "medmcqa_g2b": {
18
+ "acc,none": 0.5545977011494253,
19
+ "acc_stderr,none": 0.026680902895795475,
20
+ "acc_norm,none": 0.5545977011494253,
21
+ "acc_norm_stderr,none": 0.026680902895795475,
22
+ "alias": " - medmcqa_g2b"
23
+ },
24
+ "medmcqa_orig_filtered": {
25
+ "acc,none": 0.6494252873563219,
26
+ "acc_stderr,none": 0.025614751890362768,
27
+ "acc_norm,none": 0.6494252873563219,
28
+ "acc_norm_stderr,none": 0.025614751890362768,
29
+ "alias": " - medmcqa_orig_filtered"
30
+ },
31
+ "medqa_4options_g2b": {
32
+ "acc,none": 0.6005291005291006,
33
+ "acc_stderr,none": 0.025225450284067932,
34
+ "acc_norm,none": 0.6005291005291006,
35
+ "acc_norm_stderr,none": 0.025225450284067932,
36
+ "alias": " - medqa_4options_g2b"
37
+ },
38
+ "medqa_4options_orig_filtered": {
39
+ "acc,none": 0.6243386243386243,
40
+ "acc_stderr,none": 0.02494236893115979,
41
+ "acc_norm,none": 0.6243386243386243,
42
+ "acc_norm_stderr,none": 0.02494236893115979,
43
+ "alias": " - medqa_4options_orig_filtered"
44
+ }
45
+ },
46
+ "groups": {
47
+ "b4b": {
48
+ "acc,none": 0.7475339087546239,
49
+ "acc_stderr,none": 0.11087824048509952,
50
+ "acc_norm,none": 0.7475339087546239,
51
+ "acc_norm_stderr,none": 0.11087824048509952,
52
+ "alias": "b4b"
53
+ }
54
+ },
55
+ "configs": {
56
+ "b4bqa": {
57
+ "task": "b4bqa",
58
+ "dataset_path": "AIM-Harvard/b4b_drug_qa",
59
+ "test_split": "test",
60
+ "doc_to_text": "<function process_cd at 0x7fb0afadd090>",
61
+ "doc_to_target": "correct_choice",
62
+ "doc_to_choice": [
63
+ "A",
64
+ "B",
65
+ "C",
66
+ "D"
67
+ ],
68
+ "description": "",
69
+ "target_delimiter": " ",
70
+ "fewshot_delimiter": "\n\n",
71
+ "metric_list": [
72
+ {
73
+ "metric": "acc",
74
+ "aggregation": "mean",
75
+ "higher_is_better": true
76
+ },
77
+ {
78
+ "metric": "acc_norm",
79
+ "aggregation": "mean",
80
+ "higher_is_better": true
81
+ }
82
+ ],
83
+ "output_type": "multiple_choice",
84
+ "repeats": 1,
85
+ "should_decontaminate": false
86
+ },
87
+ "medmcqa_g2b": {
88
+ "task": "medmcqa_g2b",
89
+ "dataset_path": "AIM-Harvard/medmcqa_generic_to_brand",
90
+ "training_split": "train",
91
+ "validation_split": "validation",
92
+ "test_split": "validation",
93
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3c550>",
94
+ "doc_to_target": "cop",
95
+ "doc_to_choice": [
96
+ "A",
97
+ "B",
98
+ "C",
99
+ "D"
100
+ ],
101
+ "description": "",
102
+ "target_delimiter": " ",
103
+ "fewshot_delimiter": "\n\n",
104
+ "metric_list": [
105
+ {
106
+ "metric": "acc",
107
+ "aggregation": "mean",
108
+ "higher_is_better": true
109
+ },
110
+ {
111
+ "metric": "acc_norm",
112
+ "aggregation": "mean",
113
+ "higher_is_better": true
114
+ }
115
+ ],
116
+ "output_type": "multiple_choice",
117
+ "repeats": 1,
118
+ "should_decontaminate": true,
119
+ "doc_to_decontamination_query": "{{question}}"
120
+ },
121
+ "medmcqa_orig_filtered": {
122
+ "task": "medmcqa_orig_filtered",
123
+ "dataset_path": "AIM-Harvard/medmcqa_original",
124
+ "training_split": "train",
125
+ "validation_split": "validation",
126
+ "test_split": "validation",
127
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3c700>",
128
+ "doc_to_target": "cop",
129
+ "doc_to_choice": [
130
+ "A",
131
+ "B",
132
+ "C",
133
+ "D"
134
+ ],
135
+ "description": "",
136
+ "target_delimiter": " ",
137
+ "fewshot_delimiter": "\n\n",
138
+ "metric_list": [
139
+ {
140
+ "metric": "acc",
141
+ "aggregation": "mean",
142
+ "higher_is_better": true
143
+ },
144
+ {
145
+ "metric": "acc_norm",
146
+ "aggregation": "mean",
147
+ "higher_is_better": true
148
+ }
149
+ ],
150
+ "output_type": "multiple_choice",
151
+ "repeats": 1,
152
+ "should_decontaminate": true,
153
+ "doc_to_decontamination_query": "{{question}}"
154
+ },
155
+ "medqa_4options_g2b": {
156
+ "task": "medqa_4options_g2b",
157
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_generic_to_brand",
158
+ "training_split": "train",
159
+ "validation_split": "validation",
160
+ "test_split": "test",
161
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3f6d0>",
162
+ "doc_to_target": "<function doc_to_target at 0x7fb0adf3fa30>",
163
+ "doc_to_choice": [
164
+ "A",
165
+ "B",
166
+ "C",
167
+ "D"
168
+ ],
169
+ "description": "",
170
+ "target_delimiter": " ",
171
+ "fewshot_delimiter": "\n\n",
172
+ "metric_list": [
173
+ {
174
+ "metric": "acc",
175
+ "aggregation": "mean",
176
+ "higher_is_better": true
177
+ },
178
+ {
179
+ "metric": "acc_norm",
180
+ "aggregation": "mean",
181
+ "higher_is_better": true
182
+ }
183
+ ],
184
+ "output_type": "multiple_choice",
185
+ "repeats": 1,
186
+ "should_decontaminate": false
187
+ },
188
+ "medqa_4options_orig_filtered": {
189
+ "task": "medqa_4options_orig_filtered",
190
+ "dataset_path": "AIM-Harvard/gbaker_medqa_usmle_4_options_hf_original",
191
+ "training_split": "train",
192
+ "validation_split": "validation",
193
+ "test_split": "test",
194
+ "doc_to_text": "<function doc_to_text at 0x7fb0adf3fc70>",
195
+ "doc_to_target": "<function doc_to_target at 0x7fb0adf3feb0>",
196
+ "doc_to_choice": [
197
+ "A",
198
+ "B",
199
+ "C",
200
+ "D"
201
+ ],
202
+ "description": "",
203
+ "target_delimiter": " ",
204
+ "fewshot_delimiter": "\n\n",
205
+ "metric_list": [
206
+ {
207
+ "metric": "acc",
208
+ "aggregation": "mean",
209
+ "higher_is_better": true
210
+ },
211
+ {
212
+ "metric": "acc_norm",
213
+ "aggregation": "mean",
214
+ "higher_is_better": true
215
+ }
216
+ ],
217
+ "output_type": "multiple_choice",
218
+ "repeats": 1,
219
+ "should_decontaminate": false
220
+ }
221
+ },
222
+ "versions": {
223
+ "b4b": "N/A",
224
+ "b4bqa": "Yaml",
225
+ "medmcqa_g2b": "Yaml",
226
+ "medmcqa_orig_filtered": "Yaml",
227
+ "medqa_4options_g2b": "Yaml",
228
+ "medqa_4options_orig_filtered": "Yaml"
229
+ },
230
+ "n-shot": {
231
+ "b4b": 0,
232
+ "b4bqa": 0,
233
+ "medmcqa_g2b": 0,
234
+ "medmcqa_orig_filtered": 0,
235
+ "medqa_4options_g2b": 0,
236
+ "medqa_4options_orig_filtered": 0
237
+ },
238
+ "config": {
239
+ "model": "hf",
240
+ "model_args": "pretrained=mistralai/Mixtral-8x7B-v0.1,load_in_4bit=True",
241
+ "batch_size": "4",
242
+ "batch_sizes": [],
243
+ "device": "cuda:0",
244
+ "use_cache": null,
245
+ "limit": null,
246
+ "bootstrap_iters": 100000,
247
+ "gen_kwargs": null
248
+ },
249
+ "git_hash": "928c7657"
250
+ }
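
The raw result files above all appear to follow the output schema of EleutherAI's lm-evaluation-harness ("results", "groups", "configs", "versions", "n-shot", "config", "git_hash"). As a hypothetical sketch only, not part of this commit, a file like the Mixtral-8x7B one could be regenerated roughly as follows, assuming the custom RABBITS task YAMLs (b4b, b4bqa, medmcqa_g2b, medmcqa_orig_filtered, medqa_4options_g2b, medqa_4options_orig_filtered) are registered with the harness; the API call mirrors the "config" block above, and everything else is an assumption.

import json

import lm_eval  # EleutherAI lm-evaluation-harness

# Sketch: evaluate the "b4b" group (which expands to the b4bqa/medmcqa/medqa subtasks)
# with the same model_args, batch size, and device recorded in the config block above.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=mistralai/Mixtral-8x7B-v0.1,load_in_4bit=True",
    tasks=["b4b"],
    num_fewshot=0,
    batch_size=4,
    device="cuda:0",
)

# The "configs" section holds Python callables (hence the "<function ...>" strings
# in the files above), so fall back to str() when dumping to JSON.
with open("data/raw-eval-outputs/mistralai-Mixtral-8x7B-v0.1_results.json", "w") as f:
    json.dump(results, f, indent=2, default=str)
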
src/__pycache__/model_links.cpython-311.pyc ADDED
Binary file (1.97 kB).
 
src/__pycache__/models_info.cpython-311.pyc ADDED
Binary file (2.43 kB).
 
src/json2df.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ from models_info import model_info
5
+
6
+ directory = 'data/raw-eval-outputs'
7
+ data = []
8
+
9
+ def model_hyperlink(link, model_name):
10
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
11
+
12
+ def make_clickable_names(df):
13
+ df["Model"] = df.apply(
14
+ lambda row: model_hyperlink(row["Link"], row["Model"]), axis=1
15
+ )
16
+ return df
17
+
18
+ # Iterate over all the files in the directory
19
+ for filename in os.listdir(directory):
20
+ if filename.endswith(".json"):
21
+ filepath = os.path.join(directory, filename)
22
+ with open(filepath, 'r') as f:
23
+ json_data = json.load(f)
24
+ model_name = filename.replace("_results.json", "")
25
+
26
+ # Extract the accuracy values
27
+ results = json_data['results']
28
+ row = {'Model': model_name}
29
+ for key, value in results.items():
30
+ row[key] = round(value['acc,none'] * 100, 2)
31
+
32
+ # Add the tuning type and link to the row
33
+ row['T'] = model_info[model_name]['tuning']
34
+ row['Link'] = model_info[model_name]['link']
35
+
36
+ data.append(row)
37
+
38
+
39
+ df = pd.DataFrame(data)
40
+ df = make_clickable_names(df)
41
+ df.drop(columns=["Link"], inplace=True)
42
+
43
+ df['medmcqa_diff'] = (df['medmcqa_g2b'] - df['medmcqa_orig_filtered']).round(2)
44
+ df['medqa_diff'] = (df['medqa_4options_g2b'] - df['medqa_4options_orig_filtered']).round(2)
45
+
46
+ # Reorder columns
47
+ cols = [
48
+ "T",
49
+ "Model",
50
+ "b4bqa",
51
+ "b4b",
52
+ "medmcqa_g2b",
53
+ "medmcqa_orig_filtered",
54
+ "medmcqa_diff",
55
+ "medqa_4options_g2b",
56
+ "medqa_4options_orig_filtered",
57
+ "medqa_diff"
58
+ ] + [col for col in df.columns if col not in [
59
+ "T", "Model", "b4bqa", "b4b", "medmcqa_g2b", "medmcqa_orig_filtered", "medmcqa_diff", "medqa_4options_g2b", "medqa_4options_orig_filtered", "medqa_diff"
60
+ ]]
61
+ df = df[cols]
62
+
63
+
64
+ output_csv = 'data/csv/models_data.csv'
65
+ df.to_csv(output_csv, index=False)
66
+
67
+ print(f"DataFrame saved to {output_csv}")
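
src/json2df.py reads each *_results.json from data/raw-eval-outputs, joins it with the metadata in src/models_info.py, and writes data/csv/models_data.csv with per-task accuracies plus the generic-to-brand deltas (medmcqa_diff, medqa_diff); given the relative paths, it appears intended to be run from the repository root (e.g. python src/json2df.py). Below is a minimal, hypothetical sketch of how the regenerated CSV could be sanity-checked; the column names come from the script above, everything else is an assumption.

import pandas as pd

# Leaderboard table written by src/json2df.py.
df = pd.read_csv("data/csv/models_data.csv")

# Mean accuracy change when generic drug names are swapped for brand names;
# negative values mean the model scores lower on the swapped (g2b) versions.
print("mean MedMCQA delta:", round(df["medmcqa_diff"].mean(), 2))
print("mean MedQA delta:", round(df["medqa_diff"].mean(), 2))

# Models ranked by the combined b4b group score.
print(df.sort_values("b4b", ascending=False)[["Model", "b4b", "b4bqa"]].head())
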
src/models_info.py ADDED
@@ -0,0 +1,79 @@
1
+ # Feel free to correct these categories; model size should probably also be added.
2
+ model_info = {
3
+ "meta-llama-Meta-Llama-3-70B": {
4
+ "link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B",
5
+ "tuning": "🟒" # Pre-trained
6
+ },
7
+ "meta-llama-Meta-Llama-3-8B": {
8
+ "link": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",
9
+ "tuning": "🟒" # Pre-trained
10
+ },
11
+ "01-ai-Yi-1.5-34B": {
12
+ "link": "https://huggingface.co/01-ai/Yi-1.5-34B",
13
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
14
+ },
15
+ "aaditya-Llama3-OpenBioLLM-70B": {
16
+ "link": "https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B",
17
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
18
+ },
19
+ "CohereForAI-aya-23-35B": {
20
+ "link": "https://huggingface.co/CohereForAI/aya-23-35B",
21
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
22
+ },
23
+ "CohereForAI-c4ai-command-r-plus": {
24
+ "link": "https://huggingface.co/CohereForAI/c4ai-command-r-plus",
25
+ "tuning": "πŸ’¬" # Chat-model (RLHF, DPO, IFT, etc.)
26
+ },
27
+ "johnsnowlabs-JSL-MedLlama-3-8B-v9": {
28
+ "link": "https://huggingface.co/johnsnowlabs/JSL-MedLlama-3-8B-v9",
29
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
30
+ },
31
+ "meta-llama-Llama-2-70B-hf": {
32
+ "link": "https://huggingface.co/meta-llama/Llama-2-70B-hf",
33
+ "tuning": "🟒" # Pre-trained
34
+ },
35
+ "meta-llama-Llama-2-7b-hf": {
36
+ "link": "https://huggingface.co/meta-llama/Llama-2-7b-hf",
37
+ "tuning": "🟒" # Pre-trained
38
+ },
39
+ "microsoft-phi-1_5": {
40
+ "link": "https://huggingface.co/microsoft/phi-1_5",
41
+ "tuning": "🟒" # Pre-trained
42
+ },
43
+ "microsoft-phi-1": {
44
+ "link": "https://huggingface.co/microsoft/phi-1",
45
+ "tuning": "🟒" # Pre-trained
46
+ },
47
+ "microsoft-phi-2": {
48
+ "link": "https://huggingface.co/microsoft/phi-2",
49
+ "tuning": "🟒" # Pre-trained
50
+ },
51
+ "microsoft-Phi-3-medium-4k-instruct": {
52
+ "link": "https://huggingface.co/microsoft/Phi-3-medium-4k-instruct",
53
+ "tuning": "πŸ’¬" # Chat-model (RLHF, DPO, IFT, etc.)
54
+ },
55
+ "mistralai-Mistral-7B-v0.3": {
56
+ "link": "https://huggingface.co/mistralai/Mistral-7B-v0.3",
57
+ "tuning": "🟒" # Pre-trained
58
+ },
59
+ "mistralai-Mixtral-8x22B-v0.1": {
60
+ "link": "https://huggingface.co/mistralai/Mixtral-8x22B-v0.1",
61
+ "tuning": "🟒" # Pre-trained
62
+ },
63
+ "mistralai-Mixtral-8x7B-v0.1": {
64
+ "link": "https://huggingface.co/mistralai/Mixtral-8x7B-v0.1",
65
+ "tuning": "🟒" # Pre-trained
66
+ },
67
+ "ProbeMedicalYonseiMAILab-medllama3-v20": {
68
+ "link": "https://huggingface.co/ProbeMedicalYonseiMAILab/medllama3-v20",
69
+ "tuning": "πŸ”Ά" # Fine-tuned on domain-specific data
70
+ },
71
+ "Qwen-Qwen2-72B": {
72
+ "link": "https://huggingface.co/Qwen/Qwen2-72B",
73
+ "tuning": "🟒" # Pre-trained
74
+ },
75
+ "Qwen-Qwen2-7B": {
76
+ "link": "https://huggingface.co/Qwen/Qwen2-7B",
77
+ "tuning": "🟒" # Pre-trained
78
+ },
79
+ }