Spaces:
Runtime error
Runtime error
Fix TruthfulQA typo
Browse files
app.py
CHANGED
@@ -43,11 +43,11 @@ def load_results(model, benchmark, metric):
|
|
43 |
with open(file_path) as fp:
|
44 |
data = json.load(fp)
|
45 |
accs = np.array([v[metric] for k, v in data["results"].items()])
|
46 |
-
mean_acc = np.mean(accs)
|
47 |
return mean_acc, data["config"]["model_args"]
|
48 |
|
49 |
|
50 |
-
COLS = ["Model", "Revision", "Average β¬οΈ", "ARC (25-shot) β¬οΈ", "HellaSwag (10-shot) β¬οΈ", "MMLU (5-shot) β¬οΈ", "
|
51 |
TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]
|
52 |
|
53 |
if not IS_PUBLIC:
|
@@ -57,36 +57,36 @@ if not IS_PUBLIC:
|
|
57 |
EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
|
58 |
EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
|
59 |
def get_leaderboard():
|
60 |
-
if repo:
|
61 |
print("pulling changes")
|
62 |
repo.git_pull()
|
63 |
-
|
64 |
all_data = get_eval_results_dicts(IS_PUBLIC)
|
65 |
-
|
66 |
if not IS_PUBLIC:
|
67 |
gpt4_values = {
|
68 |
-
"Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
|
69 |
-
"Revision":"tech report",
|
70 |
"8bit":None,
|
71 |
"Average β¬οΈ":84.3,
|
72 |
"ARC (25-shot) β¬οΈ":96.3,
|
73 |
"HellaSwag (10-shot) β¬οΈ":95.3,
|
74 |
"MMLU (5-shot) β¬οΈ":86.4,
|
75 |
-
"
|
76 |
}
|
77 |
all_data.append(gpt4_values)
|
78 |
gpt35_values = {
|
79 |
-
"Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
|
80 |
-
"Revision":"tech report",
|
81 |
"8bit":None,
|
82 |
"Average β¬οΈ":71.9,
|
83 |
"ARC (25-shot) β¬οΈ":85.2,
|
84 |
"HellaSwag (10-shot) β¬οΈ":85.5,
|
85 |
"MMLU (5-shot) β¬οΈ":70.0,
|
86 |
-
"
|
87 |
}
|
88 |
all_data.append(gpt35_values)
|
89 |
-
|
90 |
dataframe = pd.DataFrame.from_records(all_data)
|
91 |
dataframe = dataframe.sort_values(by=['Average β¬οΈ'], ascending=False)
|
92 |
print(dataframe)
|
@@ -94,38 +94,38 @@ def get_leaderboard():
|
|
94 |
return dataframe
|
95 |
|
96 |
def get_eval_table():
|
97 |
-
if repo:
|
98 |
print("pulling changes for eval")
|
99 |
repo.git_pull()
|
100 |
-
entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
|
101 |
all_evals = []
|
102 |
-
|
103 |
for entry in entries:
|
104 |
print(entry)
|
105 |
if ".json"in entry:
|
106 |
file_path = os.path.join("evals/eval_requests", entry)
|
107 |
with open(file_path) as fp:
|
108 |
data = json.load(fp)
|
109 |
-
|
110 |
data["# params"] = "unknown"
|
111 |
data["model"] = make_clickable_model(data["model"])
|
112 |
data["revision"] = data.get("revision", "main")
|
113 |
-
|
114 |
|
115 |
all_evals.append(data)
|
116 |
else:
|
117 |
# this is a folder
|
118 |
-
sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
|
119 |
for sub_entry in sub_entries:
|
120 |
file_path = os.path.join("evals/eval_requests", entry, sub_entry)
|
121 |
with open(file_path) as fp:
|
122 |
data = json.load(fp)
|
123 |
-
|
124 |
#data["# params"] = get_n_params(data["model"])
|
125 |
data["model"] = make_clickable_model(data["model"])
|
126 |
all_evals.append(data)
|
127 |
|
128 |
-
|
129 |
dataframe = pd.DataFrame.from_records(all_evals)
|
130 |
return dataframe[EVAL_COLS]
|
131 |
|
@@ -137,12 +137,12 @@ def is_model_on_hub(model_name, revision) -> bool:
|
|
137 |
try:
|
138 |
config = AutoConfig.from_pretrained(model_name, revision=revision)
|
139 |
return True
|
140 |
-
|
141 |
except Exception as e:
|
142 |
print("Could not get the model config from the hub")
|
143 |
print(e)
|
144 |
return False
|
145 |
-
|
146 |
|
147 |
|
148 |
def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
|
@@ -152,12 +152,12 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool,
|
|
152 |
if is_delta_weight and not is_model_on_hub(base_model, revision):
|
153 |
print(base_model, "base model not found on hub")
|
154 |
return
|
155 |
-
|
156 |
if not is_model_on_hub(model, revision):
|
157 |
print(model, "not found on hub")
|
158 |
return
|
159 |
print("adding new eval")
|
160 |
-
|
161 |
eval_entry = {
|
162 |
"model" : model,
|
163 |
"base_model" : base_model,
|
@@ -166,22 +166,22 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool,
|
|
166 |
"8bit_eval" : is_8_bit_eval,
|
167 |
"is_delta_weight" : is_delta_weight,
|
168 |
"status" : "PENDING"
|
169 |
-
}
|
170 |
-
|
171 |
user_name = ""
|
172 |
model_path = model
|
173 |
if "/" in model:
|
174 |
user_name = model.split("/")[0]
|
175 |
model_path = model.split("/")[1]
|
176 |
-
|
177 |
OUT_DIR=f"eval_requests/{user_name}"
|
178 |
os.makedirs(OUT_DIR, exist_ok=True)
|
179 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
|
180 |
-
|
181 |
with open(out_path, "w") as f:
|
182 |
f.write(json.dumps(eval_entry))
|
183 |
LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
|
184 |
-
|
185 |
api = HfApi()
|
186 |
api.upload_file(
|
187 |
path_or_fileobj=out_path,
|
@@ -191,14 +191,14 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool,
|
|
191 |
repo_type="dataset",
|
192 |
)
|
193 |
|
194 |
-
|
195 |
def refresh():
|
196 |
return get_leaderboard(), get_eval_table()
|
197 |
-
|
198 |
|
199 |
|
200 |
block = gr.Blocks()
|
201 |
-
with block:
|
202 |
with gr.Row():
|
203 |
gr.Markdown(f"""
|
204 |
# π€ Open LLM Leaderboard
|
@@ -208,49 +208,47 @@ Evaluation is performed against 4 popular benchmarks:
|
|
208 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
|
209 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
|
210 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
|
211 |
-
- <a href="https://arxiv.org/abs/2109.07958" target="_blank">
|
212 |
|
213 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
|
214 |
""")
|
215 |
-
|
216 |
with gr.Row():
|
217 |
leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
|
218 |
datatype=TYPES, max_rows=5)
|
219 |
|
220 |
-
|
221 |
-
|
222 |
with gr.Row():
|
223 |
gr.Markdown(f"""
|
224 |
# Evaluation Queue for the π€ Open LLM Leaderboard, these models will be automatically evaluated on the π€ cluster
|
225 |
-
|
226 |
""")
|
227 |
with gr.Accordion("Evaluation Queue", open=False):
|
228 |
with gr.Row():
|
229 |
eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
|
230 |
-
datatype=EVAL_TYPES, max_rows=5)
|
231 |
-
|
232 |
with gr.Row():
|
233 |
refresh_button = gr.Button("Refresh")
|
234 |
-
refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
|
235 |
-
|
236 |
with gr.Accordion("Submit a new model for evaluation"):
|
237 |
-
# with gr.Row():
|
238 |
-
# gr.Markdown(f"""# Submit a new model for evaluation""")
|
239 |
with gr.Row():
|
240 |
with gr.Column():
|
241 |
model_name_textbox = gr.Textbox(label="Model name")
|
242 |
revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
|
243 |
-
|
244 |
with gr.Column():
|
245 |
is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
|
246 |
private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
|
247 |
is_delta_weight = gr.Checkbox(False, label="Delta weights")
|
248 |
base_model_name_textbox = gr.Textbox(label="base model (for delta)")
|
249 |
-
|
250 |
with gr.Row():
|
251 |
submit_button = gr.Button("Submit Eval")
|
252 |
submit_button.click(add_new_eval, [model_name_textbox, base_model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
|
253 |
-
|
254 |
|
255 |
block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
|
256 |
block.launch()
|
|
|
43 |
with open(file_path) as fp:
|
44 |
data = json.load(fp)
|
45 |
accs = np.array([v[metric] for k, v in data["results"].items()])
|
46 |
+
mean_acc = np.mean(accs)
|
47 |
return mean_acc, data["config"]["model_args"]
|
48 |
|
49 |
|
50 |
+
COLS = ["Model", "Revision", "Average β¬οΈ", "ARC (25-shot) β¬οΈ", "HellaSwag (10-shot) β¬οΈ", "MMLU (5-shot) β¬οΈ", "TruthfulQA (0-shot) β¬οΈ"]
|
51 |
TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]
|
52 |
|
53 |
if not IS_PUBLIC:
|
|
|
57 |
EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
|
58 |
EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
|
59 |
def get_leaderboard():
|
60 |
+
if repo:
|
61 |
print("pulling changes")
|
62 |
repo.git_pull()
|
63 |
+
|
64 |
all_data = get_eval_results_dicts(IS_PUBLIC)
|
65 |
+
|
66 |
if not IS_PUBLIC:
|
67 |
gpt4_values = {
|
68 |
+
"Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
|
69 |
+
"Revision":"tech report",
|
70 |
"8bit":None,
|
71 |
"Average β¬οΈ":84.3,
|
72 |
"ARC (25-shot) β¬οΈ":96.3,
|
73 |
"HellaSwag (10-shot) β¬οΈ":95.3,
|
74 |
"MMLU (5-shot) β¬οΈ":86.4,
|
75 |
+
"TruthfulQA (0-shot) β¬οΈ":59.0,
|
76 |
}
|
77 |
all_data.append(gpt4_values)
|
78 |
gpt35_values = {
|
79 |
+
"Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
|
80 |
+
"Revision":"tech report",
|
81 |
"8bit":None,
|
82 |
"Average β¬οΈ":71.9,
|
83 |
"ARC (25-shot) β¬οΈ":85.2,
|
84 |
"HellaSwag (10-shot) β¬οΈ":85.5,
|
85 |
"MMLU (5-shot) β¬οΈ":70.0,
|
86 |
+
"TruthfulQA (0-shot) β¬οΈ":47.0,
|
87 |
}
|
88 |
all_data.append(gpt35_values)
|
89 |
+
|
90 |
dataframe = pd.DataFrame.from_records(all_data)
|
91 |
dataframe = dataframe.sort_values(by=['Average β¬οΈ'], ascending=False)
|
92 |
print(dataframe)
|
|
|
94 |
return dataframe
|
95 |
|
96 |
def get_eval_table():
|
97 |
+
if repo:
|
98 |
print("pulling changes for eval")
|
99 |
repo.git_pull()
|
100 |
+
entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
|
101 |
all_evals = []
|
102 |
+
|
103 |
for entry in entries:
|
104 |
print(entry)
|
105 |
if ".json"in entry:
|
106 |
file_path = os.path.join("evals/eval_requests", entry)
|
107 |
with open(file_path) as fp:
|
108 |
data = json.load(fp)
|
109 |
+
|
110 |
data["# params"] = "unknown"
|
111 |
data["model"] = make_clickable_model(data["model"])
|
112 |
data["revision"] = data.get("revision", "main")
|
113 |
+
|
114 |
|
115 |
all_evals.append(data)
|
116 |
else:
|
117 |
# this is a folder
|
118 |
+
sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
|
119 |
for sub_entry in sub_entries:
|
120 |
file_path = os.path.join("evals/eval_requests", entry, sub_entry)
|
121 |
with open(file_path) as fp:
|
122 |
data = json.load(fp)
|
123 |
+
|
124 |
#data["# params"] = get_n_params(data["model"])
|
125 |
data["model"] = make_clickable_model(data["model"])
|
126 |
all_evals.append(data)
|
127 |
|
128 |
+
|
129 |
dataframe = pd.DataFrame.from_records(all_evals)
|
130 |
return dataframe[EVAL_COLS]
|
131 |
|
|
|
137 |
try:
|
138 |
config = AutoConfig.from_pretrained(model_name, revision=revision)
|
139 |
return True
|
140 |
+
|
141 |
except Exception as e:
|
142 |
print("Could not get the model config from the hub")
|
143 |
print(e)
|
144 |
return False
|
145 |
+
|
146 |
|
147 |
|
148 |
def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
|
|
|
152 |
if is_delta_weight and not is_model_on_hub(base_model, revision):
|
153 |
print(base_model, "base model not found on hub")
|
154 |
return
|
155 |
+
|
156 |
if not is_model_on_hub(model, revision):
|
157 |
print(model, "not found on hub")
|
158 |
return
|
159 |
print("adding new eval")
|
160 |
+
|
161 |
eval_entry = {
|
162 |
"model" : model,
|
163 |
"base_model" : base_model,
|
|
|
166 |
"8bit_eval" : is_8_bit_eval,
|
167 |
"is_delta_weight" : is_delta_weight,
|
168 |
"status" : "PENDING"
|
169 |
+
}
|
170 |
+
|
171 |
user_name = ""
|
172 |
model_path = model
|
173 |
if "/" in model:
|
174 |
user_name = model.split("/")[0]
|
175 |
model_path = model.split("/")[1]
|
176 |
+
|
177 |
OUT_DIR=f"eval_requests/{user_name}"
|
178 |
os.makedirs(OUT_DIR, exist_ok=True)
|
179 |
out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
|
180 |
+
|
181 |
with open(out_path, "w") as f:
|
182 |
f.write(json.dumps(eval_entry))
|
183 |
LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
|
184 |
+
|
185 |
api = HfApi()
|
186 |
api.upload_file(
|
187 |
path_or_fileobj=out_path,
|
|
|
191 |
repo_type="dataset",
|
192 |
)
|
193 |
|
194 |
+
|
195 |
def refresh():
|
196 |
return get_leaderboard(), get_eval_table()
|
197 |
+
|
198 |
|
199 |
|
200 |
block = gr.Blocks()
|
201 |
+
with block:
|
202 |
with gr.Row():
|
203 |
gr.Markdown(f"""
|
204 |
# π€ Open LLM Leaderboard
|
|
|
208 |
- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
|
209 |
- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
|
210 |
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
|
211 |
+
- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
|
212 |
|
213 |
We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
|
214 |
""")
|
215 |
+
|
216 |
with gr.Row():
|
217 |
leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
|
218 |
datatype=TYPES, max_rows=5)
|
219 |
|
220 |
+
|
221 |
+
|
222 |
with gr.Row():
|
223 |
gr.Markdown(f"""
|
224 |
# Evaluation Queue for the π€ Open LLM Leaderboard, these models will be automatically evaluated on the π€ cluster
|
225 |
+
|
226 |
""")
|
227 |
with gr.Accordion("Evaluation Queue", open=False):
|
228 |
with gr.Row():
|
229 |
eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
|
230 |
+
datatype=EVAL_TYPES, max_rows=5)
|
231 |
+
|
232 |
with gr.Row():
|
233 |
refresh_button = gr.Button("Refresh")
|
234 |
+
refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
|
235 |
+
|
236 |
with gr.Accordion("Submit a new model for evaluation"):
|
|
|
|
|
237 |
with gr.Row():
|
238 |
with gr.Column():
|
239 |
model_name_textbox = gr.Textbox(label="Model name")
|
240 |
revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
|
241 |
+
|
242 |
with gr.Column():
|
243 |
is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
|
244 |
private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
|
245 |
is_delta_weight = gr.Checkbox(False, label="Delta weights")
|
246 |
base_model_name_textbox = gr.Textbox(label="base model (for delta)")
|
247 |
+
|
248 |
with gr.Row():
|
249 |
submit_button = gr.Button("Submit Eval")
|
250 |
submit_button.click(add_new_eval, [model_name_textbox, base_model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
|
251 |
+
|
252 |
|
253 |
block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
|
254 |
block.launch()
|
utils.py
CHANGED
@@ -21,7 +21,7 @@ BENCH_TO_NAME = {
|
|
21 |
"arc_challenge":"ARC (25-shot) β¬οΈ",
|
22 |
"hellaswag":"HellaSwag (10-shot) β¬οΈ",
|
23 |
"hendrycks":"MMLU (5-shot) β¬οΈ",
|
24 |
-
"truthfulqa_mc":"
|
25 |
}
|
26 |
def make_clickable_model(model_name):
|
27 |
LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
|
|
|
21 |
"arc_challenge":"ARC (25-shot) β¬οΈ",
|
22 |
"hellaswag":"HellaSwag (10-shot) β¬οΈ",
|
23 |
"hendrycks":"MMLU (5-shot) β¬οΈ",
|
24 |
+
"truthfulqa_mc":"TruthfulQA (0-shot) β¬οΈ",
|
25 |
}
|
26 |
def make_clickable_model(model_name):
|
27 |
LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
|