Commit d0e8be9 — "ruff format everything"
committed by hi-melnikov • 1 parent: a70555b

Files changed (each diff below shows the changed hunks in their formatted, post-commit state; lines added or rewritten by this commit are prefixed with "+", and removed lines are shown with "-" where they could be recovered):
- app.py +42 -39
- src/display/css_html_js.py +1 -1
- src/display/utils.py +3 -2
- src/gen/gen_answer.py +54 -46
- src/gen/gen_judgment.py +23 -22
- src/gen/show_result.py +49 -38
- src/gen/utils.py +25 -44
- src/leaderboard/build_leaderboard.py +34 -18
- src/leaderboard/filter_models.py +5 -6
- src/leaderboard/read_evals.py +27 -29
- src/populate.py +1 -3
- src/scripts/create_request_file.py +1 -1
- src/scripts/update_all_request_files.py +2 -2
- src/submission/check_validity.py +1 -1
- src/submission/submit.py +2 -22
- src/tools/plots.py +1 -1
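
This commit is the output of Ruff's code formatter ("ruff format"), so the changes below are mostly mechanical: quote normalization, wrapping or collapsing to the line-length limit, and blank-line conventions, plus a handful of small cleanups. As one illustration, taken from the src/gen/gen_judgment.py hunk further down (the surrounding variables are assumed to be in scope), the formatter collapses a multi-line dict literal that fits within the line limit:

# Before: the literal is spread over several lines with inconsistent spacing
output = {
    "question_id":question["question_id"],
    "model":answer["model_id"],
    "judge": model,
    "games":[]
}

# After `ruff format`: the literal fits on one line, so it is collapsed
output = {"question_id": question["question_id"], "model": answer["model_id"], "judge": model, "games": []}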
app.py
CHANGED
@@ -24,39 +24,33 @@ from src.envs import (
 )
 from src.leaderboard.build_leaderboard import build_leadearboard_df

+os.environ["GRADIO_ANALYTICS_ENABLED"] = "false"

 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

 # Start ephemeral Spaces on PRs (see config in README.md)
 enable_space_ci()

+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)


 def build_demo():
+    demo = gr.Blocks(title="Chatbot Arena Leaderboard", css=custom_css)
     leaderboard_df = build_leadearboard_df()
     with demo:
         gr.HTML(TITLE)
         gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

+        with gr.Tabs(elem_classes="tab-buttons"):
             with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+                Leaderboard(
                     value=leaderboard_df,
                     datatype=[c.type for c in fields(AutoEvalColumn)],
                     select_columns=SelectColumns(
+                        default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                         cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
                         label="Select Columns to Display:",
                     ),
@@ -67,50 +61,59 @@ def build_demo():
                 ],
             )

+            # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=1):
             #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+            # with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=2):
             #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")

             with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=3):
                 with gr.Row():
                     gr.Markdown("# ✨ Submit your model here!", elem_classes="markdown-text")

                 with gr.Column():
+                    model_name_textbox = gr.Textbox(label="Model name")
+
+                    def upload_file(file):
+                        file_path = file.name.split("/")[-1] if "/" in file.name else file.name
+                        logging.info("New submition: file saved to %s", file_path)
+                        API.upload_file(
+                            path_or_fileobj=file.name,
+                            path_in_repo="./external/" + file_path,
+                            repo_id="Vikhrmodels/openbench-eval",
+                            repo_type="dataset",
+                        )
+                        os.environ[RESET_JUDGEMENT_ENV] = "1"
+                        return file.name
+
+                    if model_name_textbox:
+                        file_output = gr.File()
+                        upload_button = gr.UploadButton(
+                            "Click to Upload & Submit Answers", file_types=["*"], file_count="single"
+                        )
+                        upload_button.upload(upload_file, upload_button, file_output)
+
     return demo
+
+
 # print(os.system('cd src/gen && ../../.venv/bin/python gen_judgment.py'))
 # print(os.system('cd src/gen/ && python show_result.py --output'))
+
+
 def update_board():
     need_reset = os.environ.get(RESET_JUDGEMENT_ENV)
+    if need_reset != "1":
         return
+    os.environ[RESET_JUDGEMENT_ENV] = "0"
+    subprocess.run(["python", "src/gen/gen_judgement.py"], check=False)
+    subprocess.Popen("python3.src/gen/show_result.py --output")


 if __name__ == "__main__":
+    os.environ[RESET_JUDGEMENT_ENV] = "1"
+
     scheduler = BackgroundScheduler()
+    scheduler.add_job(update_board, "interval", minutes=10)
     scheduler.start()
+
     demo_app = build_demo()
     demo_app.launch(debug=True)
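
For orientation, app.py wires a background job to the environment-variable flag visible in the hunk above: every ten minutes update_board checks RESET_JUDGEMENT_ENV and, if it is set, re-runs the judging script. A minimal, self-contained sketch of that pattern (the flag name, script path, and the closing sleep are placeholders; the real app takes its constants from src.envs and blocks inside demo_app.launch):

# Sketch of the scheduler/flag pattern from app.py (assumptions noted inline).
import os
import subprocess
import time

from apscheduler.schedulers.background import BackgroundScheduler

RESET_JUDGEMENT_ENV = "RESET_JUDGEMENT"  # assumed name; the real constant comes from src.envs


def update_board():
    # Only re-run judging when something (e.g. a new submission) set the flag.
    if os.environ.get(RESET_JUDGEMENT_ENV) != "1":
        return
    os.environ[RESET_JUDGEMENT_ENV] = "0"
    subprocess.run(["python", "src/gen/gen_judgement.py"], check=False)


if __name__ == "__main__":
    os.environ[RESET_JUDGEMENT_ENV] = "1"
    scheduler = BackgroundScheduler()
    scheduler.add_job(update_board, "interval", minutes=10)
    scheduler.start()
    time.sleep(3600)  # stands in for demo_app.launch(debug=True), which blocks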
src/display/css_html_js.py
CHANGED
@@ -88,4 +88,4 @@ get_window_url_params = """
     url_params = Object.fromEntries(params);
     return url_params;
 }
-"""
+"""
src/display/utils.py
CHANGED
@@ -7,7 +7,8 @@ import pandas as pd


 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+

 def parse_datetime(datetime_str):
     formats = [
@@ -25,6 +26,7 @@ def parse_datetime(datetime_str):
     logging.error(f"No valid date format found for: {datetime_str}")
     return datetime(1970, 1, 1)

+
 def load_json_data(file_path):
     """Safely load JSON data from a file."""
     try:
@@ -98,7 +100,6 @@ auto_eval_column_dict.append(["score", ColumnContent, ColumnContent("score", "nu
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)


 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
src/gen/gen_answer.py
CHANGED
@@ -33,7 +33,14 @@ from utils import (


 def get_answer(
+    question: dict,
+    model: str,
+    endpoint_info: dict,
+    num_choices: int,
+    max_tokens: int,
+    temperature: float,
+    answer_file: str,
+    api_dict: dict,
 ):
     if question["category"] in temperature_config:
         temperature = temperature_config[question["category"]]
@@ -54,49 +61,56 @@ def get_answer(
     for j in range(len(question["turns"])):
         conv.append({"role": "user", "content": question["turns"][j]["content"]})
         if api_type == "anthropic":
+            output = chat_completion_anthropic(
+                model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
+            )
         elif api_type == "mistral":
+            output = chat_completion_mistral(
+                model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
+            )
         elif api_type == "yandex":
+            output = chat_completion_yandex(
+                model=endpoint_info["model_name"],
+                messages=conv,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                api_dict=api_dict,
+            )
         elif api_type == "gigachat":
+            output = chat_completion_gigachat(
+                model=endpoint_info["model_name"],
+                messages=conv,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                api_dict=api_dict,
+            )
         elif api_type == "gemini":
+            output = chat_completion_gemini(
+                model=endpoint_info["model_name"],
+                messages=question["turns"][j]["content"],
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
         elif api_type == "azure":
+            output = chat_completion_openai_azure(
+                model=endpoint_info["model_name"],
+                messages=conv,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                api_dict=api_dict,
+            )
         elif api_type == "cohere":
+            output = chat_completion_cohere(
+                model=endpoint_info["model_name"], messages=conv, temperature=temperature, max_tokens=max_tokens
+            )
         else:
+            output = chat_completion_openai(
+                model=endpoint_info["model_name"],
+                messages=conv,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                api_dict=api_dict,
+            )
         conv.append({"role": "assistant", "content": output})

         turns.append({"content": output, "token_len": len(encoding.encode(output))})
@@ -118,12 +132,8 @@ def get_answer(

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--setting-file", type=str, default="config/gen_answer_config.yaml")
+    parser.add_argument("--endpoint-file", type=str, default="config/api_config.yaml")
     args = parser.parse_args()

     settings = make_config(args.setting_file)
@@ -187,9 +197,7 @@ if __name__ == "__main__":
             futures.append(future)
         if count > 0:
             print(f"{count} number of existing answers")
+        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
             future.result()

     reorg_answer_file(answer_file)
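
One detail worth calling out in get_answer, whose signature is expanded above: the sampling temperature is overridden per question category before the request is dispatched to the provider-specific chat_completion_* helper. A tiny sketch of that lookup (the category names and values here are hypothetical; the real mapping comes from the benchmark configuration):

# Per-category temperature override, mirroring the check at the top of get_answer().
temperature_config = {"math": 0.0, "creative_writing": 0.7}  # hypothetical values


def pick_temperature(question: dict, temperature: float) -> float:
    # A category-specific setting wins over whatever the caller passed in.
    if question["category"] in temperature_config:
        temperature = temperature_config[question["category"]]
    return temperature


print(pick_temperature({"category": "math"}, 0.7))  # -> 0.0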
src/gen/gen_judgment.py
CHANGED
@@ -55,12 +55,7 @@ def judgment(**args):

     num_games = 2 if configs["pairwise"] else 1

-    output = {
-        "question_id":question["question_id"],
-        "model":answer["model_id"],
-        "judge": model,
-        "games":[]
-    }
+    output = {"question_id": question["question_id"], "model": answer["model_id"], "judge": model, "games": []}

     for game in range(num_games):
         conv = [{"role": "system", "content": configs["system_prompt"]}]
@@ -73,7 +68,7 @@ def judgment(**args):
             base = 1

         if baseline:
+            if game % 2 == 1:  # swap position
                 temp = baseline
                 baseline = answer
                 answer = temp
@@ -103,7 +98,7 @@ def judgment(**args):
                 args["endpoint_dict"],
             )

+            judgment += "\n" + new_judgment

             score, try_again = get_score(judgment, args["regex_pattern"])

@@ -112,18 +107,21 @@ def judgment(**args):
             if not try_again:
                 break

+            conv.append(
+                {"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"}
+            )

+        result = {"user_prompt": conv[1]["content"], "judgment": judgment, "score": score}
         output["games"].append(result)

     with open(output_file, "a") as f:
         f.write(json.dumps(output, ensure_ascii=False) + "\n")
+    huggingface_hub.HfApi().upload_file(
+        output_file,
+        path_in_repo=f'model_judgment/{configs['judge_model']}/{output_file.split('/')[-1]}',
+        repo_id="Vikhrmodels/openbench-eval",
+        repo_type="dataset",
+    )


 if __name__ == "__main__":
@@ -136,8 +134,10 @@ if __name__ == "__main__":
     configs = make_config(args.setting_file)
     endpoint_list = make_config(args.endpoint_file)

+    print(
+        f'judge model: {configs["judge_model"]}, baseline: {configs["baseline"]}, baseline model: {configs["baseline_model"]}, reference: {configs["reference"]}, '
+        + f'reference models: {configs["ref_model"]}, temperature: {configs["temperature"]}, max tokens: {configs["max_tokens"]}, pairwise: {configs["pairwise"]}'
+    )

     if configs["regex_pattern"]:
         pattern = re.compile(configs["regex_pattern"])
@@ -150,12 +150,15 @@ if __name__ == "__main__":
     questions = load_questions(question_file)
     model_answers_external = load_model_answers(external_dir)
     model_answers_internal = load_model_answers(internal_dir)
+
     # internal has priority
     model_answers = {**model_answers_external, **model_answers_internal}

     # if user choose a set of models, only judge those models
+    models = [
+        model.split("/")[-1].split(".")[0]
+        for model in glob.glob("./data/arena-hard-v0.1/model_answer/external/*.jsonl")
+    ]

     ref_answers = None
     if configs["reference"]:
@@ -214,7 +217,5 @@ if __name__ == "__main__":
     if count > 0:
         print(f"{count} number of existing judgments")

+    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
         future.result()
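
The judging step now pushes each finished judgment file straight to the evaluation dataset repository, as the hunk above shows. A hedged sketch of that upload with huggingface_hub (the repo id matches the diff; the local path, judge-model folder, and token handling are illustrative):

# Illustrative upload of a judgment file to the Space's dataset repo.
from huggingface_hub import HfApi

api = HfApi()  # reads HF_TOKEN from the environment if it is set
api.upload_file(
    path_or_fileobj="data/arena-hard-v0.1/model_judgment/judge-model/answers.jsonl",  # hypothetical local file
    path_in_repo="model_judgment/judge-model/answers.jsonl",
    repo_id="Vikhrmodels/openbench-eval",
    repo_type="dataset",
)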
src/gen/show_result.py
CHANGED
@@ -2,7 +2,6 @@ import pandas as pd
 import numpy as np
 import plotly.express as px

-import tiktoken
 import datetime
 import argparse
 import os
@@ -15,6 +14,7 @@ from sklearn.linear_model import LogisticRegression
 from collections import defaultdict
 from utils import load_model_answers

+
 def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     models = pd.concat([df["model_a"], df["model_b"]]).unique()
     models = pd.Series(np.arange(len(models)), index=models)
@@ -35,18 +35,18 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     # one tie => one A win + one B win
     # find tie + tie (both bad) index
     tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
+    tie_idx[len(tie_idx) // 2 :] = False
     Y[tie_idx] = 1.0

     lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
+    lr.fit(X, Y)

     elo_scores = SCALE * lr.coef_[0] + INIT_RATING

     # set anchor as gpt-3.5-turbo-0125 = 1000
     if "gpt-3.5-turbo-0125" in models.index:
         elo_scores += 1000 - elo_scores[models["gpt-3.5-turbo-0125"]]
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)


 def get_bootstrap_result(battles, func_compute_elo, num_round):
@@ -58,9 +58,14 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):


 def preety_print_two_ratings(ratings_1, ratings_2, column_names):
+    df = (
+        pd.DataFrame(
+            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
+            columns=["Model", column_names[0], column_names[1]],
+        )
+        .sort_values(column_names[0], ascending=False)
+        .reset_index(drop=True)
+    )
     df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
     df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
     df.index = df.index + 1
@@ -68,18 +73,24 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):


 def visualize_bootstrap_scores(df, title):
+    bars = (
+        pd.DataFrame(dict(lower=df.quantile(0.025), rating=df.quantile(0.5), upper=df.quantile(0.975)))
+        .reset_index(names="model")
+        .sort_values("rating", ascending=False)
+    )
+    bars["error_y"] = bars["upper"] - bars["rating"]
+    bars["error_y_minus"] = bars["rating"] - bars["lower"]
+    bars["rating_rounded"] = np.round(bars["rating"], 2)
+    fig = px.scatter(
+        bars,
+        x="model",
+        y="rating",
+        error_y="error_y",
+        error_y_minus="error_y_minus",
+        text="rating_rounded",
+        title=title,
+    )
+    fig.update_layout(xaxis_title="Model", yaxis_title="Rating", height=600)
     return fig


@@ -92,10 +103,7 @@ def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
             wins[a][b] = ea
             wins[b][a] = 1 - ea

+    data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}

     df = pd.DataFrame(data, index=names)
     df.index.name = "model_a"
@@ -121,9 +129,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3):

     for _, row in df.iterrows():
         # game 1
+        output = {"question_id": row["question_id"], "model_a": "gpt-3.5-turbo-0125", "model_b": row["model"]}

         game = row["games"][0]

@@ -148,9 +154,7 @@ def get_battles_from_judgment(judge_name, first_game_only=False, WEIGHT=3):

         if not first_game_only:
             # game 2
+            output = {"question_id": row["question_id"], "model_a": "gpt-3.5-turbo-0125", "model_b": row["model"]}

             game = row["games"][1]

@@ -190,7 +194,9 @@ if __name__ == "__main__":
     parser.add_argument("--first-game-only", action="store_true")
     args = parser.parse_args()
     print(args)
+    assert not args.load_bootstrap or (
+        args.load_battles and args.load_bootstrap
+    ), "If loading prexisting bootstrapping data, you must also load preexisting battles."

     answer_dir = os.path.join("data", args.bench_name, "model_answer/external")
     model_answers = load_model_answers(answer_dir)
@@ -203,7 +209,6 @@ if __name__ == "__main__":

     bootstrap_online_elo = compute_mle_elo(battles)

     if args.load_bootstrap:
         bootstrap_elo_lu = pd.read_json("data/bootstrapping_results.jsonl", lines=True)
     else:
@@ -213,7 +218,7 @@ if __name__ == "__main__":

     stats = pd.DataFrame()
     stats["results"] = None
+    stats["results"] = stats["results"].astype("object")

     for i, model in enumerate(bootstrap_online_elo.index):
         assert model in bootstrap_elo_lu.columns
@@ -241,18 +246,24 @@ if __name__ == "__main__":
         decimal = 1
     else:
         decimal = 0
+    stats = stats.astype({"score": int, "lower": int, "upper": int})

     stats.sort_values(by="score", ascending=False, inplace=True)
     for _, row in stats.iterrows():
+        interval = str((round(row["lower"] - row["score"], decimal), round(row["upper"] - row["score"], decimal)))
+        print(
+            f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}"
+        )

     if args.output:
         cur_date = datetime.datetime.now()
         date_str = cur_date.strftime("%Y%m%d")
         stats.to_json(f"arena_hard_leaderboard_{date_str}.json", orient="records", indent=4)
         import huggingface_hub
+
+        huggingface_hub.HfApi().upload_file(
+            path_or_fileobj=f"arena_hard_leaderboard_{date_str}.json",
+            path_in_repo="evals/upd.json",
+            repo_id="Vikhrmodels/openbench-eval",
+            repo_type="dataset",
+        )
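
The 95% confidence intervals printed by show_result.py come from bootstrapping: the battle table is resampled with replacement, the Elo fit is recomputed for each round, and the 2.5%/97.5% quantiles of the resulting ratings form the interval (visualize_bootstrap_scores above uses the same quantiles). A compact sketch of that idea, with the rating function passed in as a stand-in for compute_mle_elo:

# Bootstrap sketch: resample battles, recompute ratings per round, take quantiles.
import pandas as pd


def bootstrap_ratings(battles: pd.DataFrame, compute_rating, num_round: int = 100) -> pd.DataFrame:
    rows = []
    for _ in range(num_round):
        sample = battles.sample(frac=1.0, replace=True)  # resample with replacement
        rows.append(compute_rating(sample))  # a Series of ratings indexed by model
    return pd.DataFrame(rows)  # one row per bootstrap round, one column per model


# Usage (illustrative):
# boot = bootstrap_ratings(battles, compute_mle_elo, num_round=100)
# lower, median, upper = boot.quantile(0.025), boot.quantile(0.5), boot.quantile(0.975)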
src/gen/utils.py
CHANGED
@@ -77,9 +77,9 @@ def get_endpoint(endpoint_list):
         return None
     assert endpoint_list is not None
     # randomly pick one
+    api_dict = random.choices(endpoint_list)[0]
     return api_dict


@@ -91,9 +89,11 @@ def make_config(config_file: str) -> dict:

     return config_kwargs

+
 def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=None):
     from gigachat import GigaChat
     from gigachat.models import Chat, Messages
+
     assert api_dict is not None, "no api settings provided!"
     auth_token = api_dict.get("auth_token", os.environ.get(api_dict["auth_token"], ""))
     client = GigaChat(credentials=auth_token, model=model, verify_ssl_certs=False)
@@ -115,15 +115,13 @@ def chat_completion_gigachat(model, messages, temperature, max_tokens, api_dict=

     return output

+
 def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=None):
     from yandex_gpt import YandexGPT, YandexGPTConfigManagerForIAMToken
+
     assert api_dict is not None, "no api settings provided!"
     iam_token = api_dict.get("iam_token", os.environ.get(api_dict["iam_token_ENV"], ""))
+    config = YandexGPTConfigManagerForIAMToken(model_type=model, catalog_id=api_dict["catalog_id"], iam_token=iam_token)
     client = YandexGPT(config_manager=config)

     messages = [{"role": m["role"], "text": m["content"]} for m in messages]
@@ -147,6 +145,7 @@ def chat_completion_yandex(model, messages, temperature, max_tokens, api_dict=No

 def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
     import openai
+
     api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
     if api_dict:
         client = openai.OpenAI(
@@ -165,8 +164,8 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No
                 messages=messages,
                 temperature=temperature,
                 max_tokens=max_tokens,
+                stop=["</s>", "<eos>", "<|eot_id|>"],
+            )
             output = completion.choices[0].message.content
             break
         except openai.RateLimitError as e:
@@ -175,7 +174,7 @@ def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=No
         except openai.BadRequestError as e:
             print(messages)
             print(type(e), e)
+        except KeyError as e:
             print(type(e), e)
             break

@@ -189,11 +188,7 @@ def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_d
     api_base = api_dict["api_base"]
     api_key = api_dict.get("api_key", os.environ.get(api_dict["api_key_ENV"], ""))
     client = AzureOpenAI(
+        azure_endpoint=api_base, api_key=api_key, api_version=api_dict["api_version"], timeout=240, max_retries=2
     )

     output = API_ERROR_OUTPUT
@@ -215,7 +210,7 @@ def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_d
         except openai.BadRequestError as e:
             print(type(e), e)
             break
+        except KeyError as e:
             print(type(e), e)
             break

@@ -246,7 +241,7 @@ def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict
             stop_sequences=[anthropic.HUMAN_PROMPT],
             max_tokens=max_tokens,
             temperature=temperature,
+            system=sys_msg,
         )
         output = response.content[0].text
         break
@@ -286,25 +281,14 @@ def chat_completion_mistral(model, messages, temperature, max_tokens):

 def chat_completion_gemini(model, messages, temperature, max_tokens):
     import google.generativeai as genai
+
     genai.configure(api_key=os.environ["GEMINI_API_KEY"])

     safety_settings = [
+        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+        {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
+        {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
     ]

     # Set up the model
@@ -319,9 +303,8 @@ def chat_completion_gemini(model, messages, temperature, max_tokens):
     for _ in range(API_MAX_RETRY):
         try:
             gemini = genai.GenerativeModel(
+                model_name=model, generation_config=generation_config, safety_settings=safety_settings
+            )

             convo = gemini.start_chat(history=[])

@@ -344,9 +327,7 @@ def chat_completion_cohere(model, messages, temperature, max_tokens):
     co = cohere.Client(os.environ["COHERE_API_KEY"])
     assert len(messages) > 0

+    template_map = {"system": "SYSTEM", "assistant": "CHATBOT", "user": "USER"}

     assert messages[-1]["role"] == "user"
     prompt = messages[-1]["content"]
@@ -354,7 +335,7 @@ def chat_completion_cohere(model, messages, temperature, max_tokens):
     if len(messages) > 1:
         history = []
         for message in messages[:-1]:
+            history.append({"role": template_map[message["role"]], "message": message["content"]})
     else:
         history = None

@@ -384,9 +365,9 @@ def reorg_answer_file(answer_file):
     """Sort by question id and de-duplication"""
     answers = {}
     with open(answer_file, "r") as fin:
+        for line in fin:
+            qid = json.loads(line)["question_id"]
+            answers[qid] = line

     qids = sorted(list(answers.keys()))
     with open(answer_file, "w") as fout:
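
reorg_answer_file, reformatted above, deduplicates a JSONL answer file by question id (later lines win) and rewrites it with ids in sorted order. The same pattern as a standalone sketch (the path in the usage comment is hypothetical):

# De-duplicate a JSONL file keyed on "question_id" and rewrite it sorted by id.
import json


def reorg_jsonl(path: str) -> None:
    answers = {}
    with open(path, "r") as fin:
        for line in fin:
            qid = json.loads(line)["question_id"]
            answers[qid] = line  # a later duplicate overwrites the earlier one
    with open(path, "w") as fout:
        for qid in sorted(answers):
            fout.write(answers[qid])


# reorg_jsonl("data/arena-hard-v0.1/model_answer/external/my-model.jsonl")  # hypothetical path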
src/leaderboard/build_leaderboard.py
CHANGED
@@ -1,4 +1,3 @@
-
 import json
 import logging
 import os
@@ -11,7 +10,8 @@ from huggingface_hub import snapshot_download
 from src.envs import EVAL_RESULTS_PATH

 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+

 def time_diff_wrapper(func):
     def wrapper(*args, **kwargs):
@@ -21,15 +21,17 @@ def time_diff_wrapper(func):
         diff = end_time - start_time
         logging.info(f"Time taken for {func.__name__}: {diff} seconds")
         return result
+
     return wrapper

+
 @time_diff_wrapper
 def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
     """Download dataset with exponential backoff retries."""
     attempt = 0
     while attempt < max_attempts:
         try:
+            logging.info("Downloading %s to %s", repo_id, local_dir)
             snapshot_download(
                 repo_id=repo_id,
                 local_dir=local_dir,
@@ -42,27 +44,41 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
             logging.info("Download successful")
             return
         except Exception as e:
+            wait_time = backoff_factor**attempt
             logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
             time.sleep(wait_time)
             attempt += 1
     logging.error(f"Failed to download {repo_id} after {max_attempts} attempts")

+
 def build_leadearboard_df():
     """Initializes the application space, loading only necessary data."""
-    # Check ENV LEADERBOARD_DOWNLOAD if wee need to download the leaderboard
-    if os.getenv("LEADERBOARD_DOWNLOAD", "True") == "True":
-        # These downloads only occur on full initialization
-        # try:
-        # download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
-        # download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
-        download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
-        # print(subprocess.Popen('ls src'))
-        subprocess.run(['rsync', '-avzP', '--ignore-existing', f'{EVAL_RESULTS_PATH[2:]}/external/*', 'src/gen/data/arena-hard-v0.1/model_answer/'], check=False)
-        subprocess.run(['rsync', '-avzP', '--ignore-existing', f'{EVAL_RESULTS_PATH[2:]}/model_judgment/*', 'src/gen/data/arena-hard-v0.1/model_judgement/'], check=False)
-        # except Exception:
-        #     restart_space()

+    # download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
+    # download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
+    download_dataset("Vikhrmodels/openbench-eval", EVAL_RESULTS_PATH)
+    # print(subprocess.Popen('ls src'))
+    subprocess.run(
+        [
+            "rsync",
+            "-avzP",
+            "--ignore-existing",
+            f"{EVAL_RESULTS_PATH}/external/*",
+            "src/gen/data/arena-hard-v0.1/model_answer/",
+        ],
+        check=False,
+    )
+    subprocess.run(
+        [
+            "rsync",
+            "-avzP",
+            "--ignore-existing",
+            f"{EVAL_RESULTS_PATH}/model_judgment/*",
+            "src/gen/data/arena-hard-v0.1/model_judgement/",
+        ],
+        check=False,
+    )
+
+    # Retrieve the leaderboard DataFrame
+    leaderboard_df = pd.DataFrame.from_records(json.load(open("eval-results/evals/upd.json", "r")))
     return leaderboard_df.copy()
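
download_dataset retries snapshot_download with an exponentially growing delay of backoff_factor**attempt seconds. A minimal sketch of that retry shape, with the download call passed in as a placeholder:

# Exponential-backoff retry sketch: wait backoff_factor**attempt seconds between attempts.
import logging
import time


def with_retries(download, max_attempts: int = 3, backoff_factor: float = 1.5) -> bool:
    for attempt in range(max_attempts):
        try:
            download()  # placeholder for snapshot_download(...)
            return True
        except Exception as e:  # broad catch mirrors the helper in the Space
            wait_time = backoff_factor**attempt
            logging.error("Download failed (%s), retrying in %.1fs", e, wait_time)
            time.sleep(wait_time)
    return False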
src/leaderboard/filter_models.py
CHANGED
@@ -137,9 +137,9 @@ def flag_models(leaderboard_data: list[dict]):
         if model_data[AutoEvalColumn.not_flagged.name]:
             flag_key = model_data[AutoEvalColumn.fullname.name]
         else:
+            # Merges and moes are flagged
             flag_key = "merged"
+
         # Reverse the logic: Check for non-flagged models instead
         if flag_key in FLAGGED_MODELS:
             issue_num = FLAGGED_MODELS[flag_key].split("/")[-1]
@@ -147,9 +147,9 @@ def flag_models(leaderboard_data: list[dict]):
                 FLAGGED_MODELS[flag_key],
                 f"See discussion #{issue_num}",
             )
+            model_data[
+                AutoEvalColumn.model.name
+            ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
             model_data[AutoEvalColumn.not_flagged.name] = False
         else:
             model_data[AutoEvalColumn.not_flagged.name] = True
@@ -171,4 +171,3 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
 def filter_models_flags(leaderboard_data: list[dict]):
     leaderboard_data = remove_forbidden_models(leaderboard_data)
     flag_models(leaderboard_data)
-
src/leaderboard/read_evals.py
CHANGED
@@ -16,36 +16,36 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime

 # Configure logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+

 @dataclass
 class EvalResult:
     # Also see src.display.utils.AutoEvalColumn for what will be displayed.
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
     org: Optional[str]
     model: str
+    revision: str  # commit hash, "" if main
     results: Dict[str, float]
     precision: Precision = Precision.Unknown
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original
+    architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
     num_params: int = 0
+    date: str = ""  # submission date of request file
     still_on_hub: bool = True
     is_merge: bool = False
     not_flagged: bool = False
     status: str = "FINISHED"
     # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
     tags: List[str] = field(default_factory=list)
+
     @classmethod
+    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
+        with open(json_filepath, "r") as fp:
             data = json.load(fp)

         config = data.get("config_general", {})
@@ -72,7 +72,7 @@ class EvalResult:
             model=model,
             results=results,
             precision=precision,
+            revision=config.get("model_sha", ""),
         )

     @staticmethod
@@ -118,9 +118,8 @@ class EvalResult:

             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc

+        return results

     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it."""
@@ -130,17 +129,17 @@ class EvalResult:
                 logging.warning(f"No request file for {self.org}/{self.model}")
                 self.status = "FAILED"
                 return
+
             with open(request_file, "r") as f:
                 request = json.load(f)
+
             self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
             self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.num_params = int(request.get("params", 0))  # Ensuring type safety
             self.date = request.get("submitted_time", "")
             self.architecture = request.get("architectures", "Unknown")
             self.status = request.get("status", "FAILED")
+
         except FileNotFoundError:
             self.status = "FAILED"
             logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
@@ -154,7 +153,6 @@ class EvalResult:
             self.status = "FAILED"
             logging.error(f"Unexpected error {e} for {self.org}/{self.model}")

     def update_with_dynamic_file_dict(self, file_dict):
         """Update object attributes based on the provided dictionary, with error handling for missing keys and type validation."""
         # Default values set for optional or potentially missing keys.
@@ -162,11 +160,10 @@ class EvalResult:
         self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
         self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
         self.tags = file_dict.get("tags", [])
+
         # Calculate `flagged` only if 'tags' is not empty and avoid calculating each time
         self.not_flagged = not (any("flagged" in tag for tag in self.tags))

     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
@@ -185,8 +182,10 @@ class EvalResult:
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumn.merged.name: not ("merge" in self.tags if self.tags else False),
+            AutoEvalColumn.moe.name: not (
+                ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower()
+            ),
             AutoEvalColumn.not_flagged.name: self.not_flagged,
         }

@@ -194,16 +193,16 @@ class EvalResult:
             data_dict[task.value.col_name] = self.results[task.value.benchmark]

         return data_dict
+

 def get_request_file_for_model(requests_path, model_name, precision):
     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
     requests_path = Path(requests_path)
     pattern = f"{model_name}_eval_request_*.json"
+
     # Using pathlib to find files matching the pattern
     request_files = list(requests_path.glob(pattern))
+
     # Sort the files by name in descending order to mimic 'reverse=True'
     request_files.sort(reverse=True)

@@ -214,7 +213,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             if req_content["status"] == "FINISHED" and req_content["precision"] == precision.split(".")[-1]:
                 request_file = str(request_file)
+
     # Return empty string if no file found that matches criteria
     return request_file

@@ -223,9 +222,9 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     """From the path of the results folder root, extract all needed info for results"""
     with open(dynamic_path) as f:
         dynamic_data = json.load(f)
+
     results_path = Path(results_path)
+    model_files = list(results_path.rglob("results_*.json"))
     model_files.sort(key=lambda file: parse_datetime(file.stem.removeprefix("results_")))

     eval_results = {}
@@ -260,4 +259,3 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
             continue

     return results
-
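
A small aside on the EvalResult dataclass shown above: tags uses field(default_factory=list) so that every instance gets its own list, which is what the inline comment about the mutable-default pitfall refers to. A quick illustration:

# Why tags uses field(default_factory=list): each instance gets a fresh list.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Result:
    tags: List[str] = field(default_factory=list)


a, b = Result(), Result()
a.tags.append("flagged")
assert b.tags == []  # unaffected; note that a plain `tags: List[str] = []` would raise a ValueError in dataclasses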
src/populate.py
CHANGED
@@ -1,5 +1,3 @@
-import json
-import os
 import pathlib
 import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -21,7 +19,7 @@ def get_evaluation_queue_df(save_path, cols):
     save_path = pathlib.Path(save_path)
     all_evals = []

+    for path in save_path.rglob("*.json"):
         data = load_json_data(path)
         if data:
             all_evals.append(_process_model_data(data))
src/scripts/create_request_file.py
CHANGED
@@ -47,7 +47,7 @@ def main():
     eval_entry = {
         "model": model_name,
         "base_model": base_model,
-        "revision": model_info.sha,
+        "revision": model_info.sha,  # force to use the exact model commit
         "private": False,
         "precision": precision,
         "weight_type": weight_type,
src/scripts/update_all_request_files.py
CHANGED
@@ -91,6 +91,6 @@ def update_models(file_path, models_on_the_hub):

 def update_dynamic_files():
     # from gen import gen_answer,gen_judgment\
+    subprocess.Popen("python3 ../gen/gen_judgement.py")

+    subprocess.Popen("python3 ../gen/show_result.py --output")
src/submission/check_validity.py
CHANGED
@@ -49,7 +49,7 @@ def is_model_on_hub(
         )  # , force_download=True)
         if test_tokenizer:
             try:
+                AutoTokenizer.from_pretrained(
                     model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
                 )
             except ValueError as e:
src/submission/submit.py
CHANGED
@@ -1,21 +1,4 @@
-import os
-from datetime import datetime, timezone
-
-from huggingface_hub import snapshot_download
-
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import (
-    API,
-    DYNAMIC_INFO_FILE_PATH,
-    DYNAMIC_INFO_PATH,
-    DYNAMIC_INFO_REPO,
-    EVAL_REQUESTS_PATH,
-    H4_TOKEN,
-    QUEUE_REPO,
-    RATE_LIMIT_PERIOD,
-    RATE_LIMIT_QUOTA,
-)
+from src.display.formatting import styled_message
 # from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
 # from src.submission.check_validity import (
 #     already_submitted_models,
@@ -38,7 +21,6 @@ def add_new_eval(
     # if not REQUESTED_MODELS:
     #     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

-
     # user_name = ""
     # model_path = model
     # if "/" in model:
@@ -186,6 +168,4 @@ def add_new_eval(
     # # Remove the local file
     # os.remove(out_path)

-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour."
-    )
+    return styled_message("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour.")
src/tools/plots.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
 import plotly.express as px
 from plotly.graph_objs import Figure

+from src.display.utils import AutoEvalColumn, Task, Tasks
 from src.display.utils import human_baseline_row as HUMAN_BASELINE
 from src.leaderboard.filter_models import FLAGGED_MODELS
 from src.leaderboard.read_evals import EvalResult
|