sync from github
- requirements.txt +4 -2
- src/backend/envs.py +1 -0
- src/backend/hflm_with_measurement.py +3 -3
- src/backend/run_eval_suite.py +2 -2
- src/backend/tasks/arena_hard/__init__.py +0 -0
- src/backend/tasks/arena_hard/arena_hard.yaml +2 -0
- src/backend/tasks/arena_hard/arena_judgment.py +256 -0
- src/backend/tasks/arena_hard/arena_utils.py +349 -0
- src/backend/tasks/arena_hard/configs/api_config.yaml +17 -0
- src/backend/tasks/arena_hard/configs/judge_config.yaml +26 -0
- src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl +0 -0
- src/backend/tasks/arena_hard/question.jsonl +0 -0
- src/backend/tasks/arena_hard/task.py +220 -0
- src/backend/tasks/selfcheckgpt/task.py +2 -2
- src/display/utils.py +11 -10
- src/leaderboard/read_evals.py +5 -2
requirements.txt
CHANGED
@@ -16,7 +16,7 @@ requests
 semantic-version
 tqdm
 wandb
-transformers
+transformers
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.2
 accelerate
@@ -31,4 +31,6 @@ spacy==3.7.4
 selfcheckgpt
 immutabledict
 gputil
-bitsandbytes
+bitsandbytes
+openai
+scikit-learn
src/backend/envs.py
CHANGED
@@ -59,6 +59,7 @@ class Tasks(Enum):
     task21 = Task("mmlu", "acc", "MMLU", 5)
     task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
     # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
+    task24 = Task("arena_hard", "score", "Arena Hard", 0)
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
src/backend/hflm_with_measurement.py
CHANGED
@@ -354,6 +354,7 @@ class HFLMWithMeasurement(HFLM):
                     linear_count += 1
                 elif isinstance(module, DbrxExpertGLU):
                     linear_count = 3
+                    element_wise_mul = 1
                 # elif 'experts' not in name:
                 #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
                 #         if "gate_proj" in name:
@@ -388,8 +389,7 @@ class HFLMWithMeasurement(HFLM):
 
         precision_bytes = transfer_precision2bytes(self.precision)
 
-
-        model_size_param = get_model_size(model_info=model_info, precision=self.precision)
+        model_size_param = sum(p.numel() for p in self.model.parameters())
 
         n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
             (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
@@ -429,7 +429,7 @@ class HFLMWithMeasurement(HFLM):
 
         ffn_params = n_layers * d_ff * linear_count * d_model
 
-        shared_params = model_size_param
+        shared_params = model_size_param - num_experts * ffn_params
 
         model_size = shared_params + n_experts_per_tok * ffn_params
 
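For context: the new lines derive the per-token active parameter count of a MoE model from the full checkpoint size rather than a reported model size. The following standalone sketch is an editor's illustration only (not part of the commit); the dimensions are a hypothetical Mixtral-8x7B-like configuration, and total_params stands in for sum(p.numel() for p in model.parameters()).

# Editor's sketch: shared vs. expert parameters under the new formula.
n_layers, d_model, d_ff = 32, 4096, 14336
num_experts, n_experts_per_tok, linear_count = 8, 2, 3   # SwiGLU MLP -> 3 linear layers per expert

total_params = 46_700_000_000                                   # stand-in for the real parameter count
ffn_params = n_layers * d_ff * linear_count * d_model           # one expert stack across all layers (~5.64B)
shared_params = total_params - num_experts * ffn_params         # attention, embeddings, router, norms
active_params = shared_params + n_experts_per_tok * ffn_params  # what a single token actually touches

print(f"{ffn_params/1e9:.2f}B per-expert FFN, {active_params/1e9:.2f}B active per token")  # ~5.64B, ~12.9B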
src/backend/run_eval_suite.py
CHANGED
@@ -25,8 +25,8 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu
-        result_dict["mbu"] = mbu
+        result_dict["mfu"] = mfu
+        result_dict["mbu"] = mbu
         return result_dict
     return wrapper
 ConfigurableTask.process_results = process_results_decorator(orig_process_results)
src/backend/tasks/arena_hard/__init__.py
ADDED
File without changes
src/backend/tasks/arena_hard/arena_hard.yaml
ADDED
@@ -0,0 +1,2 @@
task: arena_hard
class: !function task.ArenaHard
src/backend/tasks/arena_hard/arena_judgment.py
ADDED
@@ -0,0 +1,256 @@
'''
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
under the Apache 2.0 License from the arena-hard project.
(https://github.com/lm-sys/arena-hard)
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
See the NOTICE file distributed with this work for additional
information regarding copyright ownership.
'''

import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LogisticRegression
import math
from collections import defaultdict
from tqdm import tqdm

from src.backend.tasks.arena_hard.arena_utils import (
    chat_completion_openai,
    load_questions,
    load_model_answers,
    get_endpoint,
    make_config,
)


def get_score(judgment, pattern, pairwise=True):
    matches = pattern.findall(judgment)
    matches = [m for m in matches if m != ""]
    if len(set(matches)) == 0:
        return None, True
    elif len(set(matches)) == 1:
        if pairwise:
            return matches[0].strip("\n"), False
        return int(matches[0])
    else:
        return None, False


# get answer from model
def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
    api_dict = get_endpoint(endpoint_dict["endpoints"])

    # if endpoint_dict["api_type"] == "anthropic":
    #     output = chat_completion_anthropic(model, conv, temperature, max_tokens)
    # elif endpoint_dict["api_type"] == "azure":
    #     output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)

    output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
    return output


def judgment(**args):
    question = args["question"]
    answer = args["answer"]
    reference = args["reference"]
    baseline = args["baseline_answer"]
    configs = args["configs"]
    # output_file = args["output_file"]
    model = configs["judge_model"]

    num_games = 2 if configs["pairwise"] else 1

    # output = {
    #     "question_id": question["question_id"],
    #     "judge": model,
    #     "model": "custom_model",
    #     "games": []
    # }
    output = [question["question_id"]]

    for game in range(num_games):
        conv = [{"role": "system", "content": configs["system_prompt"]}]

        for template in configs["prompt_template"]:
            prompt_args = {}

            prompt_args[f"question_{1}"] = question["content"]
            base = 1

            if baseline:
                if game % 2 == 1:  # swap position
                    temp = baseline
                    baseline = answer
                    answer = temp

                if game == 0:
                    for i, turn in enumerate(baseline["choices"][0]["turns"]):
                        prompt_args[f"answer_{i+1}"] = turn["content"]
                        base += 1

                if game == 1:
                    prompt_args[f"answer_{1}"] = baseline
                    base += 1

            if answer:
                prompt_args[f"answer_{base}"] = answer

            if reference:
                for j, ref_answer in enumerate(reference):
                    for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
                        prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]

            user_prompt = template.format(**prompt_args)
            conv.append({"role": "user", "content": user_prompt})

        judgment = ""
        for _ in range(2):
            new_judgment = get_answer(
                model,
                conv,
                configs["temperature"],
                configs["max_tokens"],
                args["endpoint_dict"],
            )

            judgment += ("\n" + new_judgment)

            score, try_again = get_score(judgment, args["regex_pattern"])

            conv.append({"role": "assistant", "content": new_judgment})

            if not try_again:
                break

            conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
        print("Finish judgment!!!")
        # result = {
        #     "user_prompt": conv[1]["content"],
        #     "judgment": judgment,
        #     "score": score
        # }
        output.append(score)

    return output

def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
    arena_hard_battles = pd.DataFrame()

    print("Turning score list into battles...")

    for scores in tqdm(score_list):
        question_id, score1, score2 = scores

        # Process game 1
        output = {"question_id": question_id,
                  "model_a": "gpt-4-0314",
                  "model_b": f"custom_model"}  # Unique identifier for model
        weight = 1
        if score1 == "A=B":
            output["winner"] = "tie"
        elif score1 == "A>B":
            output["winner"] = "model_a"
        elif score1 == "A>>B":
            output["winner"] = "model_a"
            weight = WEIGHT
        elif score1 == "B>A":
            output["winner"] = "model_b"
        elif score1 == "B>>A":
            output["winner"] = "model_b"
            weight = WEIGHT
        else:
            weight = 0

        if weight:
            arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])

        if not first_game_only:
            # Process game 2
            output = {"question_id": question_id,
                      "model_a": "gpt-4-0314",
                      "model_b": f"custom_model"}  # Unique identifier for model
            weight = 1
            if score2 == "A=B":
                output["winner"] = "tie"
            elif score2 == "A>B":
                output["winner"] = "model_b"
            elif score2 == "A>>B":
                output["winner"] = "model_b"
                weight = WEIGHT
            elif score2 == "B>A":
                output["winner"] = "model_a"
            elif score2 == "B>>A":
                output["winner"] = "model_a"
                weight = WEIGHT
            else:
                weight = 0

            if weight:
                arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])

    arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
    return arena_hard_battles

def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    models = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    LOW_RATING = 100
    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
    X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)

    # one A win => two A win
    Y = np.zeros(n)
    Y[df["winner"] == "model_a"] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) index
    tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
    tie_idx[len(tie_idx)//2:] = False
    Y[tie_idx] = 1.0

    if len(np.unique(Y)) == 1:
        # If there's only one class in the data, assign default ratings
        elo_scores = np.full(p, LOW_RATING)
        elo_scores[models["gpt-4-0314"]] = INIT_RATING
    else:
        lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
        lr.fit(X, Y)

        elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # set anchor as gpt-4-0314 = 1000
    if "gpt-4-0314" in models.index:
        elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)

def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    names = sorted(list(elo_ratings.keys()))
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    for a in names:
        for b in names:
            ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    data = {
        a: [wins[a][b] if a != b else np.NAN for b in names]
        for a in names
    }

    df = pd.DataFrame(data, index=names)
    df.index.name = "model_a"
    df.columns.name = "model_b"
    return df.T

def get_win_rate_column(df, column, baseline="gpt-4-0314"):
    to_dict = df[["model", column]].set_index("model").to_dict()[column]
    win_rate_table = predict_win_rate(to_dict)
    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
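For context on how these helpers fit together: compute_mle_elo fits a Bradley-Terry model over the battle rows and anchors gpt-4-0314 at 1000, and predict_win_rate converts a rating gap back into a win probability. A minimal sketch of that conversion (editor's illustration, not part of the commit; the ratings are invented):

# Editor's sketch: Elo gap -> reported win rate, mirroring predict_win_rate() above.
BASE, SCALE = 10, 400
ratings = {"gpt-4-0314": 1000.0, "custom_model": 1064.0}   # invented values

# Bradley-Terry probability that custom_model beats the gpt-4-0314 anchor
ea = 1 / (1 + BASE ** ((ratings["gpt-4-0314"] - ratings["custom_model"]) / SCALE))
print(round(ea * 100, 2))   # ~59.11 -> this percentage is what get_win_rate_column() reports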
src/backend/tasks/arena_hard/arena_utils.py
ADDED
@@ -0,0 +1,349 @@
'''
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
under the Apache 2.0 License from the arena-hard project.
(https://github.com/lm-sys/arena-hard)
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
See the NOTICE file distributed with this work for additional
information regarding copyright ownership.
'''


import os
import json
import time
import yaml
import random

from typing import Optional
from glob import glob

# API setting constants
API_MAX_RETRY = 16
API_RETRY_SLEEP = 10
API_ERROR_OUTPUT = "$ERROR$"


OPENAI_MODEL_LIST = (
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-0301",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-0613-verbose",
    "gpt-3.5-turbo-1106",
    "gpt-3.5-turbo-0125",
    "gpt-4",
    "gpt-4-0314",
    "gpt-4-0613",
    "gpt-4-turbo",
    "gpt-4-1106-preview",
    "gpt-4-0125-preview",
)


temperature_config = {
    "writing": 0.7,
    "roleplay": 0.7,
    "extraction": 0.0,
    "math": 0.0,
    "coding": 0.0,
    "reasoning": 0.0,
    "stem": 0.1,
    "humanities": 0.1,
}


def load_questions(question_file: str):
    """Load questions from a file."""
    questions = []
    with open(question_file, "r") as ques_file:
        for line in ques_file:
            if line:
                questions.append(json.loads(line))
    return questions


def load_model_answers(answer_dir: str):
    """Load model answers.

    The return value is a python dict of type:
    Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
    """
    filenames = glob(os.path.join(answer_dir, "*.jsonl"))
    filenames.sort()
    model_answers = {}

    for filename in filenames:
        model_name = os.path.basename(filename)[:-6]
        answer = {}
        with open(filename) as fin:
            for line in fin:
                line = json.loads(line)
                answer[line["question_id"]] = line
        model_answers[model_name] = answer

    return model_answers


def get_endpoint(endpoint_list):
    if endpoint_list is None:
        return None
    assert endpoint_list is not None
    # randomly pick one
    api_dict = random.choices(
        endpoint_list
    )[0]
    return api_dict


# load config args from config yaml files
def make_config(config_file: str) -> dict:
    config_kwargs = {}
    with open(config_file, "r") as f:
        config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)

    return config_kwargs


def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
    import openai
    if api_dict:
        client = openai.OpenAI(
            base_url=api_dict["api_base"],
            api_key=api_dict["api_key"],
        )
    else:
        client = openai.OpenAI()

    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            # print(messages)
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens
            )
            output = completion.choices[0].message.content
            break
        except openai.RateLimitError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
        except openai.BadRequestError as e:
            print(messages)
            print(type(e), e)
        except KeyError:
            print(type(e), e)
            break

    return output


# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
#     import openai
#     from openai import AzureOpenAI

#     api_base = api_dict["api_base"]
#     client = AzureOpenAI(
#         azure_endpoint = api_base,
#         api_key= api_dict["api_key"],
#         api_version=api_dict["api_version"],
#         timeout=240,
#         max_retries=2
#     )

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#                 n=1,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 seed=42,
#             )
#             output = response.choices[0].message.content
#             break
#         except openai.RateLimitError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#         except openai.BadRequestError as e:
#             print(type(e), e)
#             break
#         except KeyError:
#             print(type(e), e)
#             break

#     return output


# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
#     import anthropic

#     if api_dict:
#         api_key = api_dict["api_key"]
#     else:
#         api_key = os.environ["ANTHROPIC_API_KEY"]

#     sys_msg = ""
#     if messages[0]["role"] == "system":
#         sys_msg = messages[0]["content"]
#         messages = messages[1:]

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             # print(sys_msg)
#             c = anthropic.Anthropic(api_key=api_key)
#             response = c.messages.create(
#                 model=model,
#                 messages=messages,
#                 stop_sequences=[anthropic.HUMAN_PROMPT],
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 system=sys_msg
#             )
#             output = response.content[0].text
#             break
#         except anthropic.APIError as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)
#     return output


# def chat_completion_mistral(model, messages, temperature, max_tokens):
#     from mistralai.client import MistralClient
#     from mistralai.models.chat_completion import ChatMessage
#     from mistralai.exceptions import MistralException

#     api_key = os.environ["MISTRAL_API_KEY"]
#     client = MistralClient(api_key=api_key)

#     prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             chat_response = client.chat(
#                 model=model,
#                 messages=prompts,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#             )
#             output = chat_response.choices[0].message.content
#             break
#         except MistralException as e:
#             print(type(e), e)
#             break

#     return output


# def chat_completion_gemini(model, messages, temperature, max_tokens):
#     import google.generativeai as genai
#     genai.configure(api_key=os.environ["GEMINI_API_KEY"])

#     safety_settings = [
#         {
#             "category": "HARM_CATEGORY_HARASSMENT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_HATE_SPEECH",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
#             "threshold": "BLOCK_NONE"
#         },
#         {
#             "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
#             "threshold": "BLOCK_NONE"
#         },
#     ]

#     # Set up the model
#     generation_config = {
#         "temperature": temperature,
#         "top_p": 1,
#         "top_k": 1,
#         "max_output_tokens": max_tokens,
#     }

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             gemini = genai.GenerativeModel(
#                 model_name=model,
#                 generation_config=generation_config,
#                 safety_settings=safety_settings)

#             convo = gemini.start_chat(history=[])

#             convo.send_message(messages)
#             output = convo.last.text
#             break
#         except genai.types.generation_types.StopCandidateException as e:
#             print(type(e), e)
#             break
#         except Exception as e:
#             print(type(e), e)
#             time.sleep(API_RETRY_SLEEP)

#     return output


# def chat_completion_cohere(model, messages, temperature, max_tokens):
#     import cohere

#     co = cohere.Client(os.environ["COHERE_API_KEY"])
#     assert len(messages) > 0

#     template_map = {"system":"SYSTEM",
#                     "assistant":"CHATBOT",
#                     "user":"USER"}

#     assert messages[-1]["role"] == "user"
#     prompt = messages[-1]["content"]

#     if len(messages) > 1:
#         history = []
#         for message in messages[:-1]:
#             history.append({"role":template_map[message["role"]], "message":message["content"]})
#     else:
#         history = None

#     output = API_ERROR_OUTPUT
#     for _ in range(API_MAX_RETRY):
#         try:
#             response = co.chat(
#                 message=prompt,
#                 model=model,
#                 temperature=temperature,
#                 max_tokens=max_tokens,
#                 chat_history=history,
#             )
#             output = response.text
#             break
#         except cohere.core.api_error.ApiError as e:
#             print(type(e), e)
#             raise
#         except Exception as e:
#             print(type(e), e)
#             break

#     return output


def reorg_answer_file(answer_file):
    """Sort by question id and de-duplication"""
    answers = {}
    with open(answer_file, "r") as fin:
        for l in fin:
            qid = json.loads(l)["question_id"]
            answers[qid] = l

    qids = sorted(list(answers.keys()))
    with open(answer_file, "w") as fout:
        for qid in qids:
            fout.write(answers[qid])
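A minimal usage sketch for chat_completion_openai above (editor's illustration, not part of the commit). It assumes OPENAI_API_KEY is set in the environment so the default openai.OpenAI() client can be constructed when api_dict is None; the messages are invented.

# Editor's sketch: calling the judge endpoint helper directly.
from src.backend.tasks.arena_hard.arena_utils import chat_completion_openai

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Reply with a single word."},
]
reply = chat_completion_openai("gpt-4-1106-preview", messages, temperature=0, max_tokens=16)
print(reply)   # "$ERROR$" is returned if every retry fails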
src/backend/tasks/arena_hard/configs/api_config.yaml
ADDED
@@ -0,0 +1,17 @@
# gpt-3.5-turbo:
#     model_name: gpt-3.5-turbo
#     endpoints: null
#     api_type: openai
#     parallel: 8

gpt-4-1106-preview:
    model_name: gpt-4-1106-preview
    endpoints: null
    api_type: openai
    parallel: 8

# llama3-7b:
#     model_name: llama3-7b
#     endpoints: null
#     api_type: openai
#     parallel: 8
src/backend/tasks/arena_hard/configs/judge_config.yaml
ADDED
@@ -0,0 +1,26 @@
name: judgment config file for Arena Hard

bench_name: arena-hard-v0.1

# Arena Hard default
judge_model: gpt-4-1106-preview
# judge_model: gpt-3.5-turbo
reference: False # Optional
ref_model: null

baseline: True
baseline_model: gpt-4-0314

pairwise: True
temperature: 0
max_tokens: 4096

regex_pattern: \[\[([AB<>=]+)\]\]

system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."

prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]

# Add your model below for evaluation
# model_list:
#   - gpt-3.5-turbo-0125
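The regex_pattern above is what get_score() in arena_judgment.py applies to the judge's reply to pull out the final verdict. A small sketch of that extraction (editor's illustration, not part of the commit; the judge reply is invented):

# Editor's sketch: verdict extraction with the configured pattern.
import re

pattern = re.compile(r"\[\[([AB<>=]+)\]\]")
judge_reply = "Assistant B's answer is more complete and concise. My final verdict is: [[B>A]]"
matches = [m for m in pattern.findall(judge_reply) if m != ""]
print(matches[0])   # "B>A" -> in game 1 this counts as a win for the evaluated model (model B)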
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl
ADDED
The diff for this file is too large to render.

src/backend/tasks/arena_hard/question.jsonl
ADDED
The diff for this file is too large to render.
src/backend/tasks/arena_hard/task.py
ADDED
@@ -0,0 +1,220 @@
import os
from typing import Union, List

from lm_eval.api.task import ConfigurableTask
from lm_eval.api.instance import Instance

# from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean

from src.backend.envs import DEVICE

import pandas as pd

from src.backend.tasks.measurement_task_utils import measure_system_metrics
import json

from typing import (
    Any,
    Dict,
    List,
    Optional,
    Union,
)

from datasets import Dataset
import re

from src.backend.tasks.arena_hard.arena_utils import (
    load_questions,
    load_questions,
    load_model_answers,
    make_config,
)

from src.backend.tasks.arena_hard.arena_judgment import (
    judgment,
    get_battles_from_scores,
    compute_mle_elo,
    predict_win_rate,
    get_win_rate_column
)

def load_questions(question_file: str):
    """Load questions from a file."""
    questions = []
    with open(question_file, "r") as ques_file:
        for line in ques_file:
            if line:
                questions.append(json.loads(line))
    return questions

def download_wrapper(func):
    def download(self, *args, **kwargs):
        print("Using Arena Hard, No need to download")
    return download

original_download = ConfigurableTask.download
ConfigurableTask.download = download_wrapper(original_download)
# @register_task("selfcheckgpt")
@measure_system_metrics
class ArenaHard(ConfigurableTask):
    VERSION = 0.0
    OUTPUT_TYPE = "generate_until"
    data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
    judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
    configs = make_config(judge_config_path)
    model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
    model_answers = load_model_answers(model_ans_dir)
    data = load_questions(data_path)

    def __init__(self):
        super().__init__(config={"metadata": {"version": self.VERSION}})
        # these end tokens are hard coded because of the current limitaion of the llm-eval.
        # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
        self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_length": 4096}
        # self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
        # self.generation_kwargs_sampling = {
        #     "temperature": 0.99,
        #     "do_sample": True,
        #     "until": ["<im_end>", "<im_end>"],
        #     "max_length": 1024,
        # }

    def transform_data(self, data):
        transformed_data = []
        for i in range(len(data)):
            if self.configs["baseline"]:
                baseline_answer = self.model_answers[self.configs["baseline_model"]][data[i]["question_id"]]
            else:
                baseline_answer = None
            transformed_item = {
                "question_id": data[i]["question_id"],
                "content": data[i]["turns"][0]["content"],  # Assuming you want the first turn's content
                "model_answer": baseline_answer
            }
            transformed_data.append(transformed_item)
        return transformed_data

    def has_training_docs(self):
        return False

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def validation_docs(self):
        self.dataset = self.transform_data(self.data)
        self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
                                          "content": [item["content"] for item in self.dataset],
                                          "model_answer": [item["model_answer"] for item in self.dataset]})
        return self.dataset

    def doc_to_text(self, doc):
        sentence = doc["content"]
        doc_text = f"{sentence}\n"
        return doc_text

    def doc_to_target(self, doc):
        q_id = doc["question_id"]
        return q_id

    def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
        arguments = (ctx, self.generation_kwargs)
        request_list = [
            Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
        ]
        # sampling_arguments = (ctx, self.generation_kwargs_sampling)
        # request_list.extend(
        #     [
        #         Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
        #         for idx in range(1, self.generation_kwargs_sampling_number + 1)
        #     ]
        # )
        return request_list

    def process_results(self, doc, results):
        response_temperature_0 = results[0]
        # other_responses = results[1:]
        api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
        endpoint_list = make_config(api_config_path)

        if self.configs["regex_pattern"]:
            pattern = re.compile(self.configs["regex_pattern"])

        ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")

        ref_answers = None
        if self.configs["reference"]:
            ref_answers = load_model_answers(ref_answer_dir)
            ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]

        # output_files = {}
        # models = ["custom_model"]
        # output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
        # for model in models:
        #     output_files[model] = os.path.join(
        #         output_dir,
        #         f"{model}.jsonl",
        #     )

        # for output_file in output_files.values():
        #     os.makedirs(os.path.dirname(output_file), exist_ok=True)

        endpoint_info = endpoint_list[self.configs["judge_model"]]

        question = doc
        kwargs = {}
        kwargs["question"] = question
        kwargs["answer"] = response_temperature_0
        if ref_answers:
            kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
            assert len(kwargs["reference"]) == len(self.configs["ref_model"])
        else:
            kwargs["reference"] = None

        if self.configs["baseline"]:
            kwargs["baseline_answer"] = doc["model_answer"]
        else:
            kwargs["baseline_answer"] = None
        kwargs["configs"] = self.configs
        kwargs["endpoint_dict"] = endpoint_info
        # kwargs["output_file"] = output_files["custom_model"]
        kwargs["regex_pattern"] = pattern

        scores = judgment(**kwargs)
        return {"score": scores}

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        ##TODO implement the aggregation function to calculate elo for score
        def get_win_rate(score_list):
            battles = get_battles_from_scores(score_list)
            bootstrap_online_elo = compute_mle_elo(battles)
            stats = pd.DataFrame()
            stats["results"] = None
            stats["results"] = stats['results'].astype('object')
            for i, model in enumerate(bootstrap_online_elo.index):
                stats.at[i, "model"] = model
                stats.at[i, "score"] = bootstrap_online_elo[model]

            stats.sort_values(by="model", inplace=True)
            stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()

            return stats["score"][1]

        return {k: get_win_rate for k in ["score"]}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {k: True for k in ["score"]}
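To make the aggregation path concrete: each process_results() call returns {"score": [question_id, game1_verdict, game2_verdict]}, and aggregation() turns the collected triples into a win rate against gpt-4-0314. A small sketch with invented verdicts (editor's illustration, not part of the commit):

# Editor's sketch: from judged verdict triples to the reported win rate.
import pandas as pd
from src.backend.tasks.arena_hard.arena_judgment import (
    get_battles_from_scores, compute_mle_elo, get_win_rate_column
)

score_list = [
    ["q_001", "B>A", "A>B"],   # evaluated model preferred in both orderings
    ["q_002", "A=B", "A=B"],   # two ties
]
battles = get_battles_from_scores(score_list)   # also writes ./arena_hard_battles.jsonl as a side effect
elo = compute_mle_elo(battles)                  # Bradley-Terry ratings, gpt-4-0314 anchored at 1000
stats = pd.DataFrame({"model": elo.index, "score": elo.values})
print(get_win_rate_column(stats, "score", "gpt-4-0314"))   # win rate (%) of each model vs. the anchor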
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": ["
+        self.generation_kwargs = {"until": ["<|im_end|>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
-            "until": ["
+            "until": ["<|im_end|>", "</s>"],
             "max_length": 1024,
         }
 
src/display/utils.py
CHANGED
@@ -79,10 +79,11 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
 
     # # XXX include me back at some point
-    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
+    # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
     gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
     # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
+    arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score
 
 
 # These classes are for user facing column names,
@@ -115,9 +116,9 @@ for task in Tasks:
     auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
     # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
     auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
+    # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
     # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
@@ -187,6 +188,7 @@ class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
     MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
+    VLLM = ModelDetails("vllm_moe")
     Unknown = ModelDetails("?")
 
     def to_str(self):
@@ -198,12 +200,13 @@ class InferenceFramework(Enum):
             return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
+        if inference_framework in ["vllm_moe"]:
+            return InferenceFramework.VLLM
         return InferenceFramework.Unknown
 
 class GPUType(Enum):
-
+    A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
     A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
-    A5000 = ModelDetails("NVIDIA-RTX-A5000-24GB")
     Unknown = ModelDetails("?")
 
     def to_str(self):
@@ -211,12 +214,10 @@ class GPUType(Enum):
 
     @staticmethod
     def from_str(gpu_type: str):
-        if gpu_type in ["NVIDIA-H100-PCIe-80GB"]:
-            return GPUType.A100_pcie
         if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
-            return GPUType.
-        if gpu_type in ["NVIDIA-
-            return GPUType.
+            return GPUType.A100_pcie
+        if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
+            return GPUType.A100_sxm
         return GPUType.Unknown
 
 class WeightType(Enum):
src/leaderboard/read_evals.py
CHANGED
@@ -116,7 +116,7 @@ class EvalResult:
             multiplier = 1.0
             if "time" in metric:
                 multiplier = 1.0
-            if "throughput" in metric
+            if "throughput" in metric:
                 multiplier = 1.0
             if "batch_" in metric or "Mem" in metric or "Util" in metric:
                 multiplier = 1
@@ -124,7 +124,10 @@ class EvalResult:
 
             # print('RESULTS', data['results'])
             # print('XXX', benchmark, metric, value, multiplier)
-
+            if value == "N/A":
+                results[benchmark][metric] = None
+            else:
+                results[benchmark][metric] = value * multiplier
 
         res = EvalResult(
             eval_name=result_key,