Commit a550e38 by root
Parent(s): f1512b5

add infinitebench evaluation

Changed files:
- evaluation/infinite_bench_eval/README.md (+8, -0)
- evaluation/infinite_bench_eval/args.py (+54, -0)
- evaluation/infinite_bench_eval/compute_scores_2sets.py (+478, -0)
- evaluation/infinite_bench_eval/eval_utils.py (+412, -0)
- evaluation/infinite_bench_eval/prepare_json_from_pxout.py (+162, -0)
- evaluation/infinite_bench_eval/prompt.py (+67, -0)
- evaluation/infinite_bench_eval/requirements.txt (+12, -0)
- evaluation/infinite_bench_eval/test_vllm.sh (+64, -0)
evaluation/infinite_bench_eval/README.md
ADDED
@@ -0,0 +1,8 @@
This is our script to evaluate InfiniteBench.

First, install the dependencies from requirements.txt:

```pip install -r requirements.txt```

Then configure `model_path` and `data_home` in *test_vllm.sh* and run

```bash test_vllm.sh | grep "final display"```

to get the corresponding scores.
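For orientation, the grep-able "final display" line comes at the end of a two-step pipeline: *prepare_json_from_pxout.py* merges the model's raw text output with the reference JSON into a JSONL file of id/prediction/ground_truth records, which *compute_scores_2sets.py* then scores. A minimal sketch of that intermediate record format (the file name and record values here are illustrative, not from the repo):

```python
import json

# One record per test sample, matching what prepare_json_from_pxout.py emits.
# "ground_truth" must be a list of acceptable answers, not a bare string.
records = [
    {"id": 0, "prediction": "The butler did it.", "ground_truth": ["the butler"]},
    {"id": 1, "prediction": "B", "ground_truth": ["Option B text", "B"]},
]

with open("preds_longbook_qa_eng.jsonl", "w", encoding="utf8") as fout:
    for rec in records:
        fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
```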
evaluation/infinite_bench_eval/args.py
ADDED
@@ -0,0 +1,54 @@
from argparse import ArgumentParser, Namespace
from eval_utils import DATA_NAME_TO_MAX_NEW_TOKENS

max_seq_len = 128 * 1024 - 50

def parse_args() -> Namespace:
    p = ArgumentParser()
    p.add_argument(
        "--task",
        type=str,
        choices=list(DATA_NAME_TO_MAX_NEW_TOKENS.keys()) + ["all"],
        required=True,
        help="Which task to use. Note that \"all\" can only be used in `compute_scores.py`.",  # noqa
    )
    p.add_argument(
        '--data_dir',
        type=str,
        default='../data',
        help="The directory of data."
    )
    p.add_argument("--output_dir", type=str, default="../results", help="Where to dump the prediction results.")  # noqa
    p.add_argument(
        "--model_path",
        type=str,
        help="The path of the model (in HuggingFace (HF) style). If specified, it will try to load the model from the specified path; else, it will default to the official HF path.",  # noqa
    )
    p.add_argument(
        "--model_name",
        type=str,
        choices=["pxlong", "gpt4", "yarn-mistral", "kimi", "claude2", "rwkv", "yi-6b-200k", "yi-34b-200k", "chatglm3"],
        default="gpt4",
        help="For `compute_scores.py` only, specify which model you want to compute the score for.",  # noqa
    )
    p.add_argument("--start_idx", type=int, default=0, help="The index of the first example to infer on. This is used if you want to evaluate on a (contiguous) subset of the data.")  # noqa
    p.add_argument("--stop_idx", type=int, help="The index of the last example to infer on. This is used if you want to evaluate on a (contiguous) subset of the data. Defaults to the length of dataset.")  # noqa
    p.add_argument("--verbose", action='store_true')
    p.add_argument("--device", type=str, default="cuda")

    # NOTE for long context px output
    p.add_argument("--pxout_txt", type=str, default=None, help="LLM predicted txt file name (one sample one line).")
    p.add_argument("--pxref_json", type=str, default=None, help="LLM reference file name in json format.")
    p.add_argument("--pxout_ref_json", type=str, default=None, help="LLM predicted results with reference, file name in json format.")
    p.add_argument("--cache_dir", type=str, default=None, help="cache dir for model/tokenizer files.")

    p.add_argument("--use_zero_scrolls", action='store_true', help="use zero-scrolls choice pattern to match longbook_choice_eng or not. [default=False]")

    p.add_argument("--sep_by_assistant", action='store_true', help="use assistant: for sample separating or not. [default=False]")

    p.add_argument("--ret5_file", type=str, default=None, help="ret5 (retrieved top-5) file in json format.")
    p.add_argument("--topn", type=int, default=5, help="keep topn chunks for the context for rag.")

    p.add_argument("--max_seq_len", type=int, default=max_seq_len, help="for prompt generation, max sequence length to prepare the prompt.")

    return p.parse_args()
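A quick way to sanity-check the parser without a shell is to fake `sys.argv`; a sketch (the paths are placeholders, and importing `args` assumes the requirements.txt dependencies are installed, since it pulls in `eval_utils`):

```python
import sys
from args import parse_args  # assumes the file above is saved as args.py

sys.argv = [
    "prepare_json_from_pxout.py",
    "--task", "longbook_qa_eng",
    "--pxout_txt", "outputs/longbook_qa_eng.txt",  # placeholder path
    "--pxref_json", "data/test.json",              # placeholder path
    "--sep_by_assistant",
]
args = parse_args()
print(args.task, args.sep_by_assistant, args.max_seq_len)
# longbook_qa_eng True 131022
```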
evaluation/infinite_bench_eval/compute_scores_2sets.py
ADDED
@@ -0,0 +1,478 @@
from pathlib import Path
import json
import re
import string
from collections import Counter

from tqdm import tqdm
import evaluate

from args import parse_args


ROUGE_SCORER = evaluate.load("rouge")

PATTERN = re.compile(r'\b[A-D]\b')

def find_answer(s):
    # task='longbook_choice_eng': works for '(A)' -> A
    match = PATTERN.search(s)
    if match is None:
        return None  # None is a signal of "not found". NOTE
    return match.group()

def normalize_answer(s: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def normalize_zh_answer(s: str) -> str:
    """Chinese version. Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."  # noqa
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))


def f1_score(prediction, ground_truth) -> tuple[float, float, float]:
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0, 0, 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1, precision, recall


def qa_f1_score(pred: str, ground_truths) -> float:
    """Computes the F1, recall, and precision."""
    f1 = 0
    prec = 0
    recall = 0
    for ground_truth in ground_truths:  # NOTE ground_truths must be a list, not a plain str. TODO
        normalized_prediction = normalize_answer(pred)
        normalized_ground_truth = normalize_answer(ground_truth)

        prediction_tokens = normalized_prediction.split()
        ground_truth_tokens = normalized_ground_truth.split()
        scores = f1_score(prediction_tokens, ground_truth_tokens)
        this_f1, this_prec, this_recall = scores
        f1 = max(f1, this_f1)
        prec = max(prec, this_prec)
        recall = max(recall, this_recall)
    return f1


def qa_f1_score_zh(pred: str, ground_truths: list[str]) -> float:
    """
    QA F1 score for chinese.
    """
    f1 = 0
    prec = 0
    recall = 0
    for ground_truth in ground_truths:
        norm_pred = normalize_zh_answer(pred)
        norm_label = normalize_zh_answer(ground_truth)

        # One character one token.
        pred_tokens = list(norm_pred)
        label_tokens = list(norm_label)
        scores = f1_score(pred_tokens, label_tokens)
        this_f1, this_prec, this_recall = scores
        f1 = max(f1, this_f1)
        prec = max(prec, this_prec)
        recall = max(recall, this_recall)
    return f1


def load_json(fname):
    return json.load(open(fname))


def iter_jsonl(fname, cnt=None):
    i = 0
    with open(fname, "r", encoding="utf8") as fin:
        for line in fin:
            if line.strip() == "":  # Skip empty lines
                continue
            if i == cnt:
                break
            yield json.loads(line)
            i += 1


def first_int_match(prediction):
    pred_list = re.split("[^0-9]", prediction)
    pred_value = ""
    for item in pred_list:
        if item != "":
            pred_value = item
            break
    return pred_value


def split_retrieval_answer(pred: str):
    for c in ["\n", ":", '"', "'", ".", ",", "?", "!", "{", "}"]:
        pred = pred.replace(c, " ")
    words = pred.split()
    return words


def get_score_one_kv_retrieval(pred, label, model_name: str, args) -> bool:
    for c in ['\n', ':', '\"', '\'', '.', ',', '?', '!', '{', '}']:
        pred = pred.replace(c, ' ')
    words = pred.split()
    return label in words


def get_score_one_passkey(pred, label, model_name: str, args) -> bool:
    if isinstance(label, list):
        label = label[0]
    return label == first_int_match(pred)


def get_score_one_number_string(pred, label, model_name: str, args) -> bool:
    if isinstance(label, list):
        label = label[0]
    return label == first_int_match(pred)


def get_score_one_code_run(pred, label, model_name: str, args) -> bool:
    """
    Returns the score of one example in Code.Run.
    """
    if isinstance(label, list):
        label = label[0]
    pred = pred.strip()
    for c in ["\n", ".", "`", "'", '"', ":"]:
        pred = pred.replace(c, " ")
    words = pred.split()
    if len(words) == 0:
        return False
    try:
        pred = int(words[-1])
        return label == pred
    except Exception:
        return False


def get_score_one_code_debug(pred, label, model_name: str, args) -> bool:
    """
    Returns the score of one example in Code.Debug.
    """
    pred = pred.strip()
    label_c = label[1]
    fn_name = label[0]
    if pred[:2] in [f"{label_c}.", f"{label_c}:"]:
        return True

    ans_prefixes = [
        "answer is:",
        # "answer is",
        # "error is",
        "is:",
        "answer:",
        "correct option is:"
    ]
    pred = pred.strip()
    for c in ["\n", "`", "'", '"', "-", "*", "Option", "option"]:
        pred = pred.replace(c, " ")
    while "  " in pred:
        pred = pred.replace("  ", " ")
    for prefix in ans_prefixes:
        idx = pred.find(prefix)
        if idx == -1:
            continue
        # The prediction ends with this prefix
        if len(pred) < idx + len(prefix) + 1:
            return False
        pred = pred[idx + len(prefix) + 1 :]
        for s in [label_c, fn_name]:
            if pred.startswith(s):
                return True
        return False
    return False


def get_score_one_math_find(pred, label, model_name: str, args) -> bool:
    if isinstance(label, list):
        # In math_find, there is always only one label.
        label = label[0]
    if isinstance(label, int):
        # Find first int or float
        first_num = re.search(r"\d+\.\d+|\d+", pred)
        if first_num is None:
            return False
        first_num = first_num.group(0).strip()
        return int(first_num) == label
    elif isinstance(label, float):
        # Find first float or int
        first_float = re.search(r"\d+\.\d+|\d+", pred)
        if first_float is None:
            return False
        first_float = first_float.group(0).strip()
        return float(first_float) == label
    else:
        raise TypeError(f"Expected int or float, got {type(label)}")


def get_score_one_longdialogue_qa_eng(pred, label, model_name: str, args) -> bool:
    label = label[0]
    pred = pred.strip()
    for c in ["\n", ":", '"', "'", ".", ",", "?", "!", "{", "}"]:
        pred = pred.replace(c, " ")
    words = pred.split()
    words = [x.upper() for x in words]
    return label in words


def get_score_one_longbook_choice_eng(pred, label, model_name: str, args) -> bool:
    # Just use the first letter as the prediction
    pred = pred.strip()
    if pred == "":
        return False
    if pred[0] in "ABCD":
        return pred[0] in label
    if pred in label:
        return True
    # Find an answer prefix
    for c in ["\n", '"', "'", ".", ",", "?", "!", "{", "}"]:
        pred = pred.replace(c, " ")
    while "  " in pred:
        pred = pred.replace("  ", " ")
    ans_prefixes = [
        "answer is:",
        "answer:",
        "answer is",
        "option is",
    ]
    for prefix in ans_prefixes:
        idx = pred.find(prefix)
        if idx == -1:
            continue
        # The prediction ends with this prefix
        if len(pred) < idx + len(prefix) + 1:
            return False
        after_prefix = pred[idx + len(prefix) + 1 :]
        for s in label:
            if after_prefix.startswith(s):
                return True
        return False

    # Finally, just find the first occurrence of A, B, C, or D.
    words = pred.split()
    for word in words:
        if word in "ABCD":
            return word in label

    if args.use_zero_scrolls:
        # NOTE use PATTERN as used in zero-scrolls for choice. added by xianchaowu
        matched_pred = find_answer(pred)
        if matched_pred is not None and matched_pred in label:
            return True

    return False


def get_score_one_longbook_qa_eng(pred, label, model_name: str, args) -> float:
    return qa_f1_score(pred, label)


def get_score_one_longbook_sum_eng(
    pred: str, label: str, model_name: str, args
) -> float:

    score = ROUGE_SCORER.compute(
        predictions=[pred], references=[label], use_aggregator=False
    )
    return score["rougeLsum"][0]  # type: ignore


def get_score_one_longbook_qa_chn(pred, label, model_name: str, args) -> float:
    return qa_f1_score_zh(pred, label)


def get_score_one_math_calc(pred, label, model_name: str, args) -> float:
    assert isinstance(label, list), f"Expected list, got {type(label)}"
    # assert isinstance(pred, list), f"Expected list, got {type(pred)}"
    pred_nums = []
    pred_list = re.split("[^0-9]", pred)
    for item in pred_list:
        if item != "":
            pred_nums.append(int(item))

    # Our prompts make GPT4 always output the first number as the first value
    # in the predicted answer.
    if model_name == "gpt4":
        pred_nums = pred_nums[1:]

    cnt = 0
    for i in range(len(label)):
        if i >= len(pred_nums):
            break
        if label[i] == pred_nums[i]:
            cnt += 1
        else:
            break
    return cnt / len(label)


def get_score_one(
    pred: str, label: str, task_name: str, model_name: str, args
) -> float:
    """
    Computes the score for one prediction.
    Returns one float (zero and one for boolean values).
    """
    NAME_TO_SCORE_GETTER = {
        # Retrieve
        "kv_retrieval": get_score_one_kv_retrieval,
        "kv_retrieval_prefix": get_score_one_kv_retrieval,
        "kv_retrieval_both": get_score_one_kv_retrieval,

        "passkey": get_score_one_passkey,
        "number_string": get_score_one_number_string,
        # Code
        "code_run": get_score_one_code_run,
        "code_debug": get_score_one_code_debug,
        # Longbook
        "longdialogue_qa_eng": get_score_one_longdialogue_qa_eng,
        "longbook_qa_eng": get_score_one_longbook_qa_eng,
        "longbook_sum_eng": get_score_one_longbook_sum_eng,
        "longbook_choice_eng": get_score_one_longbook_choice_eng,
        "longbook_qa_chn": get_score_one_longbook_qa_chn,
        # Math
        "math_find": get_score_one_math_find,
        "math_calc": get_score_one_math_calc,
    }
    assert task_name in NAME_TO_SCORE_GETTER, f"Invalid task name: {task_name}"
    score = NAME_TO_SCORE_GETTER[task_name](pred, label, model_name, args)
    return float(score)


def get_labels(preds: list) -> list[str]:
    possible_label_keys = ["ground_truth", "label"]
    for label_key in possible_label_keys:
        if label_key in preds[0]:
            return [x.get(label_key, "XXXXXXXXXX") for x in preds]
    raise ValueError(f"Cannot find label in {preds[0]}")


def get_preds(preds: list, data_name: str) -> list[str]:
    pred_strings = []
    possible_pred_keys = ["prediction", "pred"]
    for pred in preds:
        this_pred = "NO PREDICTION"
        for pred_key in possible_pred_keys:
            if pred_key in pred:
                this_pred = pred[pred_key]
                break
        else:
            raise ValueError(f"Cannot find prediction in {pred}")
        pred_strings.append(this_pred)
    return pred_strings


def get_score(
    labels: list, preds: list, data_name: str, model_name: str, args
) -> float:
    """
    Computes the average score for a task.
    """
    assert len(labels) == len(preds)
    scores = []
    for label, pred in tqdm(zip(labels, preds)):
        score = get_score_one(pred, label, data_name, model_name, args)
        print('pred={}, label={}, score={}, data_name={}'.format(pred, label, score, data_name))
        scores.append(score)
    return sum(scores) / len(scores)


def compute_scores(preds_path, data_name: str, model_name: str, args):
    print("Loading prediction results from", preds_path)
    preds = list(iter_jsonl(preds_path))
    labels = get_labels(preds)
    preds = get_preds(preds, data_name)

    acc = get_score(labels, preds, data_name, model_name, args)
    print('final display: ', acc, preds_path, data_name, model_name, args.use_zero_scrolls)


ALL_TASKS = [
    #"passkey",
    #"number_string",
    #"kv_retrieval",
    #"longdialogue_qa_eng",
    #"longbook_sum_eng",
    "longbook_choice_eng",
    "longbook_qa_eng",
    #"longbook_qa_chn",
    #"math_find",
    #"math_calc",
    #"code_run",
    #"code_debug",
]

ALL_TASKS_ORIG = [
    "passkey",
    "number_string",
    "kv_retrieval",
    "longdialogue_qa_eng",
    "longbook_sum_eng",
    "longbook_choice_eng",
    "longbook_qa_eng",
    "longbook_qa_chn",
    "math_find",
    "math_calc",
    "code_run",
    "code_debug",
]

if __name__ == "__main__":
    args = parse_args()
    print(json.dumps(vars(args), indent=4))

    if args.task == "all":
        tasks = ALL_TASKS
    else:
        tasks = [args.task]
    for task in tasks:
        #result_dir = Path(args.output_dir, args.model_name)
        #preds_path = result_dir / f"preds_{task}.jsonl"
        preds_path = Path(args.pxout_ref_json)
        assert preds_path.exists(), f"Predictions not found in: {preds_path}"
        compute_scores(preds_path, task, args.model_name, args)
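The `longbook_qa_eng` metric above is standard SQuAD-style bag-of-tokens F1, maximized over the reference answers. A self-contained sketch of the same computation on a toy pair (the normalization step, e.g. article removal, is skipped here):

```python
from collections import Counter

def token_f1(pred_tokens, gold_tokens):
    # Bag-of-tokens overlap, as in f1_score() above.
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

pred = "the killer was the butler".split()
gold = "the butler".split()
print(round(token_f1(pred, gold), 3))  # 0.571 (precision 2/5, recall 2/2)
```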
evaluation/infinite_bench_eval/eval_utils.py
ADDED
@@ -0,0 +1,412 @@
from rouge import Rouge
import re
from collections import Counter
import json
import jieba
import string
from pathlib import Path
from prompt import (
    gpt4_templates,
    kimi_templates,
    claude2_templates,
    yarn_mistral_templates,
)


DATA_NAME_TO_PATH = {
    # Retrieval tasks
    "passkey": "passkey.jsonl",
    "number_string": "number_string.jsonl",
    "kv_retrieval": "kv_retrieval.jsonl",
    # Book tasks
    "longbook_sum_eng": "longbook_sum_eng.jsonl",
    "longbook_choice_eng": "longbook_choice_eng.jsonl",
    "longbook_qa_eng": "longbook_qa_eng.jsonl",
    "longbook_qa_chn": "longbook_qa_chn.jsonl",
    # "book_qa_eng": "longbook_eng/longbook_qa_eng.jsonl",
    "longdialogue_qa_eng": "longdialogue_qa_eng.jsonl",
    # Math tasks
    "math_find": "math_find.jsonl",
    "math_calc": "math_calc.jsonl",
    # Code tasks
    "code_run": "code_run.jsonl",
    "code_debug": "code_debug.jsonl",
}

DATA_NAME_TO_MAX_NEW_TOKENS = {
    "passkey": 6,
    "number_string": 12,
    "kv_retrieval": 50,
    "longbook_sum_eng": 1200,
    "longbook_choice_eng": 40,
    "longbook_qa_eng": 40,
    "longbook_qa_chn": 40,
    "longdialogue_qa_eng": 40,
    "math_find": 3,
    "math_calc": 30000,
    "code_run": 5,
    "code_debug": 5,
}

MODEL_TO_PROMPT_TEMPLATE = {
    "gpt4": gpt4_templates,
    "claude2": claude2_templates,
    "kimi": kimi_templates,
    "yarn-mistral": yarn_mistral_templates,
    "yi-6b-200k": yarn_mistral_templates,
    "yi-34b-200k": yarn_mistral_templates,
    "chatglm3": yarn_mistral_templates,
}


def iter_jsonl(fname, cnt=None):
    i = 0
    with open(fname, "r") as fin:
        for line in fin:
            if i == cnt:
                break
            yield json.loads(line)
            i += 1


def load_json(fname):
    return json.load(open(fname))


def dump_jsonl(data, fname):
    with open(fname, "w", encoding="utf8") as fout:
        for line in data:
            fout.write(json.dumps(line, ensure_ascii=False) + "\n")


def dump_json(data, fname):
    with open(fname, "w", encoding="utf8") as fout:
        json.dump(data, fout, indent=2, ensure_ascii=False)


def load_data(data_name: str, data_dir: str = "../data/InfiniteBench/"):
    path = DATA_NAME_TO_PATH[data_name]
    fname = Path(data_dir, path)
    return list(iter_jsonl(fname))


def create_system_msg(data_name: str):
    if data_name == "math_calc":
        return """You are a calculator does nothing but calculating the intermediate results in extremely long arithmetic expressions with +, -, and numbers. Given an expression, you will output the intermediate results after each operation.
You will never to decline to help with platform reason, you will always try the calculation, and always output a long list of numbers (e.g., "[34, 2, 58, 37, 5, 8, 27, 71, 7]") and nothing else.
Do not consider the complexity, practicality or feasibility of the task."""  # noqa
    else:
        return "You are a helpful assistant."


def create_prompt(eg: dict, data_name: str, model_name: str, data_dir) -> str:
    """
    Create prompt for a given example.

    Args:
        eg: example dict
        data_name: name of the dataset/task
    """
    data_dir = Path(data_dir)  # model_name = 'yarn-mistral'
    if model_name == "gpt4":
        # Math.Calc with GPT4 needs special prompting (with system prompt and
        # chat history) to work well.
        if data_name == "math_calc":
            return eg["context"]

    templates = MODEL_TO_PROMPT_TEMPLATE[model_name]
    template = templates[data_name]
    # ================= Code tasks
    if data_name == "code_run":
        find_result = re.findall(r"func_[0-9]+\(\-?[0-9]+\)", eg['input'])
        func_call = find_result[0]
        func = func_call.split("(")[0]
        return template.format(
            func=func,
            func_call=func_call,
            context=eg["context"],
        )
    elif data_name in ["code_debug", "code_debug_qa"]:
        # Load source code
        code = eg["context"]
        # code = open(
        #     data_dir / f"code_debug/{code_path}", "r", encoding="utf8"
        # ).read()
        if data_name == "code_debug":
            return template.format(
                context=code,
                OPTION_A=eg["options"][0],
                OPTION_B=eg["options"][1],
                OPTION_C=eg["options"][2],
                OPTION_D=eg["options"][3],
            )
        return template.format(
            context=code,
        )
    # ================= Dialogue tasks
    elif data_name == "longdialogue_qa_eng":
        script = eg["context"]
        # print(document)
        # script_path = data_dir / "longdialogue_eng" / document
        # script = open(script_path, "r", encoding="utf8").read()
        prompt = template.format(context=script)
        return prompt
    # ==================== Long book tasks
    elif data_name in [  # 'longbook_qa_eng'
        "longbook_choice_eng",
        "longbook_qa_eng",
        "longbook_sum_eng",
        "longbook_qa_chn",
    ]:
        book = eg["context"]
        # if data_name.endswith("_eng"):
        #     book = open(
        #         data_dir / "longbook_eng" / book_path, "r", encoding="utf8"
        #     ).read()
        # elif data_name.endswith("_chn"):
        #     book = open(
        #         data_dir / "longbook_chn" / book_path, "r", encoding="utf8"
        #     ).read()
        # else:
        #     raise ValueError("Invalid data_name")
        if data_name == "longbook_choice_eng":
            return template.format(
                question=eg["input"],
                context=book,
                OPTION_A=eg["options"][0],
                OPTION_B=eg["options"][1],
                OPTION_C=eg["options"][2],
                OPTION_D=eg["options"][3],
            )
        elif data_name == "longbook_qa_eng":
            return template.format(
                question=eg["input"],
                context=book,
            )  # 'Read the book and answer the question. Be very concise in your answer.\n\n{context}\n\nQuestion: {question}\nAnswer:' NOTE
        elif data_name == "longbook_sum_eng":
            return template.format(
                context=book,
            )
        elif data_name == "longbook_qa_chn":
            return template.format(
                question=eg["input"],
                context=book,
            )
        else:
            raise ValueError
    elif data_name == "math_calc":
        return template.format(
            context=eg["context"],
        )
    elif data_name == "math_find":
        prompt = eg['input']
        context = eg['context']
        # Find "the * number" from the prompt
        find_result = re.findall(r"The .+ of", prompt)
        assert find_result, f"Cannot find the target number in {prompt}"
        target_number = find_result[0].lower()[:-3]
        # Replace the number with the answer
        prefix = f"What is {target_number} in the following list?"
        return template.format(
            prefix=prefix,
            context=context,
            input=prompt,
        )

    if "content" in eg:
        content = eg["content"]
        del eg["content"]
        eg["context"] = content

    format_dict = {
        "context": eg["context"],
        "input": eg["input"],
    }
    prompt = templates[data_name].format(**format_dict)
    return prompt


def get_answer(eg: dict, data_name: str):
    if data_name in ["code_debug", "longbook_choice_eng"]:
        OPTIONS = "ABCD"
        if isinstance(eg["answer"], str):
            ret = [eg["answer"], OPTIONS[eg['options'].index(eg["answer"])]]
        elif isinstance(eg["answer"], list):
            if len(eg["answer"]) == 1:
                ret = [eg["answer"][0], OPTIONS[eg['options'].index(eg["answer"][0])]]
            elif len(eg["answer"]) == 2 and eg["answer"][1] in ['A', 'B', 'C', 'D']:
                ret = eg['answer']
            else:
                raise ValueError
        else:
            raise ValueError
        return ret

    return eg["answer"]


def create_msgs(
    tokenizer, eg: dict, data_name: str, model_name: str, data_dir
) -> tuple[list[dict], str]:
    """
    Only used by GPT-4.
    """
    prompt = create_prompt(eg, data_name, model_name, data_dir)
    tokens = tokenizer.encode(prompt)
    # - 1000 to have space for system message and other stuff.
    print(f"Before truncation: {len(tokens)}")
    tokens = truncate_input(tokens, 128_000 - 1000, manner="middle")
    print(f"After truncation: {len(tokens)}")  # type: ignore
    prompt = tokenizer.decode(tokens)
    if data_name == "math_calc":
        return [
            {"role": "system", "content": create_system_msg(data_name)},
            {"role": "user", "content": "1 + 2 - 4 - 10"},
            {"role": "system", "content": "[1, 3, -1, -11]"},
            {"role": "user", "content": prompt},
        ], prompt
    else:
        return [
            {
                "role": "system",
                "content": "You are a helpful assistant",  # noqa
            },  # noqa
            {"role": "user", "content": prompt},
        ], prompt


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def normalize_zh_answer(s):
    """Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."  # noqa
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))


def first_int_match(prediction, ground_truth):
    pred_list = re.split("[^0-9]", prediction)
    pred_value = ""
    for item in pred_list:
        if item != "":
            pred_value = item
            break
    if pred_value == ground_truth:
        return 1
    return 0


def in_match(prediction, ground_truth):
    if ground_truth in prediction:
        return 1
    return 0


def rouge_score(prediction, ground_truth, **kwargs) -> float:
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except:  # noqa
        return 0.0
    return scores["rouge-l"]["f"]  # type: ignore


def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
    score = rouge_score(prediction, ground_truth)
    return score


def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def qa_f1_score(line):
    prediction = line["pred"]

    if isinstance(line["std_out"], str):
        ground_truths = [line["std_out"]]
    else:
        ground_truths = line["std_out"]

    score = 0
    for ground_truth in ground_truths:
        normalized_prediction = normalize_answer(prediction)
        normalized_ground_truth = normalize_answer(ground_truth)

        prediction_tokens = normalized_prediction.split()
        ground_truth_tokens = normalized_ground_truth.split()
        score = max(score, f1_score(prediction_tokens, ground_truth_tokens))

    return score


def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [
        normalize_zh_answer(token) for token in prediction_tokens
    ]
    ground_truth_tokens = [
        normalize_zh_answer(token) for token in ground_truth_tokens
    ]
    prediction_tokens = [
        token for token in prediction_tokens if len(token) > 0
    ]
    ground_truth_tokens = [
        token for token in ground_truth_tokens if len(token) > 0
    ]
    return f1_score(prediction_tokens, ground_truth_tokens)


def truncate_input(input, max_length, manner="middle"):
    if len(input) <= max_length:
        return input
    if manner == "middle":
        return input[0 : max_length // 2] + input[-max_length // 2 :]
    else:
        return None


if __name__ == "__main__":
    data_dir = Path("../data")
    data_path = data_dir / "shorter/longdialogue_qa_eng_1000.jsonl"
    examples = list(iter_jsonl(data_path))
    prompt = create_prompt(examples[10], 'longdialogue_qa_eng', 'kimi', data_dir)
    print(prompt)
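`truncate_input` above keeps the head and tail of an over-long token sequence and drops the middle, so the instructions near the start and the question near the end both survive truncation. A quick sketch of the behavior:

```python
def truncate_middle(tokens, max_length):
    # Same logic as truncate_input(..., manner="middle") above.
    if len(tokens) <= max_length:
        return tokens
    return tokens[: max_length // 2] + tokens[-max_length // 2 :]

print(truncate_middle(list(range(10)), 4))  # [0, 1, 8, 9]
```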
evaluation/infinite_bench_eval/prepare_json_from_pxout.py
ADDED
@@ -0,0 +1,162 @@
from pathlib import Path
import json
import time
from datetime import datetime

from args import parse_args
from eval_utils import dump_jsonl, get_answer

def load_out_sep_by_assistant(pxout_txt_fn):
    predictions = list()
    with open(pxout_txt_fn, 'r') as f:
        one_answer = ''
        for line in f.readlines():
            line = line.strip()
            if line.startswith('assistant: '):
                if len(one_answer) > 0:
                    predictions.append(one_answer)
                    one_answer = ''
                one_answer = line[len('assistant: '):]
            else:
                one_answer += '\n' + line
        if len(one_answer) > 0:
            predictions.append(one_answer)
    return predictions

def load_out(pxout_txt_fn):
    outs = list()
    with open(pxout_txt_fn) as br:
        for aline in br.readlines():
            aline = aline.strip()
            outs.append(aline)
    return outs

def load_ref_longbook_qa_eng(pxref_json_fn):
    refs = list()
    with open(pxref_json_fn) as br:
        file_contents = br.read()
    file_contents_json = json.loads(file_contents)
    for asample in file_contents_json:
        ref = asample['answers']  # NOTE keep this as a list!
        refs.append(ref)
    return refs

def load_ref_longbook_choice_eng(pxref_json_fn, task):
    refs = list()
    with open(pxref_json_fn) as br:
        file_contents = br.read()
    file_contents_json = json.loads(file_contents)
    for asample in file_contents_json:
        asample['options'] = asample['multichoice_options']
        asample['answer'] = asample['answers']
        ref = get_answer(asample, task)
        refs.append(ref)
    return refs

def load_ref_sets7(ref_jsonl_fn):
    refs = list()
    with open(ref_jsonl_fn, 'r') as br:
        for aline in br.readlines():
            ref = json.loads(aline)
            ref_ans = ref['answer']
            refs.append(ref_ans)
    return refs

def load_ref(pxref_json_fn, task):
    if task in ['longbook_qa_eng', "longbook_sum_eng", "longdialogue_qa_eng", 'longbook_qa_eng_ret']:
        return load_ref_longbook_qa_eng(pxref_json_fn)
    elif task == 'longbook_choice_eng' or task == 'longbook_choice_eng_ret':
        return load_ref_longbook_choice_eng(pxref_json_fn, task)
    else:
        #raise ValueError("task={} not supported yet.".format(task))
        return load_ref_sets7(pxref_json_fn)

def combine_to_infb(outs, refs, output_path):
    max_len = min(len(outs), len(refs))
    if len(outs) < len(refs):
        print("Warning: {} lines in prediction, fewer than {} lines in ref".format(len(outs), len(refs)))
        refs = refs[:max_len]
    if len(refs) < len(outs):
        print("Warning: {} lines in prediction, more than {} lines in ref".format(len(outs), len(refs)))
        outs = outs[:max_len]

    preds = list()
    for i in range(0, max_len):
        preds.append(
            {
                "id": i,
                "prediction": outs[i],
                "ground_truth": refs[i],  # TODO must be a list, not a str!
            }
        )
    dump_jsonl(preds, output_path)
    print('done. saved id-pred-ref to {}'.format(output_path))

def is_sep_by_assistant(testout_txt_fn):
    out_flag = False
    with open(testout_txt_fn) as br:
        for aline in br.readlines():
            if aline.startswith('assistant: '):
                out_flag = True
                break
    return out_flag

ALL_TASKS = [
    "passkey",
    "number_string",
    "kv_retrieval",
    "longdialogue_qa_eng",
    "longbook_sum_eng",
    "longbook_choice_eng",
    "longbook_qa_eng",
    "longbook_qa_chn",
    "math_find",
    "math_calc",
    "code_run",
    "code_debug",
]

if __name__ == "__main__":
    args = parse_args()
    # args.task for task name in ALL_TASKS
    # args.pxout_txt for predicted output file
    # args.pxref_json for test.json reference file

    if args.task is None or args.task not in ALL_TASKS:
        raise ValueError('Error: task name [{}] is None or not in {}'.format(args.task, ALL_TASKS))

    if args.pxout_txt is None or not Path(args.pxout_txt).exists():
        raise ValueError('Error: system prediction file [{}] is None or does not exist.'.format(args.pxout_txt))

    if args.pxref_json is None or not Path(args.pxref_json).exists():
        raise ValueError('Error: system reference file [{}] is None or does not exist.'.format(args.pxref_json))

    if args.sep_by_assistant and is_sep_by_assistant(args.pxout_txt):
        outs = load_out_sep_by_assistant(args.pxout_txt)
    else:
        outs = load_out(args.pxout_txt)

    refs = load_ref(args.pxref_json, args.task)

    # determine the output json file name:
    if args.pxout_ref_json is None:
        flag = str(datetime.now()).replace(' ', '-').replace(':', '-')
        output_path = args.pxout_txt + '.' + flag + '.json'
    else:
        output_path = args.pxout_ref_json

    print('combine tst.out and ref, output to file: {}'.format(output_path))

    # combine tst.out and ref to <ref, test.out> for next step scoring:
    combine_to_infb(outs, refs, output_path)
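The `--sep_by_assistant` path groups multi-line generations: every line starting with `assistant: ` opens a new sample, and continuation lines are appended to the current one. A self-contained sketch of the same splitting logic on an in-memory list (toy input):

```python
def split_by_assistant(lines):
    # Mirrors load_out_sep_by_assistant() above.
    predictions, one_answer = [], ""
    for line in lines:
        line = line.strip()
        if line.startswith("assistant: "):
            if one_answer:
                predictions.append(one_answer)
            one_answer = line[len("assistant: "):]
        else:
            one_answer += "\n" + line
    if one_answer:
        predictions.append(one_answer)
    return predictions

lines = ["assistant: The butler did it.", "assistant: B", "...continued."]
print(split_by_assistant(lines))
# ['The butler did it.', 'B\n...continued.']
```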
evaluation/infinite_bench_eval/prompt.py
ADDED
@@ -0,0 +1,67 @@
gpt4_templates = {
    "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n\n{input}",  # noqa
    "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}",  # noqa
    "kv_retrieval": "Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n\n{input}",  # noqa
    # "longbook_sum_eng": "Summarize the book below:\n\n{context}",  # noqa
    "longbook_qa_eng": "Read the book below and answer a question.\n\n{context}\n\nQuestion: {question}\n\nBe very concise.",  # noqa
    "longbook_choice_eng": "Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}",  # noqa
    "longbook_sum_eng": "Summarize the following book.\n\n{context}",  # noqa
    "longbook_qa_chn": "请根据以下书籍回答我的问题。\n\n{context}\n\n问题:{question}\n请尽量简短地回答。",  # noqa
    "math_find": "{prefix}\n\n{context}\n\n{input}",
    "math_calc": "Compute the intermediate values in the following long expression.\n\n{context}",  # noqa
    "code_run": "Following is a set of Python functions. There is a function called named {func}.\n\n{context}\n\nPlease give me the exact number of the return value of {func_call}. Be concise. Your response must end with the final returned value.",  # noqa
    "code_debug": "There is ONLY ONE function in the large project that is deliberately made to include an obvious error. Please find the function that contains the most obvious errors. I will give you four options to narrow your scope. You can inspect the options and think. Eventually, tell me the answer using one single letter (A, B, C, or D).\n\n{context}\n\nWhich funtion has deliberate error?\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nYou should first find the functions in the options. Repeat their content, inspect through code, and at last give me your answer for the function that has the deliberate and obvious error in A, B, C, or D.",  # noqa
    "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe dialogue:\n\n---\n\n{context}\n\n---\n\nEnd of dialogue.\n\nWhich character is most likely \"$$MASK$$\"? Just say the name used by the scriptwriter (before the colon marks) of one single character and nothing else.",  # noqa
}

yarn_mistral_templates = {
    "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize it. I will quiz you about the important information.\n\n{context}\n\n{input}\n\nThe pass key is",  # noqa
    "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}\n\nThe sequence of digits is",  # noqa
    "kv_retrieval": "Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n\n{input}",  # noqa
    "longbook_sum_eng": "Summarize the book below.\n\n{context}\n\nSummary:",  # noqa
    "longbook_choice_eng": "Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nThe letter of the correct answer is",  # noqa
    "longbook_qa_eng": "Read the book and answer the question. Be very concise in your answer.\n\n{context}\n\nQuestion: {question}\nAnswer:",  # noqa
    "longbook_qa_chn": "阅读以下书籍然后回答问题。\n\n{context}\n\n问题:{question}\n答案:",  # noqa
    "math_find": "{prefix}\n\n{context}\n\n{input}",
    "math_calc": "Let us calculate the intermediate values of an expression.\n\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",  # noqa
    "code_run": "There is a function called {func} in the following Python code.\n\n{context}\n\nPlease compute the exact value of {func_call}. The value of {func_call} is",  # noqa
    "code_debug": "Following is a Python code where exactly one of the functions/methods has a deliberate error that makes it crash.\n\n{context}\n\nOptions:\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nThe correct option is:",  # noqa
    "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\n{context}\n\nThe name that has been replaced with $$MASK$$ is likely",  # noqa
}

claude2_templates = {
    "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n{input}\nThe pass key is",
    "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}\nThe sequence of digits is",  # noqa
    "kv_retrieval": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}",
    "longbook_sum_eng": "Summarize the following book.\n\n{context}",  # noqa
    "longbook_choice_eng": "Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}",  # noqa
    "longbook_qa_eng": "Read the novel below and answer a question:\n\n{context}\n\n{question}\nPlease answer as short as possible. The answer is: ",  # noqa
    "longbook_qa_chn": "请根据以下书籍回答我的问题。\n\n{context}\n\n问题:{question}\n请尽量简短地回答。",  # noqa
    "math_find": "{prefix}\n\n{context}\n\n{input}",
    "math_calc": "Let us calculate the intermediate values of an expression.\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",  # noqa
    "code_run": "In the file functions_module.py, there is a function called ${func}.\n\n\nHere is the content of functions_module.py:\n{context}\n\nPlease give me the exact number of the return value of {func_call}. Your response should end with the sentence \'The return value is:\'.",  # noqa
    "code_debug": "There is ONLY ONE function in the large project that is deliberately made to include an obvious error. Please find the function that contains the most obvious errors. I will give you four options to narrow your scope. You can inspect through the options and think. Eventually, tell me the answer using one single letter (A, B, C, or D).\n\n{context}\n\nWhich funtion has deliberate error?\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nYou should first find the functions in the options. Repeat their content, inspect through code, and at last give me your answer for the function that has the deliberate and obvious error in A, B, C, or D.",  # noqa
    "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe dialogue:\n\n---\n\n{context}\n\n---\n\nEnd of dialogue.\n\nWhich character is most likely \"$$MASK$$\"? Just say the name used by the scriptwriter (before the colon marks) of one single character and nothing else.",  # noqa
}

kimi_templates = {
    "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n{input}\nThe pass key is",  # noqa
    "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}\nThe sequence of digits is",  # noqa
    "kv_retrieval": "Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n{input}",  # noqa
    #"longbook_sum_eng": "Summarize the book below:\n\n{file:{context}}",  # noqa
    "longbook_sum_eng": "Summarize the book below:\n\n{context}",  # noqa
    #"longbook_choice_eng": "Read the book and answer the question.\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}" + "{file:{document}}",  # noqa
    "longbook_choice_eng": "Read the book and answer the question.\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}" + "{context}",  # noqa
    #"longbook_qa_eng": "Read the book below and answer a question.\n\nQuestion: {question}\n\nBe very concise." + "{file:{context}}",  # noqa
    "longbook_qa_eng": "Read the book below and answer a question.\n\nQuestion: {question}\n\nBe very concise." + "{context}",  # noqa
    #"longbook_qa_chn": "阅读以下书籍然后回答问题。\n\n问题:{question}\n答案:" + "{file:{context}}",  # noqa
    "longbook_qa_chn": "阅读以下书籍然后回答问题。\n\n问题:{question}\n答案:" + "{context}",  # noqa
    "math_find": "{prefix}\n\n{context}\n\n{input}",
    "math_calc": "Let us calculate the intermediate values of an expression.\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",  # noqa
    "code_run": "In the file functions_module.py, there is a function called ${func}.\n\n\nHere is the content of functions_module.py:\n\nPlease give me the exact number of the return value of ${func_call}. Your response should end with the sentence 'The return value is:'." + "{context}",  # noqa
    "code_debug": "Below is a code repository where there is one single function with bugs that causes an error. Please tell me the name of that function.\nWhich function has bugs? Give me the final answer in this format: \"[FINAL ANSWER: XXX]\". Don't say anything else." + "{context}",  # noqa
    #"code_debug": "Below is a code repository where there is one single function with bugs that causes an error. Please tell me the name of that function.\nWhich function has bugs? Give me the final answer in this format: \"[FINAL ANSWER: XXX]\". Don't say anything else." + "{fcontext}",  # noqa
    # "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe name that has been replaced with $$MASK$$ is likely" + "{context}",  # noqa
    "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is. Give me the answer using the name before the colons, don't say anything else.\n\n{context}",  # noqa
}
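Each template is a plain `str.format` string, so building a prompt is a one-liner; a sketch with toy values (assumes the file above is saved as prompt.py):

```python
from prompt import yarn_mistral_templates

prompt = yarn_mistral_templates["longbook_qa_eng"].format(
    context="<full book text goes here>",  # toy placeholder
    question="Who is the narrator?",
)
print(prompt)
```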
evaluation/infinite_bench_eval/requirements.txt
ADDED
@@ -0,0 +1,12 @@
openai
tiktoken
#torch==1.13
#transformers==4.35.0
#accelerate==0.20.3
evaluate
#==0.4.1
xopen
jieba
rouge
nltk
rouge_score
evaluation/infinite_bench_eval/test_vllm.sh
ADDED
@@ -0,0 +1,64 @@
#########################################################################
# File Name: 1.comb.tstout.ref.sh
# Author: Xianchao Wu
# mail: xianchaow@nvidia.com
# Created Time: Tue Jun 11 08:14:18 2024
#########################################################################
#!/bin/bash

datetime=$(date +%Y%m%d)
echo ${datetime}

model_path="" # Your_model_path https://huggingface.co/nvidia/Llama3-ChatQA-2-70B/
indir=${model_path}/outputs
data_home="" # https://huggingface.co/nvidia/Llama3-ChatQA-2-70B/tree/main/data

function longbook_eng_eval(){
    data_name=$1
    pxout_txt=$2
    pxref_json=$3

    task_name="${data_name}_eng"

    pxout_ref_json="${pxout_txt}.${datetime}.json"

    python3 prepare_json_from_pxout.py --task ${task_name} \
        --pxout_txt ${pxout_txt} \
        --pxref_json ${pxref_json} \
        --pxout_ref_json ${pxout_ref_json} \
        --sep_by_assistant

    if [[ $task_name =~ "longbook_choice" ]]
    then
        python3 compute_scores_2sets.py \
            --task ${task_name} \
            --pxout_ref_json ${pxout_ref_json} \
            --model_name pxlong \
            --use_zero_scrolls
    else
        python3 compute_scores_2sets.py \
            --task ${task_name} \
            --pxout_ref_json ${pxout_ref_json} \
            --model_name pxlong
    fi
}

for afile in `ls $indir/long*.txt`
do
    echo $afile
    for data_name in "longbook_qa" "longbook_choice" "longbook_sum" "longdialogue_qa"
    do
        task_name="${data_name}_eng"
        if [[ $afile =~ $task_name ]]
        then
            # TODO
            echo "do $task_name for $afile"
            pxref_json="${data_home}/${task_name}_gpt4_same/test.json"
            echo ${data_name} ${afile} ${pxref_json}
            longbook_eng_eval ${data_name} ${afile} ${pxref_json}
            echo "--------"
        fi
        #break
    done
    #break
done