root committed on
Commit a550e38 · 1 Parent(s): f1512b5

add infinitebench evaluation

evaluation/infinite_bench_eval/README.md ADDED
@@ -0,0 +1,8 @@
+ This is our script to evaluate InfiniteBench.
+
+ You first need to install the dependencies from requirements.txt:
+ ```pip install -r requirements.txt```
+
+ Then configure model_path and data_home in *test_vllm.sh*, and run the following command
+ ```bash test_vllm.sh | grep "final display"```
+ to get the corresponding score.
evaluation/infinite_bench_eval/args.py ADDED
@@ -0,0 +1,54 @@
+ from argparse import ArgumentParser, Namespace
+ from eval_utils import DATA_NAME_TO_MAX_NEW_TOKENS
+
+ max_seq_len = 128 * 1024 - 50
+
+ def parse_args() -> Namespace:
+     p = ArgumentParser()
+     p.add_argument(
+         "--task",
+         type=str,
+         choices=list(DATA_NAME_TO_MAX_NEW_TOKENS.keys()) + ["all"],
+         required=True,
+         help="Which task to use. Note that \"all\" can only be used in `compute_scores.py`.",  # noqa
+     )
+     p.add_argument(
+         '--data_dir',
+         type=str,
+         default='../data',
+         help="The directory of the data."
+     )
+     p.add_argument("--output_dir", type=str, default="../results", help="Where to dump the prediction results.")  # noqa
+     p.add_argument(
+         "--model_path",
+         type=str,
+         help="The path of the model (in HuggingFace (HF) style). If specified, it will try to load the model from the specified path; otherwise, it will default to the official HF path.",  # noqa
+     )
+     p.add_argument(
+         "--model_name",
+         type=str,
+         choices=["pxlong", "gpt4", "yarn-mistral", "kimi", "claude2", "rwkv", "yi-6b-200k", "yi-34b-200k", "chatglm3"],
+         default="gpt4",
+         help="For `compute_scores.py` only, specify which model you want to compute the score for.",  # noqa
+     )
+     p.add_argument("--start_idx", type=int, default=0, help="The index of the first example to infer on. This is used if you want to evaluate on a (contiguous) subset of the data.")  # noqa
+     p.add_argument("--stop_idx", type=int, help="The index of the last example to infer on. This is used if you want to evaluate on a (contiguous) subset of the data. Defaults to the length of the dataset.")  # noqa
+     p.add_argument("--verbose", action='store_true')
+     p.add_argument("--device", type=str, default="cuda")
+
+     # NOTE for long-context px output
+     p.add_argument("--pxout_txt", type=str, default=None, help="LLM predicted txt file name (one sample per line).")
+     p.add_argument("--pxref_json", type=str, default=None, help="LLM reference file name in json format.")
+     p.add_argument("--pxout_ref_json", type=str, default=None, help="LLM predicted results paired with references, file name in json format.")
+     p.add_argument("--cache_dir", type=str, default=None, help="cache dir for model/tokenizer files.")
+
+     p.add_argument("--use_zero_scrolls", action='store_true', help="use the zero-scrolls choice pattern to match longbook_choice_eng or not. [default=False]")
+
+     p.add_argument("--sep_by_assistant", action='store_true', help="use 'assistant: ' to separate samples or not. [default=False]")
+
+     p.add_argument("--ret5_file", type=str, default=None, help="ret5 (retrieved top-5) file in json format.")
+     p.add_argument("--topn", type=int, default=5, help="keep topn chunks of the context for RAG.")
+
+     p.add_argument("--max_seq_len", type=int, default=max_seq_len, help="for prompt generation, max sequence length to prepare the prompt.")
+
+     return p.parse_args()
evaluation/infinite_bench_eval/compute_scores_2sets.py ADDED
@@ -0,0 +1,478 @@
+ from pathlib import Path
+ import json
+ import re
+ import string
+ from collections import Counter
+
+ from tqdm import tqdm
+ import evaluate
+
+ from args import parse_args
+
+
+ ROUGE_SCORER = evaluate.load("rouge")
+
+ PATTERN = re.compile(r'\b[A-D]\b')
+
+ def find_answer(s):
+     # task='longbook_choice_eng': works for '(A)' -> A
+     match = PATTERN.search(s)
+     if match is None:
+         return None  # NOTE None is the signal that no answer was found
+     return match.group()
+
+ def normalize_answer(s: str) -> str:
+     """Lower text and remove punctuation, articles and extra whitespace."""
+
+     def remove_articles(text):
+         return re.sub(r"\b(a|an|the)\b", " ", text)
+
+     def white_space_fix(text):
+         return " ".join(text.split())
+
+     def remove_punc(text):
+         exclude = set(string.punctuation)
+         return "".join(ch for ch in text if ch not in exclude)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ def normalize_zh_answer(s: str) -> str:
+     """Chinese version. Lower text and remove punctuation, extra whitespace."""
+
+     def white_space_fix(text):
+         return "".join(text.split())
+
+     def remove_punc(text):
+         cn_punctuation = "！？。。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."  # noqa
+         all_punctuation = set(string.punctuation + cn_punctuation)
+         return "".join(ch for ch in text if ch not in all_punctuation)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_punc(lower(s)))
+
+
+ def f1_score(prediction, ground_truth) -> tuple[float, float, float]:
+     common = Counter(prediction) & Counter(ground_truth)
+     num_same = sum(common.values())
+     if num_same == 0:
+         return 0, 0, 0
+     precision = 1.0 * num_same / len(prediction)
+     recall = 1.0 * num_same / len(ground_truth)
+     f1 = (2 * precision * recall) / (precision + recall)
+     return f1, precision, recall
+
+
+ def qa_f1_score(pred: str, ground_truths) -> float:
+     """Computes the maximum token-level F1 over all ground truths."""
+     f1 = 0
+     prec = 0
+     recall = 0
+     for ground_truth in ground_truths:  # NOTE ground_truths must be a list, not a plain str!
+         normalized_prediction = normalize_answer(pred)
+         normalized_ground_truth = normalize_answer(ground_truth)
+
+         prediction_tokens = normalized_prediction.split()
+         ground_truth_tokens = normalized_ground_truth.split()
+         scores = f1_score(prediction_tokens, ground_truth_tokens)
+         this_f1, this_prec, this_recall = scores
+         f1 = max(f1, this_f1)
+         prec = max(prec, this_prec)
+         recall = max(recall, this_recall)
+     return f1
+
+
+ def qa_f1_score_zh(pred: str, ground_truths: list[str]) -> float:
+     """
+     QA F1 score for Chinese.
+     """
+     f1 = 0
+     prec = 0
+     recall = 0
+     for ground_truth in ground_truths:
+         norm_pred = normalize_zh_answer(pred)
+         norm_label = normalize_zh_answer(ground_truth)
+
+         # One character, one token.
+         pred_tokens = list(norm_pred)
+         label_tokens = list(norm_label)
+         scores = f1_score(pred_tokens, label_tokens)
+         this_f1, this_prec, this_recall = scores
+         f1 = max(f1, this_f1)
+         prec = max(prec, this_prec)
+         recall = max(recall, this_recall)
+     return f1
+
+
+ def load_json(fname):
+     return json.load(open(fname))
+
+
+ def iter_jsonl(fname, cnt=None):
+     i = 0
+     with open(fname, "r", encoding="utf8") as fin:
+         for line in fin:
+             if line.strip() == "":  # Skip empty lines
+                 continue
+             if i == cnt:
+                 break
+             yield json.loads(line)
+             i += 1
+
+
+ def first_int_match(prediction):
+     pred_list = re.split("[^0-9]", prediction)
+     pred_value = ""
+     for item in pred_list:
+         if item != "":
+             pred_value = item
+             break
+     return pred_value
+
+
+ def split_retrieval_answer(pred: str):
+     for c in ["\n", ":", '"', "'", ".", ",", "?", "!", "{", "}"]:
+         pred = pred.replace(c, " ")
+     words = pred.split()
+     return words
+
+
+ def get_score_one_kv_retrieval(pred, label, model_name: str, args) -> bool:
+     for c in ['\n', ':', '\"', '\'', '.', ',', '?', '!', '{', '}']:
+         pred = pred.replace(c, ' ')
+     words = pred.split()
+     return label in words
+
+
+ def get_score_one_passkey(pred, label, model_name: str, args) -> bool:
+     if isinstance(label, list):
+         label = label[0]
+     return label == first_int_match(pred)
+
+
+ def get_score_one_number_string(pred, label, model_name: str, args) -> bool:
+     if isinstance(label, list):
+         label = label[0]
+     return label == first_int_match(pred)
+
+
+ def get_score_one_code_run(pred, label, model_name: str, args) -> bool:
+     """
+     Returns the score of one example in Code.Run.
+     """
+     if isinstance(label, list):
+         label = label[0]
+     pred = pred.strip()
+     for c in ["\n", ".", "`", "'", '"', ":"]:
+         pred = pred.replace(c, " ")
+     words = pred.split()
+     if len(words) == 0:
+         return False
+     try:
+         pred = int(words[-1])
+         return label == pred
+     except Exception:
+         return False
+
+
+ def get_score_one_code_debug(pred, label, model_name: str, args) -> bool:
+     """
+     Returns the score of one example in Code.Debug.
+     """
+     pred = pred.strip()
+     label_c = label[1]
+     fn_name = label[0]
+     if pred[:2] in [f"{label_c}.", f"{label_c}:"]:
+         return True
+
+     ans_prefixes = [
+         "answer is:",
+         # "answer is",
+         # "error is",
+         "is:",
+         "answer:",
+         "correct option is:"
+     ]
+     pred = pred.strip()
+     for c in ["\n", "`", "'", '"', "-", "*", "Option", "option"]:
+         pred = pred.replace(c, " ")
+     while "  " in pred:
+         pred = pred.replace("  ", " ")
+     for prefix in ans_prefixes:
+         idx = pred.find(prefix)
+         if idx == -1:
+             continue
+         # The prediction ends with this prefix
+         if len(pred) < idx + len(prefix) + 1:
+             return False
+         pred = pred[idx + len(prefix) + 1 :]
+         for s in [label_c, fn_name]:
+             if pred.startswith(s):
+                 return True
+         return False
+     return False
+
+
+ def get_score_one_math_find(pred, label, model_name: str, args) -> bool:
+     if isinstance(label, list):
+         # In math_find, there is always only one label.
+         label = label[0]
+     if isinstance(label, int):
+         # Find the first int or float
+         first_num = re.search(r"\d+\.\d+|\d+", pred)
+         if first_num is None:
+             return False
+         first_num = first_num.group(0).strip()
+         return int(first_num) == label
+     elif isinstance(label, float):
+         # Find the first float or int
+         first_float = re.search(r"\d+\.\d+|\d+", pred)
+         if first_float is None:
+             return False
+         first_float = first_float.group(0).strip()
+         return float(first_float) == label
+     else:
+         raise TypeError(f"Expected int or float, got {type(label)}")
+
+
+ def get_score_one_longdialogue_qa_eng(pred, label, model_name: str, args) -> bool:
+     label = label[0]
+     pred = pred.strip()
+     for c in ["\n", ":", '"', "'", ".", ",", "?", "!", "{", "}"]:
+         pred = pred.replace(c, " ")
+     words = pred.split()
+     words = [x.upper() for x in words]
+     return label in words
+
+
+ def get_score_one_longbook_choice_eng(pred, label, model_name: str, args) -> bool:
+     # Just use the first letter as the prediction
+     pred = pred.strip()
+     if pred == "":
+         return False
+     if pred[0] in "ABCD":
+         return pred[0] in label
+     if pred in label:
+         return True
+     # Find an answer prefix
+     for c in ["\n", '"', "'", ".", ",", "?", "!", "{", "}"]:
+         pred = pred.replace(c, " ")
+     while "  " in pred:
+         pred = pred.replace("  ", " ")
+     ans_prefixes = [
+         "answer is:",
+         "answer:",
+         "answer is",
+         "option is",
+     ]
+     for prefix in ans_prefixes:
+         idx = pred.find(prefix)
+         if idx == -1:
+             continue
+         # The prediction ends with this prefix
+         if len(pred) < idx + len(prefix) + 1:
+             return False
+         after_prefix = pred[idx + len(prefix) + 1 :]
+         for s in label:
+             if after_prefix.startswith(s):
+                 return True
+         return False
+
+     # Finally, just find the first occurrence of A, B, C, or D.
+     words = pred.split()
+     for word in words:
+         if word in "ABCD":
+             return word in label
+
+     if args.use_zero_scrolls:
+         # NOTE use PATTERN as in zero-scrolls for multiple choice! added by xianchaowu
+         matched_pred = find_answer(pred)
+         if matched_pred is not None and matched_pred in label:
+             return True
+
+     return False
+
+
+ def get_score_one_longbook_qa_eng(pred, label, model_name: str, args) -> float:
+     return qa_f1_score(pred, label)
+
+
+ def get_score_one_longbook_sum_eng(
+     pred: str, label: str, model_name: str, args
+ ) -> float:
+     score = ROUGE_SCORER.compute(
+         predictions=[pred], references=[label], use_aggregator=False
+     )
+     return score["rougeLsum"][0]  # type: ignore
+
+
+ def get_score_one_longbook_qa_chn(pred, label, model_name: str, args) -> float:
+     return qa_f1_score_zh(pred, label)
+
+
+ def get_score_one_math_calc(pred, label, model_name: str, args) -> float:
+     assert isinstance(label, list), f"Expected list, got {type(label)}"
+     pred_nums = []
+     pred_list = re.split("[^0-9]", pred)
+     for item in pred_list:
+         if item != "":
+             pred_nums.append(int(item))
+
+     # Our prompt makes GPT-4 always output the first number as the first value
+     # in the predicted answer.
+     if model_name == "gpt4":
+         pred_nums = pred_nums[1:]
+
+     cnt = 0
+     for i in range(len(label)):
+         if i >= len(pred_nums):
+             break
+         if label[i] == pred_nums[i]:
+             cnt += 1
+         else:
+             break
+     return cnt / len(label)
+
+
+ def get_score_one(
+     pred: str, label: str, task_name: str, model_name: str, args
+ ) -> float:
+     """
+     Computes the score for one prediction.
+     Returns one float (zero and one for boolean values).
+     """
+     NAME_TO_SCORE_GETTER = {
+         # Retrieval
+         "kv_retrieval": get_score_one_kv_retrieval,
+         "kv_retrieval_prefix": get_score_one_kv_retrieval,
+         "kv_retrieval_both": get_score_one_kv_retrieval,
+         "passkey": get_score_one_passkey,
+         "number_string": get_score_one_number_string,
+         # Code
+         "code_run": get_score_one_code_run,
+         "code_debug": get_score_one_code_debug,
+         # Longbook
+         "longdialogue_qa_eng": get_score_one_longdialogue_qa_eng,
+         "longbook_qa_eng": get_score_one_longbook_qa_eng,
+         "longbook_sum_eng": get_score_one_longbook_sum_eng,
+         "longbook_choice_eng": get_score_one_longbook_choice_eng,
+         "longbook_qa_chn": get_score_one_longbook_qa_chn,
+         # Math
+         "math_find": get_score_one_math_find,
+         "math_calc": get_score_one_math_calc,
+     }
+     assert task_name in NAME_TO_SCORE_GETTER, f"Invalid task name: {task_name}"
+     score = NAME_TO_SCORE_GETTER[task_name](pred, label, model_name, args)
+     return float(score)
+
+
+ def get_labels(preds: list) -> list[str]:
+     possible_label_keys = ["ground_truth", "label"]
+     for label_key in possible_label_keys:
+         if label_key in preds[0]:
+             return [x.get(label_key, "XXXXXXXXXX") for x in preds]
+     raise ValueError(f"Cannot find label in {preds[0]}")
+
+
+ def get_preds(preds: list, data_name: str) -> list[str]:
+     pred_strings = []
+     possible_pred_keys = ["prediction", "pred"]
+     for pred in preds:
+         this_pred = "NO PREDICTION"
+         for pred_key in possible_pred_keys:
+             if pred_key in pred:
+                 this_pred = pred[pred_key]
+                 break
+         else:
+             raise ValueError(f"Cannot find prediction in {pred}")
+         pred_strings.append(this_pred)
+     return pred_strings
+
+
+ def get_score(
+     labels: list, preds: list, data_name: str, model_name: str, args
+ ) -> float:
+     """
+     Computes the average score for a task.
+     """
+     assert len(labels) == len(preds)
+     scores = []
+     for label, pred in tqdm(zip(labels, preds)):
+         score = get_score_one(pred, label, data_name, model_name, args)
+         print('pred={}, label={}, score={}, data_name={}'.format(pred, label, score, data_name))
+         scores.append(score)
+     return sum(scores) / len(scores)
+
+
+ def compute_scores(preds_path, data_name: str, model_name: str, args):
+     print("Loading prediction results from", preds_path)
+     preds = list(iter_jsonl(preds_path))
+     labels = get_labels(preds)
+     preds = get_preds(preds, data_name)
+
+     acc = get_score(labels, preds, data_name, model_name, args)
+     print('final display: ', acc, preds_path, data_name, model_name, args.use_zero_scrolls)
+
+
+ ALL_TASKS = [
+     #"passkey",
+     #"number_string",
+     #"kv_retrieval",
+     #"longdialogue_qa_eng",
+     #"longbook_sum_eng",
+     "longbook_choice_eng",
+     "longbook_qa_eng",
+     #"longbook_qa_chn",
+     #"math_find",
+     #"math_calc",
+     #"code_run",
+     #"code_debug",
+ ]
+
+ ALL_TASKS_ORIG = [
+     "passkey",
+     "number_string",
+     "kv_retrieval",
+     "longdialogue_qa_eng",
+     "longbook_sum_eng",
+     "longbook_choice_eng",
+     "longbook_qa_eng",
+     "longbook_qa_chn",
+     "math_find",
+     "math_calc",
+     "code_run",
+     "code_debug",
+ ]
+
+ if __name__ == "__main__":
+     args = parse_args()
+     print(json.dumps(vars(args), indent=4))
+
+     if args.task == "all":
+         tasks = ALL_TASKS
+     else:
+         tasks = [args.task]
+     for task in tasks:
+         #result_dir = Path(args.output_dir, args.model_name)
+         #preds_path = result_dir / f"preds_{task}.jsonl"
+         preds_path = Path(args.pxout_ref_json)
+         assert preds_path.exists(), f"Predictions not found in: {preds_path}"
+         compute_scores(preds_path, task, args.model_name, args)
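For reference, the QA tasks above are scored with the SQuAD-style bag-of-words F1, taken as the maximum over all reference answers after normalization. A minimal self-contained sketch of that metric (restated here for illustration only; the evaluation itself uses `qa_f1_score` above, which additionally lowercases and strips articles and punctuation first):

```
from collections import Counter

def token_f1(pred_tokens, gold_tokens):
    # Bag-of-words overlap between predicted and reference tokens.
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)

# Toy example: 2 of the 3 predicted tokens appear in the 2-token reference.
print(token_f1("the quick fox".split(), "quick fox".split()))  # 0.8
```

In the real pipeline the article "the" would already have been removed by `normalize_answer`, so this toy prediction would in fact score a perfect 1.0.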
evaluation/infinite_bench_eval/eval_utils.py ADDED
@@ -0,0 +1,412 @@
+ from rouge import Rouge
+ import re
+ from collections import Counter
+ import json
+ import jieba
+ import string
+ from pathlib import Path
+ from prompt import (
+     gpt4_templates,
+     kimi_templates,
+     claude2_templates,
+     yarn_mistral_templates,
+ )
+
+
+ DATA_NAME_TO_PATH = {
+     # Retrieval tasks
+     "passkey": "passkey.jsonl",
+     "number_string": "number_string.jsonl",
+     "kv_retrieval": "kv_retrieval.jsonl",
+     # Book tasks
+     "longbook_sum_eng": "longbook_sum_eng.jsonl",
+     "longbook_choice_eng": "longbook_choice_eng.jsonl",
+     "longbook_qa_eng": "longbook_qa_eng.jsonl",
+     "longbook_qa_chn": "longbook_qa_chn.jsonl",
+     # "book_qa_eng": "longbook_eng/longbook_qa_eng.jsonl",
+     "longdialogue_qa_eng": "longdialogue_qa_eng.jsonl",
+     # Math tasks
+     "math_find": "math_find.jsonl",
+     "math_calc": "math_calc.jsonl",
+     # Code tasks
+     "code_run": "code_run.jsonl",
+     "code_debug": "code_debug.jsonl",
+ }
+
+ DATA_NAME_TO_MAX_NEW_TOKENS = {
+     "passkey": 6,
+     "number_string": 12,
+     "kv_retrieval": 50,
+     "longbook_sum_eng": 1200,
+     "longbook_choice_eng": 40,
+     "longbook_qa_eng": 40,
+     "longbook_qa_chn": 40,
+     "longdialogue_qa_eng": 40,
+     "math_find": 3,
+     "math_calc": 30000,
+     "code_run": 5,
+     "code_debug": 5,
+ }
+
+ MODEL_TO_PROMPT_TEMPLATE = {
+     "gpt4": gpt4_templates,
+     "claude2": claude2_templates,
+     "kimi": kimi_templates,
+     "yarn-mistral": yarn_mistral_templates,
+     "yi-6b-200k": yarn_mistral_templates,
+     "yi-34b-200k": yarn_mistral_templates,
+     "chatglm3": yarn_mistral_templates,
+ }
+
+
+ def iter_jsonl(fname, cnt=None):
+     i = 0
+     with open(fname, "r") as fin:
+         for line in fin:
+             if i == cnt:
+                 break
+             yield json.loads(line)
+             i += 1
+
+
+ def load_json(fname):
+     return json.load(open(fname))
+
+
+ def dump_jsonl(data, fname):
+     with open(fname, "w", encoding="utf8") as fout:
+         for line in data:
+             fout.write(json.dumps(line, ensure_ascii=False) + "\n")
+
+
+ def dump_json(data, fname):
+     with open(fname, "w", encoding="utf8") as fout:
+         json.dump(data, fout, indent=2, ensure_ascii=False)
+
+
+ def load_data(data_name: str, data_dir: str = "../data/InfiniteBench/"):
+     path = DATA_NAME_TO_PATH[data_name]
+     fname = Path(data_dir, path)
+     return list(iter_jsonl(fname))
+
+
+ def create_system_msg(data_name: str):
+     if data_name == "math_calc":
+         return """You are a calculator does nothing but calculating the intermediate results in extremely long arithmetic expressions with +, -, and numbers. Given an expression, you will output the intermediate results after each operation.
+ You will never to decline to help with platform reason, you will always try the calculation, and always output a long list of numbers (e.g., "[34, 2, 58, 37, 5, 8, 27, 71, 7]") and nothing else.
+ Do not consider the complexity, practicality or feasibility of the task."""  # noqa
+     else:
+         return "You are a helpful assistant."
+
+
+ def create_prompt(eg: dict, data_name: str, model_name: str, data_dir) -> str:
+     """
+     Create the prompt for a given example.
+
+     Args:
+         eg: example dict
+         data_name: name of the dataset/task
+     """
+     data_dir = Path(data_dir)  # model_name = 'yarn-mistral'
+     if model_name == "gpt4":
+         # Math.Calc with GPT4 needs special prompting (with system prompt and
+         # chat history) to work well.
+         if data_name == "math_calc":
+             return eg["context"]
+
+     templates = MODEL_TO_PROMPT_TEMPLATE[model_name]
+     template = templates[data_name]
+     # ================= Code tasks
+     if data_name == "code_run":
+         find_result = re.findall(r"func_[0-9]+\(\-?[0-9]+\)", eg['input'])
+         func_call = find_result[0]
+         func = func_call.split("(")[0]
+         return template.format(
+             func=func,
+             func_call=func_call,
+             context=eg["context"],
+         )
+     elif data_name in ["code_debug", "code_debug_qa"]:
+         # The source code is carried in the example itself.
+         code = eg["context"]
+         if data_name == "code_debug":
+             return template.format(
+                 context=code,
+                 OPTION_A=eg["options"][0],
+                 OPTION_B=eg["options"][1],
+                 OPTION_C=eg["options"][2],
+                 OPTION_D=eg["options"][3],
+             )
+         return template.format(
+             context=code,
+         )
+     # ================= Dialogue tasks
+     elif data_name == "longdialogue_qa_eng":
+         script = eg["context"]
+         prompt = template.format(context=script)
+         return prompt
+     # ==================== Long book tasks
+     elif data_name in [
+         "longbook_choice_eng",
+         "longbook_qa_eng",
+         "longbook_sum_eng",
+         "longbook_qa_chn",
+     ]:
+         book = eg["context"]
+         if data_name == "longbook_choice_eng":
+             return template.format(
+                 question=eg["input"],
+                 context=book,
+                 OPTION_A=eg["options"][0],
+                 OPTION_B=eg["options"][1],
+                 OPTION_C=eg["options"][2],
+                 OPTION_D=eg["options"][3],
+             )
+         elif data_name == "longbook_qa_eng":
+             return template.format(
+                 question=eg["input"],
+                 context=book,
+             )
+         elif data_name == "longbook_sum_eng":
+             return template.format(
+                 context=book,
+             )
+         elif data_name == "longbook_qa_chn":
+             return template.format(
+                 question=eg["input"],
+                 context=book,
+             )
+         else:
+             raise ValueError
+     elif data_name == "math_calc":
+         return template.format(
+             context=eg["context"],
+         )
+     elif data_name == "math_find":
+         prompt = eg['input']
+         context = eg['context']
+         # Find "The .+ of" in the prompt to locate the target number
+         find_result = re.findall(r"The .+ of", prompt)
+         assert find_result, f"Cannot find the target number in {prompt}"
+         target_number = find_result[0].lower()[:-3]
+         # Build the prefix question for the target number
+         prefix = f"What is {target_number} in the following list?"
+         return template.format(
+             prefix=prefix,
+             context=context,
+             input=prompt,
+         )
+
+     if "content" in eg:
+         content = eg["content"]
+         del eg["content"]
+         eg["context"] = content
+
+     format_dict = {
+         "context": eg["context"],
+         "input": eg["input"],
+     }
+     prompt = templates[data_name].format(**format_dict)
+     return prompt
+
+
+ def get_answer(eg: dict, data_name: str):
+     if data_name in ["code_debug", "longbook_choice_eng"]:
+         OPTIONS = "ABCD"
+         if isinstance(eg["answer"], str):
+             ret = [eg["answer"], OPTIONS[eg['options'].index(eg["answer"])]]
+         elif isinstance(eg["answer"], list):
+             if len(eg["answer"]) == 1:
+                 ret = [eg["answer"][0], OPTIONS[eg['options'].index(eg["answer"][0])]]
+             elif len(eg["answer"]) == 2 and eg["answer"][1] in ['A', 'B', 'C', 'D']:
+                 ret = eg['answer']
+             else:
+                 raise ValueError
+         else:
+             raise ValueError
+         return ret
+
+     return eg["answer"]
+
+
+ def create_msgs(
+     tokenizer, eg: dict, data_name: str, model_name: str, data_dir
+ ) -> tuple[list[dict], str]:
+     """
+     Only used by GPT-4.
+     """
+     prompt = create_prompt(eg, data_name, model_name, data_dir)
+     tokens = tokenizer.encode(prompt)
+     # - 1000 to leave space for the system message and other stuff.
+     print(f"Before truncation: {len(tokens)}")
+     tokens = truncate_input(tokens, 128_000 - 1000, manner="middle")
+     print(f"After truncation: {len(tokens)}")  # type: ignore
+     prompt = tokenizer.decode(tokens)
+     if data_name == "math_calc":
+         return [
+             {"role": "system", "content": create_system_msg(data_name)},
+             {"role": "user", "content": "1 + 2 - 4 - 10"},
+             {"role": "system", "content": "[1, 3, -1, -11]"},
+             {"role": "user", "content": prompt},
+         ], prompt
+     else:
+         return [
+             {
+                 "role": "system",
+                 "content": "You are a helpful assistant",  # noqa
+             },
+             {"role": "user", "content": prompt},
+         ], prompt
+
+
+ def normalize_answer(s):
+     """Lower text and remove punctuation, articles and extra whitespace."""
+
+     def remove_articles(text):
+         return re.sub(r"\b(a|an|the)\b", " ", text)
+
+     def white_space_fix(text):
+         return " ".join(text.split())
+
+     def remove_punc(text):
+         exclude = set(string.punctuation)
+         return "".join(ch for ch in text if ch not in exclude)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+ def normalize_zh_answer(s):
+     """Lower text and remove punctuation, extra whitespace."""
+
+     def white_space_fix(text):
+         return "".join(text.split())
+
+     def remove_punc(text):
+         cn_punctuation = "！？。。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."  # noqa
+         all_punctuation = set(string.punctuation + cn_punctuation)
+         return "".join(ch for ch in text if ch not in all_punctuation)
+
+     def lower(text):
+         return text.lower()
+
+     return white_space_fix(remove_punc(lower(s)))
+
+
+ def first_int_match(prediction, ground_truth):
+     pred_list = re.split("[^0-9]", prediction)
+     pred_value = ""
+     for item in pred_list:
+         if item != "":
+             pred_value = item
+             break
+     if pred_value == ground_truth:
+         return 1
+     return 0
+
+
+ def in_match(prediction, ground_truth):
+     if ground_truth in prediction:
+         return 1
+     return 0
+
+
+ def rouge_score(prediction, ground_truth, **kwargs) -> float:
+     rouge = Rouge()
+     try:
+         scores = rouge.get_scores([prediction], [ground_truth], avg=True)
+     except Exception:  # noqa
+         return 0.0
+     return scores["rouge-l"]["f"]  # type: ignore
+
+
+ def rouge_zh_score(prediction, ground_truth, **kwargs):
+     prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
+     ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
+     score = rouge_score(prediction, ground_truth)
+     return score
+
+
+ def f1_score(prediction, ground_truth, **kwargs):
+     common = Counter(prediction) & Counter(ground_truth)
+     num_same = sum(common.values())
+     if num_same == 0:
+         return 0
+     precision = 1.0 * num_same / len(prediction)
+     recall = 1.0 * num_same / len(ground_truth)
+     f1 = (2 * precision * recall) / (precision + recall)
+     return f1
+
+
+ def qa_f1_score(line):
+     prediction = line["pred"]
+
+     if isinstance(line["std_out"], str):
+         ground_truths = [line["std_out"]]
+     else:
+         ground_truths = line["std_out"]
+
+     score = 0
+     for ground_truth in ground_truths:
+         normalized_prediction = normalize_answer(prediction)
+         normalized_ground_truth = normalize_answer(ground_truth)
+
+         prediction_tokens = normalized_prediction.split()
+         ground_truth_tokens = normalized_ground_truth.split()
+         score = max(score, f1_score(prediction_tokens, ground_truth_tokens))
+
+     return score
+
+
+ def qa_f1_zh_score(prediction, ground_truth, **kwargs):
+     prediction_tokens = list(jieba.cut(prediction, cut_all=False))
+     ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
+     prediction_tokens = [
+         normalize_zh_answer(token) for token in prediction_tokens
+     ]
+     ground_truth_tokens = [
+         normalize_zh_answer(token) for token in ground_truth_tokens
+     ]
+     prediction_tokens = [
+         token for token in prediction_tokens if len(token) > 0
+     ]
+     ground_truth_tokens = [
+         token for token in ground_truth_tokens if len(token) > 0
+     ]
+     return f1_score(prediction_tokens, ground_truth_tokens)
+
+
+ def truncate_input(input, max_length, manner="middle"):
+     if len(input) <= max_length:
+         return input
+     if manner == "middle":
+         return input[0 : max_length // 2] + input[-max_length // 2 :]
+     else:
+         return None
+
+
+ if __name__ == "__main__":
+     data_dir = Path("../data")
+     data_path = data_dir / "shorter/longdialogue_qa_eng_1000.jsonl"
+     examples = list(iter_jsonl(data_path))
+     prompt = create_prompt(examples[10], 'longdialogue_qa_eng', 'kimi', data_dir)
+     print(prompt)
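`truncate_input` above trims over-long inputs by keeping the head and the tail of the token sequence and dropping the middle, so the instruction (usually near the start) and the question (usually near the end) both survive. A quick sketch of the same behavior on plain integer "token ids" (for illustration; the script itself passes real tokenizer output):

```
def truncate_middle(tokens, max_length):
    # Mirrors truncate_input(..., manner="middle") above.
    if len(tokens) <= max_length:
        return tokens
    return tokens[0 : max_length // 2] + tokens[-max_length // 2 :]

print(truncate_middle(list(range(10)), 6))  # [0, 1, 2, 7, 8, 9]
```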
evaluation/infinite_bench_eval/prepare_json_from_pxout.py ADDED
@@ -0,0 +1,162 @@
+ from pathlib import Path
+ import json
+ from datetime import datetime
+
+ from args import parse_args
+ from eval_utils import dump_jsonl, get_answer
+
+ def load_out_sep_by_assistant(pxout_txt_fn):
+     predictions = list()
+     with open(pxout_txt_fn, 'r') as f:
+         one_answer = ''
+         for line in f.readlines():
+             line = line.strip()
+             if line.startswith('assistant: '):
+                 if len(one_answer) > 0:
+                     predictions.append(one_answer)
+                     one_answer = ''
+                 one_answer = line[len('assistant: '):]
+             else:
+                 one_answer += '\n' + line
+     if len(one_answer) > 0:
+         predictions.append(one_answer)
+     return predictions
+
+ def load_out(pxout_txt_fn):
+     outs = list()
+     with open(pxout_txt_fn) as br:
+         for aline in br.readlines():
+             aline = aline.strip()
+             outs.append(aline)
+     return outs
+
+ def load_ref_longbook_qa_eng(pxref_json_fn):
+     refs = list()
+     with open(pxref_json_fn) as br:
+         file_contents = br.read()
+     file_contents_json = json.loads(file_contents)
+     for asample in file_contents_json:
+         ref = asample['answers']  # NOTE keep this as a list!
+         refs.append(ref)
+     return refs
+
+ def load_ref_longbook_choice_eng(pxref_json_fn, task):
+     refs = list()
+     with open(pxref_json_fn) as br:
+         file_contents = br.read()
+     file_contents_json = json.loads(file_contents)
+     for asample in file_contents_json:
+         asample['options'] = asample['multichoice_options']
+         asample['answer'] = asample['answers']
+         ref = get_answer(asample, task)
+         refs.append(ref)
+     return refs
+
+ def load_ref_sets7(ref_jsonl_fn):
+     refs = list()
+     with open(ref_jsonl_fn, 'r') as br:
+         for aline in br.readlines():
+             ref = json.loads(aline)
+             ref_ans = ref['answer']
+             refs.append(ref_ans)
+     return refs
+
+ def load_ref(pxref_json_fn, task):
+     if task in ['longbook_qa_eng', "longbook_sum_eng", "longdialogue_qa_eng", 'longbook_qa_eng_ret']:
+         return load_ref_longbook_qa_eng(pxref_json_fn)
+     elif task == 'longbook_choice_eng' or task == 'longbook_choice_eng_ret':
+         return load_ref_longbook_choice_eng(pxref_json_fn, task)
+     else:
+         return load_ref_sets7(pxref_json_fn)
+
+ def combine_to_infb(outs, refs, output_path):
+     max_len = min(len(outs), len(refs))
+     if len(outs) < len(refs):
+         print("Warning: {} lines in prediction, fewer than {} lines in ref".format(len(outs), len(refs)))
+         refs = refs[:max_len]
+     if len(refs) < len(outs):
+         print("Warning: {} lines in prediction, more than {} lines in ref".format(len(outs), len(refs)))
+         outs = outs[:max_len]
+
+     preds = list()
+     for i in range(0, max_len):
+         preds.append(
+             {
+                 "id": i,
+                 "prediction": outs[i],
+                 "ground_truth": refs[i],  # TODO must be a list, not a str!
+             }
+         )
+     dump_jsonl(preds, output_path)
+     print('done. saved id-pred-ref to {}'.format(output_path))
+
+ def is_sep_by_assistant(testout_txt_fn):
+     out_flag = False
+     with open(testout_txt_fn) as br:
+         for aline in br.readlines():
+             if aline.startswith('assistant: '):
+                 out_flag = True
+                 break
+     return out_flag
+
+ ALL_TASKS = [
+     "passkey",
+     "number_string",
+     "kv_retrieval",
+     "longdialogue_qa_eng",
+     "longbook_sum_eng",
+     "longbook_choice_eng",
+     "longbook_qa_eng",
+     "longbook_qa_chn",
+     "math_find",
+     "math_calc",
+     "code_run",
+     "code_debug",
+ ]
+
+ if __name__ == "__main__":
+     args = parse_args()
+     # args.task for the task name in ALL_TASKS
+     # args.pxout_txt for the predicted output file
+     # args.pxref_json for the test.json reference file
+
+     if args.task is None or args.task not in ALL_TASKS:
+         raise ValueError('Error: task name [{}] is None or not in {}'.format(args.task, ALL_TASKS))
+
+     if args.pxout_txt is None or not Path(args.pxout_txt).exists():
+         raise ValueError('Error: system prediction file [{}] is None or does not exist.'.format(args.pxout_txt))
+
+     if args.pxref_json is None or not Path(args.pxref_json).exists():
+         raise ValueError('Error: system reference file [{}] is None or does not exist.'.format(args.pxref_json))
+
+     if args.sep_by_assistant and is_sep_by_assistant(args.pxout_txt):
+         outs = load_out_sep_by_assistant(args.pxout_txt)
+     else:
+         outs = load_out(args.pxout_txt)
+
+     refs = load_ref(args.pxref_json, args.task)
+
+     # determine the output json file name:
+     if args.pxout_ref_json is None:
+         flag = str(datetime.now()).replace(' ', '-').replace(':', '-')
+         output_path = args.pxout_txt + '.' + flag + '.json'
+     else:
+         output_path = args.pxout_ref_json
+
+     print('combine tst.out and ref, output to file: {}'.format(output_path))
+
+     # combine tst.out and ref into <id, prediction, ground_truth> records for the next scoring step:
+     combine_to_infb(outs, refs, output_path)
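The script above writes one JSON record per line, pairing each prediction with its reference list; these are exactly the "prediction"/"ground_truth" keys that `get_preds`/`get_labels` in compute_scores_2sets.py look for. A hypothetical record, with field values invented purely for illustration:

```
import json

# One line of the pxout_ref_json JSONL file; note "ground_truth" stays a list.
record = {
    "id": 0,
    "prediction": "The butler wrote the letter.",
    "ground_truth": ["the butler"],
}
print(json.dumps(record, ensure_ascii=False))
```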
evaluation/infinite_bench_eval/prompt.py ADDED
@@ -0,0 +1,67 @@
+ gpt4_templates = {
+     "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n\n{input}",  # noqa
+     "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}",  # noqa
+     "kv_retrieval": "Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n\n{input}",  # noqa
+     # "longbook_sum_eng": "Summarize the book below:\n\n{context}",  # noqa
+     "longbook_qa_eng": "Read the book below and answer a question.\n\n{context}\n\nQuestion: {question}\n\nBe very concise.",  # noqa
+     "longbook_choice_eng": "Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}",  # noqa
+     "longbook_sum_eng": "Summarize the following book.\n\n{context}",  # noqa
+     "longbook_qa_chn": "请根据以下书籍回答我的问题。\n\n{context}\n\n问题:{question}\n请尽量简短地回答。",  # noqa
+     "math_find": "{prefix}\n\n{context}\n\n{input}",
+     "math_calc": "Compute the intermediate values in the following long expression.\n\n{context}",  # noqa
+     "code_run": "Following is a set of Python functions. There is a function called named {func}.\n\n{context}\n\nPlease give me the exact number of the return value of {func_call}. Be concise. Your response must end with the final returned value.",  # noqa
+     "code_debug": "There is ONLY ONE function in the large project that is deliberately made to include an obvious error. Please find the function that contains the most obvious errors. I will give you four options to narrow your scope. You can inspect the options and think. Eventually, tell me the answer using one single letter (A, B, C, or D).\n\n{context}\n\nWhich funtion has deliberate error?\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nYou should first find the functions in the options. Repeat their content, inspect through code, and at last give me your answer for the function that has the deliberate and obvious error in A, B, C, or D.",  # noqa
+     "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe dialogue:\n\n---\n\n{context}\n\n---\n\nEnd of dialogue.\n\nWhich character is most likely \"$$MASK$$\"? Just say the name used by the scriptwriter (before the colon marks) of one single character and nothing else.",  # noqa
+ }
+
+ yarn_mistral_templates = {
+     "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize it. I will quiz you about the important information.\n\n{context}\n\n{input}\n\nThe pass key is",  # noqa
+     "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n\n{input}\n\nThe sequence of digits is",  # noqa
+     "kv_retrieval": "Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n\n{input}",  # noqa
+     "longbook_sum_eng": "Summarize the book below.\n\n{context}\n\nSummary:",  # noqa
+     "longbook_choice_eng": "Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nThe letter of the correct answer is",  # noqa
+     "longbook_qa_eng": "Read the book and answer the question. Be very concise in your answer.\n\n{context}\n\nQuestion: {question}\nAnswer:",  # noqa
+     "longbook_qa_chn": "阅读以下书籍然后回答问题。\n\n{context}\n\n问题:{question}\n答案:",  # noqa
+     "math_find": "{prefix}\n\n{context}\n\n{input}",
+     "math_calc": "Let us calculate the intermediate values of an expression.\n\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",  # noqa
+     "code_run": "There is a function called {func} in the following Python code.\n\n{context}\n\nPlease compute the exact value of {func_call}. The value of {func_call} is",  # noqa
+     "code_debug": "Following is a Python code where exactly one of the functions/methods has a deliberate error that makes it crash.\n\n{context}\n\nOptions:\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nThe correct option is:",  # noqa
+     "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\n{context}\n\nThe name that has been replaced with $$MASK$$ is likely",  # noqa
+ }
+
+ claude2_templates = {
+     "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n{input}\nThe pass key is",
+     "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}\nThe sequence of digits is",  # noqa
+     "kv_retrieval": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}",
+     "longbook_sum_eng": "Summarize the following book.\n\n{context}",  # noqa
+     "longbook_choice_eng": "Read the book and answer the question.\n\n{context}\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}",  # noqa
+     "longbook_qa_eng": "Read the novel below and answer a question:\n\n{context}\n\n{question}\nPlease answer as short as possible. The answer is: ",  # noqa
+     "longbook_qa_chn": "请根据以下书籍回答我的问题。\n\n{context}\n\n问题:{question}\n请尽量简短地回答。",  # noqa
+     "math_find": "{prefix}\n\n{context}\n\n{input}",
+     "math_calc": "Let us calculate the intermediate values of an expression.\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",  # noqa
+     "code_run": "In the file functions_module.py, there is a function called ${func}.\n\n\nHere is the content of functions_module.py:\n{context}\n\nPlease give me the exact number of the return value of {func_call}. Your response should end with the sentence \'The return value is:\'.",  # noqa
+     "code_debug": "There is ONLY ONE function in the large project that is deliberately made to include an obvious error. Please find the function that contains the most obvious errors. I will give you four options to narrow your scope. You can inspect through the options and think. Eventually, tell me the answer using one single letter (A, B, C, or D).\n\n{context}\n\nWhich funtion has deliberate error?\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}\n\nYou should first find the functions in the options. Repeat their content, inspect through code, and at last give me your answer for the function that has the deliberate and obvious error in A, B, C, or D.",  # noqa
+     "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe dialogue:\n\n---\n\n{context}\n\n---\n\nEnd of dialogue.\n\nWhich character is most likely \"$$MASK$$\"? Just say the name used by the scriptwriter (before the colon marks) of one single character and nothing else.",  # noqa
+ }
+
+ kimi_templates = {
+     "passkey": "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.\n\n{context}\n{input}\nThe pass key is",  # noqa
+     "number_string": "There is an important info hidden inside a lot of irrelevant text. Find it. I will quiz you about the important information there.\n\n{context}\n{input}\nThe sequence of digits is",  # noqa
+     "kv_retrieval": "Extract the value corresponding to the specified key in the JSON object below.\n\n{context}\n{input}",  # noqa
+     #"longbook_sum_eng": "Summarize the book below:\n\n{file:{context}}",  # noqa
+     "longbook_sum_eng": "Summarize the book below:\n\n{context}",  # noqa
+     #"longbook_choice_eng": "Read the book and answer the question.\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}" + "{file:{document}}",  # noqa
+     "longbook_choice_eng": "Read the book and answer the question.\n\nQuestion: {question}\n\nOnly one of the following options is correct, tell me the answer using one single letter (A, B, C, or D). Don't say anything else.\nA. {OPTION_A}\nB. {OPTION_B}\nC. {OPTION_C}\nD. {OPTION_D}" + "{context}",  # noqa
+     #"longbook_qa_eng": "Read the book below and answer a question.\n\nQuestion: {question}\n\nBe very concise." + "{file:{context}}",  # noqa
+     "longbook_qa_eng": "Read the book below and answer a question.\n\nQuestion: {question}\n\nBe very concise." + "{context}",  # noqa
+     #"longbook_qa_chn": "阅读以下书籍然后回答问题。\n\n问题:{question}\n答案:" + "{file:{context}}",  # noqa
+     "longbook_qa_chn": "阅读以下书籍然后回答问题。\n\n问题:{question}\n答案:" + "{context}",  # noqa
+     "math_find": "{prefix}\n\n{context}\n\n{input}",
+     "math_calc": "Let us calculate the intermediate values of an expression.\nExpression: 1 + 3 + 4\nValues: [1, 4, 8]\n\nExpression: 8 - 3 + 2 - 4\nValues: [8, 5, 7, 3]\n\nExpression: {context}\nValues:",  # noqa
+     "code_run": "In the file functions_module.py, there is a function called ${func}.\n\n\nHere is the content of functions_module.py:\n\nPlease give me the exact number of the return value of ${func_call}. Your response should end with the sentence 'The return value is:'." + "{context}",  # noqa
+     "code_debug": "Below is a code repository where there is one single function with bugs that causes an error. Please tell me the name of that function.\nWhich function has bugs? Give me the final answer in this format: \"[FINAL ANSWER: XXX]\". Don't say anything else." + "{context}",  # noqa
+     #"code_debug": "Below is a code repository where there is one single function with bugs that causes an error. Please tell me the name of that function.\nWhich function has bugs? Give me the final answer in this format: \"[FINAL ANSWER: XXX]\". Don't say anything else." + "{fcontext}",  # noqa
+     # "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is.\n\nThe name that has been replaced with $$MASK$$ is likely" + "{context}",  # noqa
+     "longdialogue_qa_eng": "Below is a dialogue script where one random occurrence of a character name is replaced with \"$$MASK$$\", and you should try to guess who that character is. Give me the answer using the name before the colons, don't say anything else.\n\n{context}",  # noqa
+ }
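These templates are ordinary Python format strings, so preparing a prompt is a single `.format` call with the fields each task provides (context, input/question, options, and so on), as `create_prompt` in eval_utils.py does. A small sketch with made-up values (real ones come from the task JSONL files):

```
from prompt import gpt4_templates

# Hypothetical context and question, for illustration only.
prompt = gpt4_templates["longbook_qa_eng"].format(
    context="<full book text goes here>",
    question="Who wrote the letter?",
)
print(prompt[:80])
```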
evaluation/infinite_bench_eval/requirements.txt ADDED
@@ -0,0 +1,12 @@
+ openai
+ tiktoken
+ #torch==1.13
+ #transformers==4.35.0
+ #accelerate==0.20.3
+ evaluate
+ #==0.4.1
+ xopen
+ jieba
+ rouge
+ nltk
+ rouge_score
evaluation/infinite_bench_eval/test_vllm.sh ADDED
@@ -0,0 +1,64 @@
+ #!/bin/bash
+ #########################################################################
+ # File Name: test_vllm.sh
+ # Author: Xianchao Wu
+ # mail: xianchaow@nvidia.com
+ # Created Time: Tue Jun 11 08:14:18 2024
+ #########################################################################
+
+ datetime=$(date +%Y%m%d)
+ echo ${datetime}
+
+ model_path="" # Your_model_path https://huggingface.co/nvidia/Llama3-ChatQA-2-70B/
+ indir=${model_path}/outputs
+ data_home="" # https://huggingface.co/nvidia/Llama3-ChatQA-2-70B/tree/main/data
+
+ function longbook_eng_eval(){
+     data_name=$1
+     pxout_txt=$2
+     pxref_json=$3
+
+     task_name="${data_name}_eng"
+
+     pxout_ref_json="${pxout_txt}.${datetime}.json"
+
+     python3 prepare_json_from_pxout.py --task ${task_name} \
+         --pxout_txt ${pxout_txt} \
+         --pxref_json ${pxref_json} \
+         --pxout_ref_json ${pxout_ref_json} \
+         --sep_by_assistant
+
+     if [[ $task_name =~ "longbook_choice" ]]
+     then
+         python3 compute_scores_2sets.py \
+             --task ${task_name} \
+             --pxout_ref_json ${pxout_ref_json} \
+             --model_name pxlong \
+             --use_zero_scrolls
+     else
+         python3 compute_scores_2sets.py \
+             --task ${task_name} \
+             --pxout_ref_json ${pxout_ref_json} \
+             --model_name pxlong
+     fi
+ }
+
+ for afile in `ls $indir/long*.txt`
+ do
+     echo $afile
+     for data_name in "longbook_qa" "longbook_choice" "longbook_sum" "longdialogue_qa"
+     do
+         task_name="${data_name}_eng"
+         if [[ $afile =~ $task_name ]]
+         then
+             echo "do $task_name for $afile"
+             pxref_json="${data_home}/${task_name}_gpt4_same/test.json"
+             echo ${data_name} ${afile} ${pxref_json}
+             longbook_eng_eval ${data_name} ${afile} ${pxref_json}
+             echo "--------"
+         fi
+     done
+ done