root
add infinitebench evaluation
a550e38
from pathlib import Path
import json
import time
from datetime import datetime
from args import parse_args
from eval_utils import dump_jsonl, get_answer
def load_out_sep_by_assistant(pxout_txt_fn):
    """Load predictions from a file where each answer starts with 'assistant: '.

    Every line is stripped of surrounding whitespace. A line that begins with
    the 'assistant: ' marker opens a new prediction; every other line is
    appended to the prediction currently being built, joined by a newline.
    Lines that appear before the first marker are collected into a prediction
    of their own (each one prefixed by a newline).

    An empty answer is kept as '' instead of being dropped, so that the
    returned list stays aligned one-to-one with the markers in the file.
    Because lines are stripped, an empty answer shows up as a bare
    'assistant:' line, which is recognised as a marker as well.

    Args:
        pxout_txt_fn: Path of the prediction text file.

    Returns:
        A list of prediction strings in file order.
    """
    marker = 'assistant: '
    # After strip() an empty answer line loses the trailing space of the marker.
    bare_marker = marker.rstrip()

    predictions = []
    # None means "no prediction opened yet"; '' is a valid (empty) prediction.
    current = None
    with open(pxout_txt_fn, 'r') as f:
        for line in f:
            line = line.strip()
            if line == bare_marker or line.startswith(marker):
                if current is not None:
                    predictions.append(current)
                # Slicing past the end of the bare marker yields ''.
                current = line[len(marker):]
            elif current is None:
                current = '\n' + line
            else:
                current += '\n' + line
    if current is not None:
        predictions.append(current)
    return predictions
def load_out(pxout_txt_fn):
    """Read a prediction file that holds one prediction per line.

    Args:
        pxout_txt_fn: Path of the prediction text file.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace. Blank lines are kept as empty strings.
    """
    with open(pxout_txt_fn) as reader:
        return [raw_line.strip() for raw_line in reader]
def load_ref_longbook_qa_eng(pxref_json_fn):
    """Collect the 'answers' field of every sample in a JSON reference file.

    Args:
        pxref_json_fn: Path of a JSON file whose top-level value is iterated
            sample by sample; each sample must provide an 'answers' key.

    Returns:
        A list holding each sample's 'answers' value, unmodified and in file
        order.
    """
    with open(pxref_json_fn) as reader:
        samples = json.load(reader)
    # NOTE: each entry is stored exactly as found -- keep it as a list!
    return [sample['answers'] for sample in samples]
def load_ref_longbook_choice_eng(pxref_json_fn, task):
    """Load references for the multiple-choice task from a JSON file.

    Each sample gets two extra keys before it is handed to ``get_answer``:
    'options' (a copy of 'multichoice_options') and 'answer' (a copy of
    'answers'). Presumably these are the key names ``get_answer`` reads --
    confirm against eval_utils.

    Args:
        pxref_json_fn: Path of the JSON reference file.
        task: Task name, forwarded unchanged to ``get_answer``.

    Returns:
        A list with the value returned by ``get_answer`` for every sample,
        in file order.
    """
    with open(pxref_json_fn) as reader:
        samples = json.load(reader)

    references = []
    for sample in samples:
        sample['options'] = sample['multichoice_options']
        sample['answer'] = sample['answers']
        references.append(get_answer(sample, task))
    return references
def load_ref_sets7(ref_jsonl_fn):
    """Load the 'answer' field of every record in a JSON-lines reference file.

    Whitespace-only lines are skipped so that a stray blank line does not
    abort loading with a JSON decode error.

    Args:
        ref_jsonl_fn: Path of a file with one JSON object per line; every
            object must provide an 'answer' key.

    Returns:
        A list holding each record's 'answer' value, in file order.
    """
    refs = []
    with open(ref_jsonl_fn, 'r') as reader:
        for raw_line in reader:
            if not raw_line.strip():
                # Blank lines carry no record; json.loads would raise on them.
                continue
            refs.append(json.loads(raw_line)['answer'])
    return refs
def load_ref(pxref_json_fn, task):
    """Pick the reference loader that matches ``task`` and run it.

    Args:
        pxref_json_fn: Path of the reference file.
        task: Task name used to select the loader.

    Returns:
        The list of references produced by the selected loader. Task names
        not listed below fall through to the JSON-lines loader rather than
        raising.
    """
    answers_list_tasks = (
        'longbook_qa_eng',
        "longbook_sum_eng",
        "longdialogue_qa_eng",
        'longbook_qa_eng_ret',
    )
    multichoice_tasks = ('longbook_choice_eng', 'longbook_choice_eng_ret')

    if task in answers_list_tasks:
        return load_ref_longbook_qa_eng(pxref_json_fn)
    if task in multichoice_tasks:
        return load_ref_longbook_choice_eng(pxref_json_fn, task)
    return load_ref_sets7(pxref_json_fn)
def combine_to_infb(outs, refs, output_path):
    """Pair predictions with references and dump them as JSON lines.

    When the two inputs differ in length a warning is printed and the longer
    one is truncated to the length of the shorter one. Each written record
    has the keys 'id' (0-based position), 'prediction' and 'ground_truth'.

    Args:
        outs: Sequence of predictions.
        refs: Sequence of references, positionally aligned with ``outs``.
        output_path: Destination passed to ``dump_jsonl``.

    Returns:
        ``output_path``, so the caller can hand the written file name to the
        next step (the function previously returned nothing even though its
        caller stores the result).
    """
    n_outs, n_refs = len(outs), len(refs)
    if n_outs < n_refs:
        print("Warning: {} lines in prediction, less than {} lines in ref".format(n_outs, n_refs))
    elif n_refs < n_outs:
        print("Warning: {} lines in prediction, larger than {} lines in ref".format(n_outs, n_refs))

    # zip() stops at the shorter input, which performs the truncation.
    preds = [
        {
            "id": idx,
            "prediction": out,
            "ground_truth": ref,  # TODO must be a list, not a str!
        }
        for idx, (out, ref) in enumerate(zip(outs, refs))
    ]
    dump_jsonl(preds, output_path)
    print('done. saved id-pred-ref to {}'.format(output_path))
    return output_path
def is_sep_by_assistant(testout_txt_fn):
    """Tell whether a prediction file uses 'assistant: ' line prefixes.

    Lines are tested as read, without stripping, so the marker must sit at
    the very start of a line. Scanning stops at the first match.

    Args:
        testout_txt_fn: Path of the prediction text file.

    Returns:
        True if at least one line starts with 'assistant: ', else False.
    """
    with open(testout_txt_fn) as reader:
        return any(raw_line.startswith('assistant: ') for raw_line in reader)
# Task names accepted by the command-line entry point below; args.task is
# validated against this list.
# NOTE(review): load_ref also handles 'longbook_qa_eng_ret' and
# 'longbook_choice_eng_ret', which are not listed here, so the entry point
# rejects them -- confirm whether that is intended.
ALL_TASKS = [
"passkey",
"number_string",
"kv_retrieval",
"longdialogue_qa_eng",
"longbook_sum_eng",
"longbook_choice_eng",
"longbook_qa_eng",
"longbook_qa_chn",
"math_find",
"math_calc",
"code_run",
"code_debug",
]
if __name__ == "__main__":
    # Command-line entry point: pair a prediction file with its reference
    # file and write the combined <id, prediction, ground_truth> records.
    args = parse_args()
    # args.task            task name, must be one of ALL_TASKS
    # args.pxout_txt       predicted output file
    # args.pxref_json      test.json reference file
    # args.pxout_ref_json  optional path of the combined output file
    # args.sep_by_assistant  whether to look for 'assistant: ' separators

    # Raising a plain string is itself a TypeError in Python 3, which hides
    # the intended message; raise real exception types instead.
    if args.task is None or args.task not in ALL_TASKS:
        raise ValueError('Error: task name [{}] is None or not in {}'.format(args.task, ALL_TASKS))
    if args.pxout_txt is None or not Path(args.pxout_txt).exists():
        raise FileNotFoundError('Error: system prediction file [{}] is None or not exists.'.format(args.pxout_txt))
    if args.pxref_json is None or not Path(args.pxref_json).exists():
        raise FileNotFoundError('Error: system reference file [{}] is None or not exists.'.format(args.pxref_json))

    # Use the separator-aware loader only when it was requested AND the file
    # actually contains 'assistant: ' lines; otherwise read line by line.
    if args.sep_by_assistant and is_sep_by_assistant(args.pxout_txt):
        outs = load_out_sep_by_assistant(args.pxout_txt)
    else:
        outs = load_out(args.pxout_txt)

    refs = load_ref(args.pxref_json, args.task)

    # determine the output json file name:
    if args.pxout_ref_json is None:
        # Timestamp suffix with spaces and colons replaced by dashes.
        flag = str(datetime.now()).replace(' ', '-').replace(':', '-')
        output_path = args.pxout_txt + '.' + flag + '.json'
    else:
        output_path = args.pxout_ref_json
    print('combine tst.out and ref, output to file: {}'.format(output_path))

    # combine tst.out and ref to <ref, test.out> for next step scoring:
    infb_json_fn = combine_to_infb(outs, refs, output_path)