# File size: 5,445 Bytes
#
# a550e38

from pathlib import Path
import json
import time
from datetime import datetime

from args import parse_args
from eval_utils import dump_jsonl, get_answer

def load_out_sep_by_assistant(pxout_txt_fn):
    """Load multi-line predictions delimited by ``assistant: `` markers.

    Every line whose (left-stripped) text starts with ``'assistant: '`` opens
    a new prediction; the text after the marker is the first line of that
    prediction. Every other line is stripped and appended to the current
    prediction, joined with ``'\\n'``.

    Empty predictions are kept (as ``''``) so that the i-th prediction stays
    aligned with the i-th reference when the two lists are later paired by
    index.

    Args:
        pxout_txt_fn: path of the prediction text file.

    Returns:
        list of str, one entry per prediction, in file order. Lines that
        appear before the first marker are collected into a leading entry of
        their own.
    """
    marker = 'assistant: '
    predictions = []
    # None means "no prediction opened yet"; '' is a valid (empty) prediction.
    current = None
    with open(pxout_txt_fn, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Test the marker before the trailing space is stripped away,
            # otherwise an empty answer ("assistant: " + newline) would be
            # mistaken for a continuation of the previous answer.
            if raw_line.lstrip().startswith(marker):
                if current is not None:
                    predictions.append(current)
                # For an empty answer `line` is just 'assistant:', and the
                # slice yields ''.
                current = line[len(marker):]
            else:
                if current is None:
                    current = ''
                current += '\n' + line
    if current is not None:
        predictions.append(current)
    return predictions

def load_out(pxout_txt_fn):
    """Read a prediction file where each line is one prediction.

    Args:
        pxout_txt_fn: path of the prediction text file.

    Returns:
        list of str: every line of the file with leading and trailing
        whitespace removed (blank lines are kept as empty strings).
    """
    with open(pxout_txt_fn) as handle:
        return [row.strip() for row in handle]

def load_ref_longbook_qa_eng(pxref_json_fn):
    """Collect the ``'answers'`` field of every sample in a JSON reference file.

    Args:
        pxref_json_fn: path of a JSON file whose top-level value is iterated
            sample by sample; each sample must have an ``'answers'`` key.

    Returns:
        list with one entry per sample, holding that sample's ``'answers'``
        value unchanged.
    """
    with open(pxref_json_fn) as handle:
        samples = json.load(handle)
    # NOTE: each 'answers' value is passed through as-is (kept as a list).
    return [sample['answers'] for sample in samples]

def load_ref_longbook_choice_eng(pxref_json_fn, task):
    """Build references for the multiple-choice task from a JSON file.

    Each sample gets two extra keys before it is handed to ``get_answer``:
    ``'options'`` (copied from ``'multichoice_options'``) and ``'answer'``
    (copied from ``'answers'``) -- presumably the key names ``get_answer``
    reads; confirm against eval_utils.

    Args:
        pxref_json_fn: path of the JSON reference file.
        task: task name, forwarded to ``get_answer``.

    Returns:
        list with the ``get_answer(sample, task)`` result for every sample,
        in file order.
    """
    with open(pxref_json_fn) as handle:
        samples = json.load(handle)

    references = []
    for sample in samples:
        sample['options'] = sample['multichoice_options']
        sample['answer'] = sample['answers']
        references.append(get_answer(sample, task))
    return references

def load_ref_sets7(ref_jsonl_fn):
    """Collect the ``'answer'`` field from every record of a JSONL file.

    Whitespace-only lines (for example a trailing empty line) are skipped
    instead of being fed to the JSON parser, where they would raise
    ``json.JSONDecodeError``.

    Args:
        ref_jsonl_fn: path of a file with one JSON object per line; each
            object must have an ``'answer'`` key.

    Returns:
        list with the ``'answer'`` value of every record, in file order.
    """
    refs = []
    with open(ref_jsonl_fn, 'r') as br:
        for aline in br:
            if not aline.strip():
                # Blank lines carry no record.
                continue
            refs.append(json.loads(aline)['answer'])
    return refs

def load_ref(pxref_json_fn, task):
    """Pick the reference loader that matches ``task`` and run it.

    Args:
        pxref_json_fn: path of the reference file.
        task: task name.

    Returns:
        list of references, as produced by the selected loader. Any task that
        is neither a list-answer task nor a multiple-choice task falls back
        to the JSONL loader ``load_ref_sets7``.
    """
    list_answer_tasks = (
        'longbook_qa_eng',
        "longbook_sum_eng",
        "longdialogue_qa_eng",
        'longbook_qa_eng_ret',
    )
    choice_tasks = ('longbook_choice_eng', 'longbook_choice_eng_ret')

    if task in list_answer_tasks:
        return load_ref_longbook_qa_eng(pxref_json_fn)
    if task in choice_tasks:
        return load_ref_longbook_choice_eng(pxref_json_fn, task)
    # No error for unknown names: everything else is read as JSONL.
    return load_ref_sets7(pxref_json_fn)

def combine_to_infb(outs, refs, output_path):
    """Pair predictions with references by position and dump them as JSONL.

    When the two lists differ in length a warning is printed and only the
    first ``min(len(outs), len(refs))`` pairs are written.

    Args:
        outs: list of predictions.
        refs: list of references (each one should be a list, not a str).
        output_path: destination passed to ``dump_jsonl``.

    Returns:
        None. One record ``{"id", "prediction", "ground_truth"}`` per pair is
        handed to ``dump_jsonl``.
    """
    n_out, n_ref = len(outs), len(refs)
    n_pairs = min(n_out, n_ref)

    if n_out < n_ref:
        print("Warning: {} lines in prediction, less than {} lines in ref".format(n_out, n_ref))
    elif n_ref < n_out:
        print("Warning: {} lines in prediction, larger than {} lines in ref".format(n_out, n_ref))

    records = [
        {
            "id": idx,
            "prediction": outs[idx],
            "ground_truth": refs[idx],  # TODO must be a list, not a str!
        }
        for idx in range(n_pairs)
    ]
    dump_jsonl(records, output_path)
    print('done. saved id-pred-ref to {}'.format(output_path))

def is_sep_by_assistant(testout_txt_fn):
    """Tell whether a prediction file uses ``assistant: `` record markers.

    Args:
        testout_txt_fn: path of the prediction text file.

    Returns:
        True if at least one line begins (at column 0) with ``'assistant: '``,
        otherwise False.
    """
    with open(testout_txt_fn) as handle:
        return any(row.startswith('assistant: ') for row in handle)

# Task names accepted on the command line: the __main__ section below refuses
# to run when args.task is not one of these.
ALL_TASKS = [
    "passkey",
    "number_string",
    "kv_retrieval",
    "longdialogue_qa_eng",
    "longbook_sum_eng",
    "longbook_choice_eng",
    "longbook_qa_eng",
    "longbook_qa_chn",
    "math_find",
    "math_calc",
    "code_run",
    "code_debug",
]

if __name__ == "__main__":
    # Command-line entry point: load predictions and references, pair them by
    # position, and write the pairs to a JSONL file for the scoring step.
    #
    # Attributes read from parse_args():
    #   args.task              task name, must be one of ALL_TASKS
    #   args.pxout_txt         predicted output file
    #   args.pxref_json        reference file (test.json)
    #   args.sep_by_assistant  whether to try the 'assistant: ' record format
    #   args.pxout_ref_json    optional output path
    args = parse_args()

    # Validate the inputs up front. Real exception classes are required here:
    # raising a plain string fails with "TypeError: exceptions must derive
    # from BaseException" and hides the intended message.
    if args.task is None or args.task not in ALL_TASKS:
        raise ValueError('Error: task name [{}] is None or not in {}'.format(args.task, ALL_TASKS))

    if args.pxout_txt is None or not Path(args.pxout_txt).exists():
        raise FileNotFoundError('Error: system prediction file [{}] is None or not exists.'.format(args.pxout_txt))

    if args.pxref_json is None or not Path(args.pxref_json).exists():
        raise FileNotFoundError('Error: system reference file [{}] is None or not exists.'.format(args.pxref_json))

    # The marker-based loader is used only when it is requested AND the file
    # really contains an 'assistant: ' line; otherwise one line = one output.
    if args.sep_by_assistant and is_sep_by_assistant(args.pxout_txt):
        outs = load_out_sep_by_assistant(args.pxout_txt)
    else:
        outs = load_out(args.pxout_txt)

    refs = load_ref(args.pxref_json, args.task)

    # Determine the output json file name: default to the prediction path
    # plus a timestamp (spaces and colons replaced by '-').
    if args.pxout_ref_json is None:
        flag = str(datetime.now()).replace(' ', '-').replace(':', '-')
        output_path = args.pxout_txt + '.' + flag + '.json'
    else:
        output_path = args.pxout_ref_json

    print('combine tst.out and ref, output to file: {}'.format(output_path))

    # Combine tst.out and ref to <ref, test.out> for next step scoring.
    # combine_to_infb has no return value, so nothing is bound here.
    combine_to_infb(outs, refs, output_path)