# LastingBench / utils / convert.py
# kixx's picture
# Upload 34 files
# b1e25b1 verified
import argparse
import json
import os
def _iter_records(lines):
    """Yield the parsed JSON value of every non-blank line in *lines*.

    Blank or whitespace-only lines are skipped because ``json.loads`` raises
    ``JSONDecodeError`` on them, and they carry no record.
    """
    for line in lines:
        if line.strip():
            yield json.loads(line)


def _load_mapping(mapping_paths):
    """Build a ``question -> {"context", "answers"}`` dict from mapping files.

    Records without a "question" key (or with a null one) are ignored. The
    answer is read from "answers", falling back to "answer", and is always
    stored as a list. When a question occurs more than once, the last
    occurrence (in file order, then argument order) wins.
    """
    mapping = {}
    for path in mapping_paths:
        with open(path, 'r', encoding='utf-8') as f_map:
            for obj in _iter_records(f_map):
                question = obj.get("question")
                if question is None:
                    continue
                # Some files have an "answer" field, some have "answers".
                raw_ans = obj.get("answers", obj.get("answer", []))
                answers = raw_ans if isinstance(raw_ans, list) else [raw_ans]
                mapping[question] = {
                    "context": obj.get("context", ""),
                    "answers": answers,
                }
    return mapping


def _merge(orig_path, out_path, mapping):
    """Copy *orig_path* to *out_path*, overriding fields of matched records.

    A record matches when its "input" value is a key of *mapping*; its
    "context" and "answers" are then replaced. Other records are written
    through unchanged.
    """
    with open(orig_path, 'r', encoding='utf-8') as f_in, \
            open(out_path, 'w', encoding='utf-8') as f_out:
        for item in _iter_records(f_in):
            replacement = mapping.get(item.get("input"))
            if replacement is not None:
                item["context"] = replacement["context"]
                item["answers"] = replacement["answers"]
            f_out.write(json.dumps(item, ensure_ascii=False) + "\n")


def main(argv=None):
    """Merge question/answer mappings into a JSONL file.

    Args:
        argv: Command-line arguments (``orig_path out_path mapping_path...``).
            ``None`` (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: On invalid arguments, or when ``out_path`` resolves to the
            same file as ``orig_path``.
    """
    parser = argparse.ArgumentParser(description='Convert and merge JSONL files with question-answer mappings')
    parser.add_argument('orig_path', help='Path to the original JSONL file')
    parser.add_argument('out_path', help='Path to the output JSONL file')
    parser.add_argument('mapping_paths', nargs='+', help='Path(s) to mapping JSONL file(s)')
    args = parser.parse_args(argv)

    orig_path = args.orig_path
    out_path = args.out_path

    # Opening the output with 'w' truncates it; if it is the same file as the
    # input, every record would be destroyed before being read.
    if os.path.realpath(orig_path) == os.path.realpath(out_path):
        parser.error('out_path must be different from orig_path')

    # Step 1: Build question -> {context, answers} mapping
    mapping = _load_mapping(args.mapping_paths)

    # Step 2: Read original file, perform replacement and write output
    _merge(orig_path, out_path, mapping)
    print(f"Merge completed, output file: {out_path}")
# Run the merge only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()