kixx
/

LastingBench

Model card Files Files and versions Community

LastingBench / utils /convert.py

kixx's picture

Upload 34 files

b1e25b1 verified 3 months ago

history blame contribute delete

2.13 kB

	import json
	import argparse

	def _load_mapping(mapping_paths):
	    """Build a question -> {"context", "answers"} lookup from mapping JSONL files.

	    Each non-blank line of every file is parsed as a JSON object. Objects
	    without a "question" key (or with a null question) are ignored. Answers
	    are read from "answers" if present, otherwise from "answer", and are
	    always stored as a list.

	    Args:
	        mapping_paths: Iterable of paths to mapping JSONL files, processed
	            in order.

	    Returns:
	        A dict keyed by question text. When the same question appears more
	        than once, the last occurrence wins.

	    Raises:
	        OSError: If a mapping file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    mapping = {}
	    for mp in mapping_paths:
	        with open(mp, 'r', encoding='utf-8') as f_map:
	            for line in f_map:
	                # Tolerate blank separator lines instead of crashing in json.loads.
	                if not line.strip():
	                    continue
	                obj = json.loads(line)
	                q = obj.get("question")
	                if q is None:
	                    continue
	                ctx = obj.get("context", "")
	                # Some files have an "answer" field, some have "answers".
	                raw_ans = obj.get("answers", obj.get("answer", []))
	                # Normalize a scalar answer to a one-element list.
	                ans = raw_ans if isinstance(raw_ans, list) else [raw_ans]
	                # Later files/lines overwrite earlier ones for the same question.
	                mapping[q] = {"context": ctx, "answers": ans}
	    return mapping


	def main(argv=None):
	    """Merge context/answers from mapping JSONL files into an original JSONL file.

	    For every record in the original file whose "input" value matches a
	    "question" in the mapping files, the record's "context" and "answers"
	    fields are replaced with the mapped values. All records (matched or not)
	    are written to the output file, one JSON object per line. Blank lines in
	    the inputs are skipped.

	    Args:
	        argv: Optional list of command-line arguments
	            (orig_path, out_path, mapping_path...). Defaults to None, in
	            which case argparse reads sys.argv[1:].

	    Raises:
	        OSError: If any input file cannot be read or the output cannot be written.
	        json.JSONDecodeError: If a non-blank input line is not valid JSON.
	    """
	    parser = argparse.ArgumentParser(description='Convert and merge JSONL files with question-answer mappings')
	    parser.add_argument('orig_path', help='Path to the original JSONL file')
	    parser.add_argument('out_path', help='Path to the output JSONL file')
	    parser.add_argument('mapping_paths', nargs='+', help='Path(s) to mapping JSONL file(s)')

	    args = parser.parse_args(argv)

	    orig_path = args.orig_path
	    out_path = args.out_path

	    # Step 1: Build question -> {context, answers} mapping
	    mapping = _load_mapping(args.mapping_paths)

	    # Step 2: Read original file, perform replacement and write output
	    with open(orig_path, 'r', encoding='utf-8') as f_in, \
	            open(out_path, 'w', encoding='utf-8') as f_out:
	        for line in f_in:
	            # Skip blank lines rather than failing on them.
	            if not line.strip():
	                continue
	            item = json.loads(line)
	            inp = item.get("input")
	            if inp in mapping:
	                item["context"] = mapping[inp]["context"]
	                item["answers"] = mapping[inp]["answers"]
	            f_out.write(json.dumps(item, ensure_ascii=False) + "\n")

	    print(f"Merge completed, output file: {out_path}")

	# Run the merge only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()