"""Export a TREC-style qrels file from the CAsT22 evaluation JSON.

The script loads the evaluation JSON, walks every turn of every
conversation, and writes one qrels line per unique (query id, passage id)
pair taken from the turn's "provenance" list. Passage ids that contain none
of the markers in PASSAGE_ID_VALID_PREFIX are skipped.
"""
import json

from tqdm import tqdm

QUERY_TYPE = "Q0"  # TREC format legacy
RELEVANCE_SCORE = 1
PASSAGE_ID_VALID_PREFIX = ["MARCO", "KILT"]  # disable WAPO

### read: cast 22 eval json
eval_json_path = "/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"

# read data
with open(eval_json_path, 'r', encoding="utf-8") as fr:
    data = json.load(fr)

# write: qrels.txt
# (line format: "{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}")
eval_qrels_path = "/root/Corpus/CAsT22_eval_queries/cqr_qrels.txt"

# (q_id, passage_id) pairs already written; a set keeps the duplicate check
# O(1) and tuples avoid collisions a joined string key could have.
seen_qid_pid_pairs = set()

with open(eval_qrels_path, 'w', encoding="utf-8") as fw:
    for sample in tqdm(data):
        conv_id = sample['number']
        for turn in sample['turn']:
            turn_id = turn['number']
            q_id = f"{conv_id}_{turn_id}"

            if "provenance" not in turn:
                print('no provenance for turn')
                print(turn)
                continue

            for passage_id in turn["provenance"]:
                # NOTE(review): this is a substring match, not a strict
                # prefix match, despite the constant's name. Kept as-is
                # because ids may carry stray spaces that are only removed
                # below; confirm before tightening to str.startswith.
                if not any(
                    valid_prefix in passage_id
                    for valid_prefix in PASSAGE_ID_VALID_PREFIX
                ):
                    print(f"exclude passage id: {passage_id}")
                    continue

                if ' ' in passage_id:
                    print(f"delete whitespace in passage_id: {passage_id}")
                    passage_id = passage_id.replace(' ', '')

                # Deduplicate on the cleaned id so that variants differing
                # only by spaces count as the same pair.
                qid_pid_pair = (q_id, passage_id)
                if qid_pid_pair in seen_qid_pid_pairs:
                    print(f"skip appending duplicate qid&pid pair: qid = {q_id}, p_id = {passage_id}")
                    continue

                seen_qid_pid_pairs.add(qid_pid_pair)
                fw.write(f"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}\n")