NetsPresso_QA / make_qrels_cast22.py
geonmin-kim's picture
Upload folder using huggingface_hub
d6585f5
import json
from tqdm import tqdm
# Constant second column of every qrels line (kept for TREC format legacy).
QUERY_TYPE = "Q0"

# Relevance label written for every (query, passage) pair emitted below.
RELEVANCE_SCORE = 1

# Only passage ids containing one of these markers are written to the qrels;
# WAPO is intentionally left out (disabled).
PASSAGE_ID_VALID_PREFIX = [
    "MARCO",
    "KILT",
]
### read: CAsT'22 eval json
eval_json_path = "/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"

# Read the whole evaluation file into memory. The encoding is given explicitly
# so that decoding does not depend on the locale's default encoding.
with open(eval_json_path, 'r', encoding="utf-8") as fr:
    data = json.load(fr)
# write: qrels.txt, one relevance judgment per line, space-separated:
#   "{qid} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}"
# where qid is "{conversation number}_{turn number}".
eval_qrels_path = "/root/Corpus/CAsT22_eval_queries/cqr_qrels.txt"

# Pairs already written, in output order; used to filter out duplicate
# (qid, pid) pairs that exist in the evaluation file.
qid_pid_pair_list = []
# Same pairs as the list above, kept in a set so the duplicate check is O(1)
# instead of a linear scan of the list for every passage.
seen_qid_pid_pairs = set()

with open(eval_qrels_path, 'w', encoding="utf-8") as fw:
    for sample in tqdm(data):
        conv_id = sample['number']
        for turn in sample['turn']:
            turn_id = turn['number']
            q_id = f"{conv_id}_{turn_id}"

            # Turns without provenance contribute no judgments; report them.
            if "provenance" not in turn:
                print('no provenance for turn')
                print(turn)
                continue

            for passage_id in turn["provenance"]:
                # NOTE(review): this is a substring test, not a true prefix
                # test (str.startswith). Kept as-is so ids with stray leading
                # whitespace are still accepted -- confirm before tightening.
                if not any(valid_prefix in passage_id for valid_prefix in PASSAGE_ID_VALID_PREFIX):
                    print(f"exclude passage id: {passage_id}")
                    continue

                # Some ids contain stray spaces; strip them so the id stays a
                # single whitespace-free column in the output line.
                if ' ' in passage_id:
                    print(f"delete whitespace in passage_id: {passage_id}")
                    passage_id = passage_id.replace(' ', '')

                qid_pid_pair = f"{q_id}&{passage_id}"
                if qid_pid_pair in seen_qid_pid_pairs:
                    print(f"skip appending duplicate qid&pid pair: qid = {q_id}, p_id = {passage_id}")
                    continue

                seen_qid_pid_pairs.add(qid_pid_pair)
                qid_pid_pair_list.append(qid_pid_pair)
                fw.write(f"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}\n")