# NOTE: residue from the hosting page header ("Spaces: Runtime error"); not part of the script.
"""Build a qrels file from the CAsT22 evaluation JSON.

Reads the evaluation JSON, walks every turn of every conversation and, for
each passage id listed under the turn's "provenance" key that matches one of
PASSAGE_ID_VALID_PREFIX, writes one line
"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}" to the qrels file.
Duplicate (q_id, passage_id) pairs are written only once.
"""
import json

from tqdm import tqdm

QUERY_TYPE = "Q0"  # TREC format legacy
RELEVANCE_SCORE = 1
PASSAGE_ID_VALID_PREFIX = ["MARCO", "KILT"]  # disable WAPO

### read: cast 22 eval json
eval_json_path = "/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"

# read data
with open(eval_json_path, 'r', encoding="utf-8") as fr:
    data = json.load(fr)

# write: qrels.txt
# (line format: "{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}", space-separated)
eval_qrels_path = "/root/Corpus/CAsT22_eval_queries/cqr_qrels.txt"

# Ordered record of every pair written; kept as a list under its original
# name in case code beyond this section reads it.
qid_pid_pair_list = []  # filter out duplicate pair exists in evaluation file
# Companion set so the duplicate test is O(1) instead of a list scan.
seen_qid_pid_pairs = set()

with open(eval_qrels_path, 'w', encoding="utf-8") as fw:
    for sample in tqdm(data):
        conv_id = sample['number']
        for turn in sample['turn']:
            turn_id = turn['number']
            q_id = f"{conv_id}_{turn_id}"

            if "provenance" not in turn:
                print('no provenance for turn')
                print(turn)
                continue

            for passage_id in turn["provenance"]:
                # NOTE(review): this is a substring test, not a strict
                # starts-with test, despite the constant's name. Preserved
                # as-is; confirm against the data before tightening it.
                if not any(
                    valid_prefix in passage_id
                    for valid_prefix in PASSAGE_ID_VALID_PREFIX
                ):
                    print(f"exclude passage id: {passage_id}")
                    continue

                # Some ids contain stray spaces; strip them so the id is a
                # single whitespace-free token in the space-separated line.
                if ' ' in passage_id:
                    print(f"delete whitespace in passage_id: {passage_id}")
                    passage_id = passage_id.replace(' ', '')

                qid_pid_pair = f"{q_id}&{passage_id}"
                if qid_pid_pair in seen_qid_pid_pairs:
                    print(f"skip appending duplicate qid&pid pair: qid = {q_id}, p_id = {passage_id}")
                    continue

                seen_qid_pid_pairs.add(qid_pid_pair)
                qid_pid_pair_list.append(qid_pid_pair)
                fw.write(f"{q_id} {QUERY_TYPE} {passage_id} {RELEVANCE_SCORE}\n")