# NOTE: page-scrape residue, not part of the script -- "Spaces: Runtime error"
# dedup script shared by Sean MacAvaney on TREC Slack workspace, 2022/10/28
# usage: python dedup.py msmarco-v2-passage-neardupes.txt.gz myrun1 myrun2 ...
# outputs myrun1.dedup, myrun2.dedup, etc.
import argparse
import gzip
from collections import Counter


def _load_equiv_map(dupefile):
    """Read the gzipped near-duplicate file into a doc-id -> class-id map.

    Each non-blank line is split on whitespace; the first column is taken as
    the equivalence-class id and the second as a document id in that class.
    Lines where both ids are equal are not stored, so a lookup miss means
    "keep the document id unchanged".

    Args:
        dupefile: Path to the gzip-compressed text file.

    Returns:
        dict mapping each doc id to its (different) class id.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
    """
    equiv_map = {}
    # Context manager so the gzip handle is closed deterministically.
    with gzip.open(dupefile, 'rt') as fin:
        for line in fin:
            cols = line.split()
            if not cols:
                continue  # tolerate blank lines (e.g. a trailing newline)
            class_id, doc_id = cols[0], cols[1]
            if class_id != doc_id:
                equiv_map[doc_id] = class_id
    return equiv_map


def _dedup_run(run, equiv_map):
    """Write ``run + '.dedup'`` with one line per (query id, class id) pair.

    Input lines are expected to have six whitespace-separated columns:
    ``qid q0 docid rank score runid``. For every line the doc id is mapped
    to its class id via ``equiv_map``; only the first line seen for each
    (qid, class id) pair is kept. Kept lines are written with the class id
    in place of the doc id and with the rank replaced by a per-query counter
    that starts at 0. The input rank column is ignored.

    Args:
        run: Path to the run file to read.
        equiv_map: Mapping returned by ``_load_equiv_map``.

    Raises:
        ValueError: If a non-blank line does not have exactly six columns.
    """
    seen = set()
    qid_count = Counter()
    with open(run, 'rt') as fin, open(run + '.dedup', 'wt') as fout:
        for lineno, line in enumerate(fin, 1):
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            if len(fields) != 6:
                raise ValueError(
                    '{}:{}: expected 6 columns, got {}'.format(
                        run, lineno, len(fields)))
            qid, q0, did, _rank, score, runid = fields
            class_id = equiv_map.get(did, did)
            key = (qid, class_id)
            if key in seen:
                continue
            seen.add(key)
            new_rank = str(qid_count[qid])
            fout.write(
                ' '.join([qid, q0, class_id, new_rank, score, runid]) + '\n')
            qid_count[qid] += 1


def main(argv=None):
    """Command-line entry point: ``dedup.py DUPEFILE RUN [RUN ...]``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dupefile')
    parser.add_argument('runs', nargs='+')
    args = parser.parse_args(argv)

    # The map is loaded once and shared by every run file.
    equiv_map = _load_equiv_map(args.dupefile)
    for run in args.runs:
        _dedup_run(run, equiv_map)


if __name__ == '__main__':
    main()