#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This file converts a runfile containing passage ids from MS MARCO Passage v2 into a
# runfile containing doc ids from MS MARCO Doc v2. For each (query, document) pair, the
# score of the highest-scoring passage is used as the document score.
#
# Note that this file can only be used after running `build_passage_to_doc_id_map.py`
# under the same folder, which prepares the *.idmap.tsv files.
#
# Usage (the --id-map directory is the one generated by build_passage_to_doc_id_map.py):
#   python scripts/msmarco_v2/convert_passage_run_to_doc_run.py \
#       --input runs/run.mspsg.dev.txt \
#       --id-map /path/to/id_map_dir \
#       --output runs/run.msdoc-converted-from-psg.dev.txt

import os
import argparse
from collections import defaultdict

# File-name suffix of the per-shard passage-to-document id map files.
ID_MAP_SUFFIX = ".idmap.tsv"


def load_id_map_from_file(id_map_fn):
    """Load one passage-id -> doc-id mapping from a two-column TSV file.

    Args:
        id_map_fn: path to a file whose non-blank lines are `<psgid>\\t<docid>`.

    Returns:
        A dict mapping each passage id to its document id.

    Raises:
        ValueError: if a non-blank line does not contain exactly two
            tab-separated fields.
    """
    psgid2docid = {}
    with open(id_map_fn, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing empty lines instead of crashing on unpack.
                continue
            psgid, docid = line.split("\t")
            psgid2docid[psgid] = docid
    return psgid2docid


def load_id_map_from_dir(id_map_dir):
    """Load every `*.idmap.tsv` file found directly inside a directory.

    Entries that do not end with `.idmap.tsv` (stray files, sub-directories, ...)
    are ignored rather than parsed.

    Args:
        id_map_dir: directory holding the `*.idmap.tsv` files.

    Returns:
        A dict keyed by file name with the `.idmap.tsv` suffix removed; each value
        is the passage-id -> doc-id dict loaded from that file.
    """
    return {
        # Strip the suffix only at the end of the name, never in the middle.
        fn[: -len(ID_MAP_SUFFIX)]: load_id_map_from_file(os.path.join(id_map_dir, fn))
        for fn in os.listdir(id_map_dir)
        if fn.endswith(ID_MAP_SUFFIX)
    }


def main(args):
    """Convert a passage-level runfile into a document-level runfile.

    Args:
        args: object with the attributes `input` (passage runfile path),
            `id_map` (directory of `*.idmap.tsv` files) and `output`
            (path of the document runfile to write).

    Raises:
        KeyError: if a passage id from the runfile is missing from the id map.
        ValueError: if a non-blank runfile line does not have six
            whitespace-separated fields, or a query id is not an integer.
    """
    input_runfile, output_runfile = args.input, args.output
    id_map = load_id_map_from_dir(args.id_map)

    # qid -> {docid -> best passage score seen so far}
    docid_runs = defaultdict(dict)
    with open(input_runfile, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on unpack.
                continue
            # Columns: qid, Q0, passage id, rank, score, run tag.
            qid, _, psgid, _, score, _ = fields
            score = float(score)

            # The id-map key is the passage id with its last "_"-separated part removed.
            psg_fn = psgid.rpartition("_")[0]
            docid = id_map[psg_fn][psgid]

            doc_scores = docid_runs[qid]
            doc_scores[docid] = max(score, doc_scores.get(docid, score))

    with open(output_runfile, "w", encoding="utf-8") as f:
        # Queries are emitted in ascending numeric order of their id.
        for qid in sorted(docid_runs, key=int):
            ranked = sorted(docid_runs[qid].items(), key=lambda kv: kv[1], reverse=True)
            # NOTE(review): ranks are written 0-based, as this script has always done;
            # confirm whether downstream consumers expect 1-based ranks.
            for rank, (docid, score) in enumerate(ranked):
                f.write(f"{qid} Q0 {docid} {rank} {score} convert-from-passage-v2\n")

    print("finished")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert runfile that contain psg id into runfile that contain doc id."
    )
    parser.add_argument("--input", type=str, required=True, help="path to msmarco passage runfile.")
    parser.add_argument(
        "--id-map",
        type=str,
        required=True,
        help="directory that contains msmarco passage-doc id mapping .tsv files. "
        "Generated by `build_passage_to_doc_id_map.py` under the same directory.",
    )
    parser.add_argument("--output", type=str, required=True, help="output path to store document id runfile.")

    args = parser.parse_args()

    main(args)