#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Starting point for writing this script:
# https://github.com/jacklin64/pyserini/blob/msmarcov2/scripts/msmarco_v2/segment_docs.py

import argparse
import glob
import gzip
import json
import os
from multiprocessing import Pool, Manager

from tqdm import tqdm


def read_doc_corpus(f_ins, docid_to_title, docid_to_headings, docid_to_url, docid_to_pass_num):
    # Load document-level metadata (title, headings, url) keyed by docid.
    for f_in in f_ins:
        print('read {}...'.format(f_in))
        with gzip.open(f_in, 'rt', encoding='utf8') as in_fh:
            for json_string in tqdm(in_fh):
                doc = json.loads(json_string)
                docid = doc['docid']
                docid_to_title[docid] = doc['title']
                docid_to_headings[docid] = doc['headings']
                docid_to_url[docid] = doc['url']
                docid_to_pass_num[docid] = 0


def passage_corpus_to_tsv(f_ins, f_out, docid_to_title, docid_to_headings, docid_to_url, docid_to_pass_num):
    # Despite the legacy name, this writes Anserini-style jsonl: each passage is
    # augmented with the url, title, and headings of its parent document, and the
    # passage ids are written to a companion .id file.
    print('Output passages...')
    output = open(f_out, 'w')
    output_id = open(f_out.replace('.json', '.id'), 'w')
    max_len = 0
    total_len = 0
    counter = 0
    for f_in in f_ins:
        with gzip.open(f_in, 'rt', encoding='utf8') as in_fh:
            for json_string in tqdm(in_fh):
                output_dict = json.loads(json_string)
                docid = output_dict['docid']
                pid = output_dict['pid']
                passage_len = len(output_dict['passage'])
                docid_to_pass_num[docid] += 1
                total_len += passage_len
                if passage_len > max_len:
                    max_len = passage_len
                counter += 1
                output_dict['url'] = docid_to_url[docid]
                output_dict['title'] = docid_to_title[docid]
                output_dict['headings'] = docid_to_headings[docid]
                output.write(json.dumps(output_dict) + '\n')
                output_id.write(pid + '\n')
    print('maximum passage length: {}'.format(max_len))
    print('average passage length: {}'.format(total_len / counter))
    output.close()
    output_id.close()
    print('Done!')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Augment MS MARCO V2 passages with the url, title, and headings of their source documents.')
    parser.add_argument('--original_psg_path', required=True,
                        help='Directory containing the gzipped MS MARCO V2 passage corpus (*.gz).')
    parser.add_argument('--original_doc_path', required=True,
                        help='Directory containing the gzipped MS MARCO V2 document corpus (*.gz).')
    parser.add_argument('--output_psg_path', required=True,
                        help='Output directory for the augmented passages in the Anserini jsonl format.')
    parser.add_argument('--num_workers', default=1, type=int)
    args = parser.parse_args()

    os.makedirs(args.output_psg_path, exist_ok=True)

    doc_files = glob.glob(os.path.join(args.original_doc_path, '*.gz'))
    psg_files = glob.glob(os.path.join(args.original_psg_path, '*.gz'))

    manager = Manager()
    docid_to_title = manager.dict()
    docid_to_headings = manager.dict()
    docid_to_url = manager.dict()
    docid_to_pass_num = manager.dict()

    # Pass 1: read document metadata in parallel, one shard of files per worker.
    num_files = len(doc_files)
    pool = Pool(args.num_workers)
    num_files_per_worker = num_files // args.num_workers
    results = []
    for i in range(args.num_workers):
        if i == (args.num_workers - 1):
            file_list = doc_files[i * num_files_per_worker:]
        else:
            file_list = doc_files[i * num_files_per_worker:(i + 1) * num_files_per_worker]
        results.append(pool.apply_async(
            read_doc_corpus,
            (file_list, docid_to_title, docid_to_headings, docid_to_url, docid_to_pass_num)))
    pool.close()
    pool.join()
    for r in results:
        r.get()  # re-raise any exception that occurred in a worker
    print('Total document size: {}'.format(len(docid_to_title)))

    # Pass 2: augment passages in parallel; each worker writes its own output shard.
    num_files = len(psg_files)
    pool = Pool(args.num_workers)
    num_files_per_worker = num_files // args.num_workers
    results = []
    for i in range(args.num_workers):
        f_out = os.path.join(args.output_psg_path, 'psg' + str(i) + '.json')
        if i == (args.num_workers - 1):
            file_list = psg_files[i * num_files_per_worker:]
        else:
            file_list = psg_files[i * num_files_per_worker:(i + 1) * num_files_per_worker]
        results.append(pool.apply_async(
            passage_corpus_to_tsv,
            (file_list, f_out, docid_to_title, docid_to_headings, docid_to_url, docid_to_pass_num)))
    pool.close()
    pool.join()
    for r in results:
        r.get()  # re-raise any exception that occurred in a worker
    print('Done!')
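
# Example invocation (a sketch only: the script filename and the directory paths
# below are illustrative placeholders, not part of this repository):
#
#   python augment_msmarco_v2_passages.py \
#     --original_doc_path collections/msmarco_v2_doc \
#     --original_psg_path collections/msmarco_v2_passage \
#     --output_psg_path collections/msmarco_v2_passage_augmented \
#     --num_workers 8
#
# Each worker writes one shard, psg0.json ... psgN.json, plus a matching .id file
# listing the passage ids in output order.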