Spaces:
Runtime error
Runtime error
# | |
# Pyserini: Python interface to the Anserini IR toolkit built on Lucene | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# | |
import json | |
import os | |
import argparse | |
def convert_collection(args):
    """Merge predicted queries into a TSV collection and write jsonl shards.

    Each line of ``args.collection_path`` is expected to be
    ``<doc_id>\\t<doc_text>``. For every document, the next ``args.stride``
    lines are read from ``args.predictions``, joined with spaces (with
    ``' / '`` separators collapsed to a single space) and appended to
    ``args.original_copies`` copies of the document text. The result is
    written as one JSON object per line (keys ``id`` and ``contents``) to
    ``docsNN.json`` files inside ``args.output_folder``, starting a new file
    every ``args.max_docs_per_file`` documents.

    Args:
        args: Namespace-like object providing ``collection_path``,
            ``predictions``, ``output_folder``, ``stride``,
            ``max_docs_per_file`` and ``original_copies``.

    Raises:
        ValueError: If a collection line does not split into exactly two
            tab-separated fields.

    Note:
        If the predictions file has fewer lines than ``stride`` times the
        number of documents, the missing predictions are read as empty
        strings. An empty collection produces no output files.
    """
    print('Converting collection...')

    file_index = 0
    # Stays None until the first shard is opened, so that an empty collection
    # (or an early failure) does not try to close a file that never existed.
    output_jsonl_file = None
    try:
        with open(args.predictions) as predictions_file, \
                open(args.collection_path) as f:
            for i, line in enumerate(f):
                # Start writing to a new file when the current one has
                # reached its maximum capacity.
                if i % args.max_docs_per_file == 0:
                    if output_jsonl_file is not None:
                        output_jsonl_file.close()
                    output_path = os.path.join(
                        args.output_folder, 'docs{:02d}.json'.format(file_index))
                    output_jsonl_file = open(output_path, 'w')
                    file_index += 1

                doc_id, doc_text = line.rstrip().split('\t')

                # Read this document's predictions and merge them into the
                # original doc text.
                pred_text = ' '.join(
                    predictions_file.readline().strip()
                    for _ in range(args.stride))
                pred_text = pred_text.replace(' / ', ' ')
                text = (doc_text + ' ') * args.original_copies + pred_text

                output_dict = {'id': doc_id, 'contents': text}
                output_jsonl_file.write(json.dumps(output_dict) + '\n')

                if i % 100000 == 0:
                    print('Converted {} docs in {} files'.format(i, file_index))
    finally:
        # Close the last shard even if an exception interrupted the loop.
        if output_jsonl_file is not None:
            output_jsonl_file.close()
if __name__ == '__main__':
    # Command-line entry point: parse the options, make sure the output
    # folder exists, then run the conversion.
    parser = argparse.ArgumentParser(
        description='Augments MS MARCO TSV collection with predicted queries ' +
                    'to create an expanded Anserini jsonl collection.')
    parser.add_argument('--collection-path', required=True, help='MS MARCO tsv collection.')
    parser.add_argument('--predictions', required=True, help='Query predictions file.')
    parser.add_argument('--output-folder', required=True, help='Output folder for jsonl collection.')
    parser.add_argument('--stride', required=True, type=int,
                        help='Every [s] lines in predictions file is associated with each document.')
    parser.add_argument('--max-docs-per-file', default=1000000, type=int,
                        help='Maximum number of documents in each jsonl file.')
    parser.add_argument('--original-copies', default=1, type=int,
                        help='Number of copies of the original document to duplicate.')

    args = parser.parse_args()

    # The converter computes `i % max_docs_per_file`; reject values that would
    # fail there with a usage error instead of a ZeroDivisionError traceback.
    if args.max_docs_per_file < 1:
        parser.error('--max-docs-per-file must be a positive integer.')

    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(args.output_folder, exist_ok=True)

    convert_collection(args)
    print('Done!')