Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

App Files Files Community

NetsPresso_QA / tools /scripts /msmarco /augment_collection_with_predictions.py

geonmin-kim

Upload folder using huggingface_hub

d6585f5 about 1 year ago

raw

history blame

No virus

3.17 kB

	#
	# Pyserini: Python interface to the Anserini IR toolkit built on Lucene
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import json
	import os
	import argparse


	def convert_collection(args):
	    """Merge predicted queries into a TSV collection and write sharded jsonl files.

	    Each line of ``args.collection_path`` must hold exactly two tab-separated
	    fields: a document id and the document text. For every document, the next
	    ``args.stride`` lines of ``args.predictions`` are read, stripped, joined
	    with single spaces, and appended to ``args.original_copies`` copies of the
	    document text. The result is written as one JSON object per line, with the
	    keys ``id`` and ``contents``, into ``docsNN.json`` files inside
	    ``args.output_folder``. A new file is started every
	    ``args.max_docs_per_file`` documents.

	    Args:
	        args: Object exposing the attributes ``collection_path``,
	            ``predictions``, ``output_folder``, ``stride``,
	            ``max_docs_per_file`` and ``original_copies``.
	            ``output_folder`` must already exist; this function does not
	            create it.

	    Raises:
	        ValueError: If a collection line does not split into exactly two
	            tab-separated fields.
	        ZeroDivisionError: If ``args.max_docs_per_file`` is 0.
	        OSError: If any of the files cannot be opened.

	    Note:
	        If the predictions file runs out of lines, ``readline()`` returns an
	        empty string, so the remaining documents silently receive empty
	        predictions.
	    """
	    print('Converting collection...')

	    # Stays None until the first document is seen, so an empty collection
	    # neither creates an output file nor fails when cleaning up.
	    output_jsonl_file = None
	    file_index = 0
	    try:
	        # Explicit UTF-8 keeps the result independent of the platform's
	        # default encoding; the context manager closes both inputs even when
	        # a malformed line raises part-way through.
	        with open(args.predictions, encoding='utf-8') as predictions_file, \
	                open(args.collection_path, encoding='utf-8') as f:
	            for i, line in enumerate(f):
	                # Start writing to a new file when the current one has
	                # reached its maximum capacity.
	                if i % args.max_docs_per_file == 0:
	                    if output_jsonl_file is not None:
	                        output_jsonl_file.close()
	                    output_path = os.path.join(
	                        args.output_folder, 'docs{:02d}.json'.format(file_index))
	                    output_jsonl_file = open(output_path, 'w', encoding='utf-8')
	                    file_index += 1

	                doc_id, doc_text = line.rstrip().split('\t')

	                # Read this document's predictions and merge them into the
	                # original doc text.
	                pred_text = ' '.join(
	                    predictions_file.readline().strip()
	                    for _ in range(args.stride))
	                pred_text = pred_text.replace(' / ', ' ')
	                text = (doc_text + ' ') * args.original_copies + pred_text

	                output_dict = {'id': doc_id, 'contents': text}
	                output_jsonl_file.write(json.dumps(output_dict) + '\n')

	                if i % 100000 == 0:
	                    print('Converted {} docs in {} files'.format(i, file_index))
	    finally:
	        # Close the last (or only) shard even if the loop raised.
	        if output_jsonl_file is not None:
	            output_jsonl_file.close()


	if __name__ == '__main__':
	    # Command-line entry point: parse the options, make sure the output
	    # folder exists, then run the conversion.
	    parser = argparse.ArgumentParser(
	        description='Augments MS MARCO TSV collection with predicted queries ' +
	                    'to create an expanded Anserini jsonl collection.')
	    parser.add_argument('--collection-path', required=True, help='MS MARCO tsv collection.')
	    parser.add_argument('--predictions', required=True, help='Query predictions file.')
	    parser.add_argument('--output-folder', required=True, help='Output folder for jsonl collection.')
	    parser.add_argument('--stride', required=True, type=int,
	                        help='Every [s] lines in predictions file is associated with each document.')
	    parser.add_argument('--max-docs-per-file', default=1000000, type=int,
	                        help='Maximum number of documents in each jsonl file.')
	    parser.add_argument('--original-copies', default=1, type=int,
	                        help='Number of copies of the original document to duplicate.')

	    args = parser.parse_args()

	    # exist_ok=True replaces the check-then-create sequence, which could fail
	    # if the folder appeared between the check and the makedirs call.
	    os.makedirs(args.output_folder, exist_ok=True)

	    convert_collection(args)
	    print('Done!')