#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import json
import os
from pyserini.analysis import Analyzer, get_lucene_analyzer
"""
append d2q prediction as an extra field to collection jsonl
"""
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Appends doc2query predictions as an extra field to an MS MARCO jsonl collection.')
    parser.add_argument('--collection_path', required=True,
                        help='MS MARCO jsonl collection file.')
    parser.add_argument('--predictions', required=True,
                        help='File containing predicted queries, one line per document.')
    parser.add_argument('--output_folder', required=True, help='Output folder.')
    parser.add_argument('--max_docs_per_file', default=1000000, type=int,
                        help='Maximum number of documents in each jsonl file.')
    args = parser.parse_args()
    if not os.path.exists(args.output_folder):
        os.makedirs(args.output_folder)

    analyzer = Analyzer(get_lucene_analyzer())

    print('Converting collection...')
    file_index = 0
    with open(args.collection_path) as f_corpus, open(args.predictions) as f_pred:
        for i, (line_doc, line_pred) in enumerate(zip(f_corpus, f_pred)):
            # Write to a new file when the current one reaches maximum capacity.
            if i % args.max_docs_per_file == 0:
                if i > 0:
                    output_jsonl_file.close()
                output_path = os.path.join(args.output_folder, f'docs{file_index:02d}.json')
                output_jsonl_file = open(output_path, 'w')
                file_index += 1

            doc_json = json.loads(line_doc)

            # Run the predicted queries through the Lucene analyzer so the new
            # field matches the tokenization used at indexing time.
            pred_text = line_pred.rstrip()
            analyzed = analyzer.analyze(pred_text)
            for token in analyzed:
                assert ' ' not in token
            doc_json['predict'] = ' '.join(analyzed)

            output_jsonl_file.write(json.dumps(doc_json) + '\n')

            if i % 100000 == 0:
                print(f'Converted {i} docs in {file_index} files')

    output_jsonl_file.close()
    print('Done!')