Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

App Files Files Community

NetsPresso_QA / tools /scripts /msmarco /convert_trec_to_msmarco_run.py

geonmin-kim

Upload folder using huggingface_hub

d6585f5 10 months ago

raw

history blame

No virus

3.46 kB

	#
	# Pyserini: Python interface to the Anserini IR toolkit built on Lucene
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import argparse
	import logging


	def convert_trec_run(lines, fout, k=-1, quiet=False):
	    """Convert TREC-formatted run lines into MS MARCO-formatted run lines.

	    Every non-blank input line must contain exactly six whitespace-separated
	    fields: query id, an ignored column, doc id, rank, score, and an ignored
	    trailing column. For each hit that is kept, one line of the form
	    ``query_id<TAB>doc_id<TAB>rank`` is written to ``fout``.

	    Consecutive hits of the same query are compared and a warning is logged
	    when scores or ranks are tied, when the score increases, or when the rank
	    decreases relative to the previous hit.

	    Args:
	        lines: Iterable of text lines (for example an open text file).
	        fout: Writable text stream receiving the converted lines.
	        k: Maximum number of hits written per query; -1 writes all hits.
	        quiet: When true, no ordering/tie warnings are logged.

	    Returns:
	        The number of lines written to ``fout``.

	    Raises:
	        ValueError: If a non-blank line does not have exactly six fields, or
	            its rank/score fields are not numeric.
	    """
	    last_score = None
	    last_rank = None
	    last_query_id = ''
	    last_doc_id = ''
	    n_docs = 0
	    n_written = 0

	    for line in lines:
	        # split() with no argument accepts tabs and repeated spaces, not only
	        # single spaces, and yields an empty list for blank lines.
	        fields = line.split()
	        if not fields:
	            continue  # tolerate blank lines such as a trailing newline at EOF

	        query_id, _, doc_id, rank, score, _ = fields
	        rank = int(rank)
	        score = float(score)

	        # A new query starts: forget the previous hit and restart the per-query count.
	        if query_id != last_query_id:
	            last_score = None
	            n_docs = 0

	        # Sanity checks only apply from the second hit of a query onwards.
	        if last_score is not None and not quiet:
	            if score == last_score:
	                logging.warning(
	                    f'Score of {score} for doc id {doc_id} is the same of doc id '
	                    f'{last_doc_id} for query id {query_id}. This will likely impact metrics '
	                    ' negatively.')

	            if rank == last_rank:
	                logging.warning(
	                    f'Rank of {rank} for doc id {doc_id} is the same of doc id '
	                    f'{last_doc_id} for query id {query_id}. This will likely impact metrics '
	                    ' negatively.')

	            if score > last_score:
	                logging.warning(
	                    f'Score of {score} for current doc id {doc_id} is greater than the score '
	                    f'{last_score} of the previous doc id {last_doc_id} for query id '
	                    f'{query_id}. This will likely impact metrics negatively.')

	            if rank < last_rank:
	                logging.warning(
	                    f'Rank of {rank} for current doc id {doc_id} is lower than the rank '
	                    f'{last_rank} of the previous doc id {last_doc_id} for query id '
	                    f'{query_id}. This will likely impact metrics negatively.')

	        # Hits beyond the first k of a query are still checked above, just not written.
	        if k == -1 or n_docs < k:
	            fout.write(f'{query_id}\t{doc_id}\t{rank}\n')
	            n_written += 1

	        last_query_id = query_id
	        last_doc_id = doc_id
	        last_score = score
	        last_rank = rank
	        n_docs += 1

	    return n_written


	def main(argv=None):
	    """Parse command-line arguments and convert the TREC run file.

	    Args:
	        argv: Optional list of argument strings; ``None`` makes argparse read
	            ``sys.argv``.
	    """
	    parser = argparse.ArgumentParser(
	        description='Converts a TREC run file to a MS MARCO-formatted run file.')
	    parser.add_argument('--input', required=True, default='', help='TREC-formatted run file')
	    parser.add_argument('--output', required=True, default='',
	                        help='output MS MARCO-formatted run file')
	    parser.add_argument('--k', type=int, default=-1,
	                        help='Number of hits to write to the run file. Write all hits if -1.')
	    parser.add_argument('--quiet', action='store_true', help="Suppresses all warnings.")

	    args = parser.parse_args(argv)

	    # Open the input first so a missing input does not truncate an existing
	    # output file; both handles are closed when the block exits.
	    with open(args.input) as fin, open(args.output, 'w') as fout:
	        convert_trec_run(fin, fout, k=args.k, quiet=args.quiet)

	    # NOTE: no logging configuration is visible in this file; under the default
	    # WARNING level of the root logger this INFO message is not displayed.
	    logging.info(f'Done! Wrote output file to {args.output}')


	if __name__ == '__main__':
	    main()