Spaces:

geonmin-kim
/

NetsPresso_QA

Runtime error

App Files Files Community

NetsPresso_QA / pyserini /trectools /_base.py

geonmin-kim

Upload folder using huggingface_hub

d6585f5 about 1 year ago

raw

history blame contribute delete

No virus

12.3 kB

	#
	# Pyserini: Reproducible IR research with sparse and dense representations
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	import itertools
	import numpy as np
	import pandas as pd

	from concurrent.futures import ThreadPoolExecutor
	from copy import deepcopy
	from enum import Enum
	from typing import List, Set, Tuple


	class AggregationMethod(Enum):
	    """Ways of combining per-document scores when several runs are merged."""

	    # Add up the scores a document receives across the input runs.
	    SUM = "sum"


	class RescoreMethod(Enum):
	    """Score transformations that ``TrecRun.rescore`` can apply to a run."""

	    # Replace each score with 1 / (rrf_k + rank).
	    RRF = "rrf"
	    # Multiply every score by a constant factor.
	    SCALE = "scale"
	    # Min-max normalize the scores of each topic separately.
	    NORMALIZE = "normalize"


	class Qrels:
	    """Wrapper class for TREC Qrels.

	    The judgments are held in ``qrels_data``, a DataFrame with the columns
	    listed in ``Qrels.columns``.

	    Parameters
	    ----------
	    filepath : str
	        File path of a given TREC Qrels. If ``None``, the object starts out empty.
	    """

	    columns = ['topic', 'q0', 'docid', 'relevance_grade']

	    def __init__(self, filepath: str = None):
	        self.filepath = filepath
	        self.qrels_data = pd.DataFrame(columns=Qrels.columns)

	        if filepath is not None:
	            self.read_run(self.filepath)

	    def read_run(self, filepath: str):
	        """Load a whitespace-separated qrels file into ``qrels_data``.

	        Parameters
	        ----------
	        filepath : str
	            Path of the qrels file; it is read without a header row.
	        """

	        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
	        # docid is forced to str, matching TrecRun.read_run, so that docids from
	        # a run and from qrels compare equal even when they look numeric.
	        self.qrels_data = pd.read_csv(filepath, sep=r'\s+', names=Qrels.columns, dtype={'docid': 'str'})

	    def get_relevance_grades(self) -> Set[str]:
	        """Return a set with all relevance grades."""

	        return set(self.qrels_data['relevance_grade'].unique())

	    def topics(self) -> Set[str]:
	        """Return a set with all topics."""

	        return set(self.qrels_data['topic'].unique())

	    def get_docids(self, topic, relevance_grades=None) -> List[str]:
	        """Return a list of docids for a given topic and a list of relevance grades.

	        Parameters
	        ----------
	        topic : int
	            Topic whose judged documents are requested.
	        relevance_grades : List[int]
	            E.g. [0, 1, 2]. If not provided, docids of every relevance grade are returned.

	        Returns
	        -------
	        List[str]
	            Matching docids in file order; empty if the topic has no judgments.
	        """

	        if relevance_grades is None:
	            relevance_grades = self.get_relevance_grades()

	        wanted = (self.qrels_data['topic'] == topic) & self.qrels_data['relevance_grade'].isin(relevance_grades)

	        return self.qrels_data.loc[wanted, 'docid'].tolist()


	class TrecRun:
	    """Wrapper class for a TREC run.

	    The run is held in ``run_data``, a DataFrame with one row per retrieved
	    document and the columns listed in ``TrecRun.columns``.

	    Parameters
	    ----------
	    filepath : str
	        File path of a given TREC Run. If ``None``, the run starts out empty.
	    resort : bool
	        If True, rows are re-sorted by topic and descending score after loading
	        and the ranks are recomputed from the scores.
	    """

	    columns = ['topic', 'q0', 'docid', 'rank', 'score', 'tag']

	    def __init__(self, filepath: str = None, resort: bool = False):
	        self.reset_data()
	        self.filepath = filepath
	        self.resort = resort

	        if filepath is not None:
	            self.read_run(self.filepath, self.resort)

	    def reset_data(self):
	        """Replace the run contents with an empty DataFrame."""
	        self.run_data = pd.DataFrame(columns=TrecRun.columns)

	    def read_run(self, filepath: str, resort: bool = False) -> None:
	        """Load a whitespace-separated run file into ``run_data``.

	        Parameters
	        ----------
	        filepath : str
	            Path of the run file; it is read without a header row.
	        resort : bool
	            If True, sort by topic and descending score and recompute ``rank``.
	        """
	        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
	        # docid is read as str so numeric-looking ids are not parsed as numbers.
	        self.run_data = pd.read_csv(filepath, sep=r'\s+', names=TrecRun.columns, dtype={'docid': 'str'})
	        if resort:
	            self.run_data.sort_values(['topic', 'score'], inplace=True, ascending=[True, False])
	            # method='first' breaks score ties by row order, so ranks within a topic are distinct.
	            self.run_data['rank'] = self.run_data.groupby('topic')['score'].rank(ascending=False, method='first')

	    def topics(self) -> Set[str]:
	        """Return a set with all topics."""
	        return set(self.run_data['topic'].unique())

	    def clone(self) -> 'TrecRun':
	        """Return a deep copy of the current instance."""
	        return deepcopy(self)

	    def save_to_txt(self, output_path: str, tag: str = None) -> None:
	        """Write the run to ``output_path`` as space-separated rows, without header or index.

	        The stored data is sorted by topic and descending score before writing, and if
	        ``tag`` is given the ``tag`` column is overwritten with it. Both changes persist
	        on this instance.

	        Raises
	        ------
	        Exception
	            If the run holds no rows.
	        """
	        if len(self.run_data) == 0:
	            raise Exception('Nothing to save. TrecRun is empty')

	        if tag is not None:
	            self.run_data['tag'] = tag

	        self.run_data = self.run_data.sort_values(by=['topic', 'score'], ascending=[True, False])
	        self.run_data.to_csv(output_path, sep=' ', header=False, index=False)

	    def get_docs_by_topic(self, topic: str, max_docs: int = None):
	        """Return the rows of ``topic``, limited to the first ``max_docs`` rows if given.

	        Rows are taken in their stored order; no sorting is applied here.
	        """
	        docs = self.run_data[self.run_data['topic'] == topic]

	        if max_docs is not None:
	            docs = docs.head(max_docs)

	        return docs

	    def rescore(self, method: 'RescoreMethod', rrf_k: int = None, scale: float = None):
	        """Rewrite the ``score`` column in place and return ``self``.

	        Parameters
	        ----------
	        method : RescoreMethod
	            RRF sets score to 1 / (rrf_k + rank); SCALE multiplies scores by ``scale``;
	            NORMALIZE min-max normalizes the scores of each topic separately (a topic
	            whose scores are all equal gets score 1).
	        rrf_k : int
	            Required for RRF.
	        scale : float
	            Required for SCALE.

	        Raises
	        ------
	        NotImplementedError
	            If ``method`` is none of the above.
	        """
	        # Refer to this guide on how to efficiently manipulate dataframes: https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6
	        if method == RescoreMethod.RRF:
	            assert rrf_k is not None, 'Parameter "rrf_k" must be a valid integer.'
	            self.run_data['score'] = 1 / (rrf_k + self.run_data['rank'].values)
	        elif method == RescoreMethod.SCALE:
	            assert scale is not None, 'Parameter "scale" must not be none.'
	            self.run_data['score'] = self.run_data['score'].values * scale
	        elif method == RescoreMethod.NORMALIZE:
	            # Cast up front so fractional results fit the column even when the
	            # loaded scores were all integers.
	            self.run_data['score'] = self.run_data['score'].astype(float)
	            for topic in self.topics():
	                in_topic = self.run_data['topic'] == topic
	                scores = self.run_data.loc[in_topic, 'score'].to_numpy()
	                low = np.min(scores)
	                high = np.max(scores)

	                if high - low == 0:
	                    # No spread to normalize over.
	                    self.run_data.loc[in_topic, 'score'] = 1
	                else:
	                    self.run_data.loc[in_topic, 'score'] = (scores - low) / (high - low)
	        else:
	            raise NotImplementedError()

	        return self

	    def to_numpy(self) -> np.ndarray:
	        """Return a copy of the run data as a NumPy array."""
	        return self.run_data.to_numpy(copy=True)

	    def discard_qrels(self, qrels: 'Qrels', clone=True):
	        """Discard each docid in self if docid is also in the given qrels.
	        This operation is performed on each topic separately.
	        Topics of the run that do not occur in the qrels are dropped from the result.

	        Parameters
	        ----------
	        qrels : Qrels
	            Qrels with docids to remove from TrecRun.
	        clone : Bool
	            Return a new TrecRun object if True, else self will be modified and returned.
	        """

	        return self._filter_from_qrels(qrels, False, clone=clone)

	    def retain_qrels(self, qrels: 'Qrels', clone=True):
	        """Retain each docid in self if docid is also in the given qrels.
	        This operation is performed on each topic separately.
	        After this operation, judged@x based on the given qrels should be 1.

	        Parameters
	        ----------
	        qrels : Qrels
	            Qrels with docids to keep in TrecRun.
	        clone : Bool
	            Return a new TrecRun object if True, else self will be modified and returned.
	        """

	        return self._filter_from_qrels(qrels, True, clone=clone)

	    def _filter_from_qrels(self, qrels: 'Qrels', keep: bool, clone=True):
	        """Private helper function to remove/keep each docid in self if docid is also in the given Qrels object.
	        This operation is performed on each topic separately.
	        Topics of the run that do not occur in the qrels are skipped entirely.

	        Parameters
	        ----------
	        qrels : Qrels
	            Qrels with docids to remove from or keep in TrecRun.
	        keep : bool
	            If True keep the docids found in the qrels, otherwise remove them.
	        clone : Bool
	            Return a new TrecRun object if True, else self will be modified and returned.
	        """

	        # Computed once; qrels.topics() rebuilds its set on every call.
	        qrels_topics = qrels.topics()

	        df_list = []
	        for topic in self.topics():
	            if topic not in qrels_topics:
	                continue

	            topic_df = self.run_data[self.run_data['topic'] == topic]
	            in_qrels = topic_df['docid'].isin(qrels.get_docids(topic))
	            if keep is True:
	                df_list.append(topic_df[in_qrels])
	            else:
	                df_list.append(topic_df[~in_qrels])

	        run = TrecRun() if clone is True else self
	        return TrecRun.from_dataframes(df_list, run)

	    @staticmethod
	    def get_all_topics_from_runs(runs) -> Set[str]:
	        """Return the union of the topics of all given runs."""
	        all_topics = set()
	        for run in runs:
	            all_topics |= run.topics()

	        return all_topics

	    @staticmethod
	    def merge(runs, aggregation: 'AggregationMethod', depth: int = None, k: int = None):
	        """Return a TrecRun by aggregating docid in various ways such as summing scores

	        Parameters
	        ----------
	        runs : List[TrecRun]
	            List of ``TrecRun`` objects.
	        aggregation : AggregationMethod
	            The aggregation method to use.
	        depth : int
	            Maximum number of results from each input run to consider. Set to ``None`` by default, which indicates that
	            the complete list of results is considered.
	        k : int
	            Length of final results list. Set to ``None`` by default, which indicates that the union of all input documents
	            are ranked.

	        Raises
	        ------
	        Exception
	            If fewer than two runs are given.
	        NotImplementedError
	            If ``aggregation`` is not ``AggregationMethod.SUM``.
	        """

	        if len(runs) < 2:
	            raise Exception('Merge requires at least 2 runs.')

	        if aggregation != AggregationMethod.SUM:
	            raise NotImplementedError()

	        topics = list(TrecRun.get_all_topics_from_runs(runs))

	        def merge_topic(topic):
	            # Sum the scores each docid receives across the runs for this topic.
	            doc_scores = dict()

	            for run in runs:
	                for docid, score in run.get_docs_by_topic(topic, depth)[['docid', 'score']].values:
	                    doc_scores[docid] = doc_scores.get(docid, 0.0) + score

	            # Highest summed score first; ties are ordered by docid.
	            ranked = sorted(doc_scores.items(), key=lambda item: (-item[1], item[0]))
	            if k is not None:
	                ranked = ranked[:k]

	            return [
	                (topic, 'Q0', docid, rank, score, 'merge_sum')
	                for rank, (docid, score) in enumerate(ranked, start=1)
	            ]

	        # One worker per ten topics, but never fewer than one.
	        max_workers = max(len(topics) // 10, 1)
	        with ThreadPoolExecutor(max_workers=max_workers) as executor:
	            results = list(executor.map(merge_topic, topics))

	        return TrecRun.from_list(list(itertools.chain.from_iterable(results)))

	    @staticmethod
	    def from_dataframes(dfs, run=None):
	        """Return a TrecRun by populating dataframe with the provided list of dataframes.

	        Parameters
	        ----------
	        dfs: List[Dataframe]
	            A list of Dataframes conforming to TrecRun.columns. An empty list yields an empty run.

	        run: TrecRun
	            Set to ``None`` by default. If None, then a new instance of TrecRun will be created.
	            Else, the given TrecRun will be modified.
	        """

	        res = TrecRun() if run is None else run
	        res.reset_data()

	        frames = list(dfs)
	        # pd.concat raises on an empty list; keep the freshly reset empty frame instead.
	        if frames:
	            res.run_data = pd.concat(frames)

	        return res

	    @staticmethod
	    def from_list(rows, run=None):
	        """Return a TrecRun by populating dataframe with the provided list of tuples.
	        For performance reasons, df.to_numpy() is faster than df.iterrows().
	        When manipulating dataframes, we first dump to np.ndarray and construct a list of tuples with new values.
	        Then use this function to convert the list of tuples to a TrecRun object.

	        Parameters
	        ----------
	        rows: List[tuples]
	            List of tuples in the following format: (topic, 'Q0', docid, rank, score, tag).
	            An empty list yields an empty run.

	        run: TrecRun
	            Set to ``None`` by default. If None, then a new instance of TrecRun will be created.
	            Else, the given TrecRun will be modified.
	        """

	        res = TrecRun() if run is None else run

	        # Naming the columns at construction also works for an empty row list,
	        # where assigning df.columns afterwards would fail on a length mismatch.
	        res.run_data = pd.DataFrame(rows, columns=TrecRun.columns)

	        return res

	    @staticmethod
	    def from_search_results(docid_score_pair: List[Tuple[str, float]], topic=1):
	        """Return a TrecRun built from (docid, score) pairs of a single topic.

	        Ranks are assigned from 1 in the order the pairs are given and every row is tagged 'searcher'.
	        """
	        rows = [
	            (topic, 'Q0', docid, rank, score, 'searcher')
	            for rank, (docid, score) in enumerate(docid_score_pair, start=1)
	        ]

	        return TrecRun.from_list(rows)

	    @staticmethod
	    def concat(runs):
	        """Return a new TrecRun by concatenating a list of TrecRuns

	        Parameters
	        ----------
	        runs : List[TrecRun]
	            List of ``TrecRun`` objects. An empty list yields an empty run.
	        """

	        merged = TrecRun()
	        frames = [member.run_data for member in runs]
	        # pd.concat raises on an empty list; keep the empty frame in that case.
	        if frames:
	            merged.run_data = pd.concat(frames)
	        return merged