# geonmin-kim's picture
# Upload folder using huggingface_hub
# d6585f5
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from enum import Enum
from pyserini.trectools import AggregationMethod, RescoreMethod, TrecRun
from typing import List
class FusionMethod(Enum):
    """String identifiers for the run-fusion strategies offered by this module.

    Each member's value is the lowercase name of the strategy.
    """

    # Corresponds by name to ``reciprocal_rank_fusion``.
    RRF = 'rrf'
    # Corresponds by name to ``interpolation``.
    INTERPOLATION = 'interpolation'
    # Corresponds by name to ``average``.
    AVERAGE = 'average'
def reciprocal_rank_fusion(runs: List[TrecRun], rrf_k: int = 60, depth: int = None, k: int = None):
    """Fuse a list of ``TrecRun`` objects with reciprocal rank fusion (RRF).

    The approach follows Cormack et al., "Reciprocal Rank Fusion Outperforms
    Condorcet and Individual Rank Learning Methods" (SIGIR 2009). Every input
    run is cloned and rescored with ``RescoreMethod.RRF``; the rescored runs
    are then merged with ``AggregationMethod.SUM``.

    Parameters
    ----------
    runs : List[TrecRun]
        The ``TrecRun`` objects to fuse.
    rrf_k : int
        Constant that keeps lower-ranked documents from losing all importance.
        This is *not* the *k* of top-*k* retrieval. Defaults to 60, per
        Cormack et al.
    depth : int
        Maximum number of results taken from each input run. ``None`` (the
        default) means the complete list of results is considered.
    k : int
        Length of the final results list. ``None`` (the default) means the
        union of all input documents is ranked.

    Returns
    -------
    TrecRun
        A new ``TrecRun`` combining the input runs via reciprocal rank fusion.
    """
    # Rescoring is applied to clones, so the caller's runs are not rescored in place.
    # TODO: Add option to *not* clone runs, thus making the method destructive, but also more efficient.
    rescored_runs = []
    for run in runs:
        copy = run.clone()
        rescored_runs.append(copy.rescore(method=RescoreMethod.RRF, rrf_k=rrf_k))

    return TrecRun.merge(rescored_runs, AggregationMethod.SUM, depth=depth, k=k)
def interpolation(runs: List[TrecRun], alpha: float = 0.5, depth: int = None, k: int = None):
    """Fuse exactly two ``TrecRun`` objects by linear interpolation of scores.

    The combined score is::

        new_score = alpha * first_run_score + (1 - alpha) * second_run_score

    Each run is cloned, scaled with ``RescoreMethod.SCALE`` by its weight, and
    the two scaled runs are merged with ``AggregationMethod.SUM``.

    Parameters
    ----------
    runs : List[TrecRun]
        The ``TrecRun`` objects to fuse. Must contain exactly two runs.
    alpha : float
        Weight applied to the first run; ``(1 - alpha)`` is applied to the
        second run. Defaults to 0.5.
    depth : int
        Maximum number of results taken from each input run. ``None`` (the
        default) means the complete list of results is considered.
    k : int
        Length of the final results list. ``None`` (the default) means the
        union of all input documents is ranked.

    Returns
    -------
    TrecRun
        A new ``TrecRun`` combining the two input runs via interpolation.

    Raises
    ------
    ValueError
        If ``runs`` does not contain exactly two runs.
    """
    if len(runs) != 2:
        # ValueError is a subclass of Exception, so callers that catch the
        # previously raised bare Exception keep working.
        raise ValueError('Interpolation must be performed on exactly two runs.')

    # Pair each run with its weight: alpha for the first, (1 - alpha) for the second.
    weights = (alpha, 1 - alpha)
    # Scale clones rather than the inputs so the caller's runs are not rescored in place.
    scaled_runs = [
        run.clone().rescore(method=RescoreMethod.SCALE, scale=weight)
        for run, weight in zip(runs, weights)
    ]

    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)
def average(runs: List[TrecRun], depth: int = None, k: int = None):
    """Fuse a list of ``TrecRun`` objects by averaging their scores.

    Every run is cloned and scaled with ``RescoreMethod.SCALE`` by
    ``1 / len(runs)``; the scaled runs are then merged with
    ``AggregationMethod.SUM``.

    Parameters
    ----------
    runs : List[TrecRun]
        The ``TrecRun`` objects to fuse.
    depth : int
        Maximum number of results taken from each input run. ``None`` (the
        default) means the complete list of results is considered.
    k : int
        Length of the final results list. ``None`` (the default) means the
        union of all input documents is ranked.

    Returns
    -------
    TrecRun
        A new ``TrecRun`` combining the input runs via averaging.
    """
    scaled_runs = []
    for run in runs:
        # The weight is evaluated inside the loop on purpose: with an empty
        # ``runs`` list the division is never reached, so no ZeroDivisionError
        # is raised here.
        weight = 1 / len(runs)
        copy = run.clone()
        scaled_runs.append(copy.rescore(method=RescoreMethod.SCALE, scale=weight))

    return TrecRun.merge(scaled_runs, AggregationMethod.SUM, depth=depth, k=k)