geonmin-kim's picture
Upload folder using huggingface_hub
d6585f5
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``LuceneSearcher``
class, which wraps the Java class with the same name in Anserini.
"""
import logging
import os
from pyserini.util import get_cache_home
from pyserini.pyclass import autoclass
logger = logging.getLogger(__name__)
# Wrappers around Lucene classes
JQuery = autoclass('org.apache.lucene.search.Query')
# Wrappers around Anserini classes
JQrels = autoclass('io.anserini.eval.Qrels')
JRelevanceJudgments = autoclass('io.anserini.eval.RelevanceJudgments')
JTopicReader = autoclass('io.anserini.search.topicreader.TopicReader')
JTopics = autoclass('io.anserini.search.topicreader.Topics')
JQueryGenerator = autoclass('io.anserini.search.query.QueryGenerator')
JBagOfWordsQueryGenerator = autoclass('io.anserini.search.query.BagOfWordsQueryGenerator')
JDisjunctionMaxQueryGenerator = autoclass('io.anserini.search.query.DisjunctionMaxQueryGenerator')
JCovid19QueryGenerator = autoclass('io.anserini.search.query.Covid19QueryGenerator')
topics_mapping = {
'trec1-adhoc': JTopics.TREC1_ADHOC,
'trec2-adhoc': JTopics.TREC2_ADHOC,
'trec3-adhoc': JTopics.TREC3_ADHOC,
'robust04': JTopics.ROBUST04,
'robust05': JTopics.ROBUST05,
'core17': JTopics.CORE17,
'core18': JTopics.CORE18,
'wt10g': JTopics.WT10G,
'trec2004-terabyte': JTopics.TREC2004_TERABYTE,
'trec2005-terabyte': JTopics.TREC2005_TERABYTE,
'trec2006-terabyte': JTopics.TREC2006_TERABYTE,
'trec2007-million-query': JTopics.TREC2007_MILLION_QUERY,
'trec2008-million-query': JTopics.TREC2008_MILLION_QUERY,
'trec2009-million-query': JTopics.TREC2009_MILLION_QUERY,
'trec2010-web': JTopics.TREC2010_WEB,
'trec2011-web': JTopics.TREC2011_WEB,
'trec2012-web': JTopics.TREC2012_WEB,
'trec2013-web': JTopics.TREC2013_WEB,
'trec2014-web': JTopics.TREC2014_WEB,
'mb11': JTopics.MB11,
'mb12': JTopics.MB12,
'mb13': JTopics.MB13,
'mb14': JTopics.MB14,
'car17v1.5-benchmarkY1test': JTopics.CAR17V15_BENCHMARK_Y1_TEST,
'car17v2.0-benchmarkY1test': JTopics.CAR17V20_BENCHMARK_Y1_TEST,
'dl19-doc': JTopics.TREC2019_DL_DOC,
'dl19-doc-unicoil': JTopics.TREC2019_DL_DOC_UNICOIL,
'dl19-doc-unicoil-noexp': JTopics.TREC2019_DL_DOC_UNICOIL_NOEXP,
'dl19-passage': JTopics.TREC2019_DL_PASSAGE,
'dl19-passage-unicoil': JTopics.TREC2019_DL_PASSAGE_UNICOIL,
'dl19-passage-unicoil-noexp': JTopics.TREC2019_DL_PASSAGE_UNICOIL_NOEXP,
'dl20': JTopics.TREC2020_DL,
'dl20-unicoil': JTopics.TREC2020_DL_UNICOIL,
'dl20-unicoil-noexp': JTopics.TREC2020_DL_UNICOIL_NOEXP,
'dl21': JTopics.TREC2021_DL,
'dl21-unicoil': JTopics.TREC2021_DL_UNICOIL,
'dl21-unicoil-noexp': JTopics.TREC2021_DL_UNICOIL_NOEXP,
'msmarco-doc-dev': JTopics.MSMARCO_DOC_DEV,
'msmarco-doc-dev-unicoil': JTopics.MSMARCO_DOC_DEV_UNICOIL,
'msmarco-doc-dev-unicoil-noexp': JTopics.MSMARCO_DOC_DEV_UNICOIL_NOEXP,
'msmarco-doc-test': JTopics.MSMARCO_DOC_TEST,
'msmarco-passage-dev-subset': JTopics.MSMARCO_PASSAGE_DEV_SUBSET,
'msmarco-passage-dev-subset-deepimpact': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_DEEPIMPACT,
'msmarco-passage-dev-subset-unicoil': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL,
'msmarco-passage-dev-subset-unicoil-noexp': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_NOEXP,
'msmarco-passage-dev-subset-unicoil-tilde': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_UNICOIL_TILDE,
'msmarco-passage-dev-subset-distill-splade-max': JTopics.MSMARCO_PASSAGE_DEV_SUBSET_DISTILL_SPLADE_MAX,
'msmarco-passage-test-subset': JTopics.MSMARCO_PASSAGE_TEST_SUBSET,
'msmarco-v2-doc-dev': JTopics.MSMARCO_V2_DOC_DEV,
'msmarco-v2-doc-dev-unicoil': JTopics.MSMARCO_V2_DOC_DEV_UNICOIL,
'msmarco-v2-doc-dev-unicoil-noexp': JTopics.MSMARCO_V2_DOC_DEV_UNICOIL_NOEXP,
'msmarco-v2-doc-dev2': JTopics.MSMARCO_V2_DOC_DEV2,
'msmarco-v2-doc-dev2-unicoil': JTopics.MSMARCO_V2_DOC_DEV2_UNICOIL,
'msmarco-v2-doc-dev2-unicoil-noexp': JTopics.MSMARCO_V2_DOC_DEV2_UNICOIL_NOEXP,
'msmarco-v2-passage-dev': JTopics.MSMARCO_V2_PASSAGE_DEV,
'msmarco-v2-passage-dev-unicoil': JTopics.MSMARCO_V2_PASSAGE_DEV_UNICOIL,
'msmarco-v2-passage-dev-unicoil-noexp': JTopics.MSMARCO_V2_PASSAGE_DEV_UNICOIL_NOEXP,
'msmarco-v2-passage-dev2': JTopics.MSMARCO_V2_PASSAGE_DEV2,
'msmarco-v2-passage-dev2-unicoil': JTopics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL,
'msmarco-v2-passage-dev2-unicoil-noexp': JTopics.MSMARCO_V2_PASSAGE_DEV2_UNICOIL_NOEXP,
'ntcir8-zh': JTopics.NTCIR8_ZH,
'clef2006-fr': JTopics.CLEF2006_FR,
'trec2002-ar': JTopics.TREC2002_AR,
'fire2012-bn': JTopics.FIRE2012_BN,
'fire2012-hi': JTopics.FIRE2012_HI,
'fire2012-en': JTopics.FIRE2012_EN,
'covid-round1': JTopics.COVID_ROUND1,
'covid-round1-udel': JTopics.COVID_ROUND1_UDEL,
'covid-round2': JTopics.COVID_ROUND2,
'covid-round2-udel': JTopics.COVID_ROUND2_UDEL,
'covid-round3': JTopics.COVID_ROUND3,
'covid-round3-udel': JTopics.COVID_ROUND3_UDEL,
'covid-round4': JTopics.COVID_ROUND4,
'covid-round4-udel': JTopics.COVID_ROUND4_UDEL,
'covid-round5': JTopics.COVID_ROUND5,
'covid-round5-udel': JTopics.COVID_ROUND5_UDEL,
'trec2018-bl': JTopics.TREC2018_BL,
'trec2019-bl': JTopics.TREC2019_BL,
'trec2020-bl': JTopics.TREC2020_BL,
'epidemic-qa-expert-prelim': JTopics.EPIDEMIC_QA_EXPERT_PRELIM,
'epidemic-qa-consumer-prelim': JTopics.EPIDEMIC_QA_CONSUMER_PRELIM,
'dpr-nq-dev': JTopics.DPR_NQ_DEV,
'dpr-nq-test': JTopics.DPR_NQ_TEST,
'dpr-trivia-dev': JTopics.DPR_TRIVIA_DEV,
'dpr-trivia-test': JTopics.DPR_TRIVIA_TEST,
'dpr-wq-test': JTopics.DPR_WQ_TEST,
'dpr-squad-test': JTopics.DPR_SQUAD_TEST,
'dpr-curated-test': JTopics.DPR_CURATED_TEST,
'dpr-trivia-test-gar-t5-answers': JTopics.DPR_TRIVIA_TEST_GART5_ANSWERS,
'dpr-trivia-test-gar-t5-titles': JTopics.DPR_TRIVIA_TEST_GART5_TITLES,
'dpr-trivia-test-gar-t5-sentences': JTopics.DPR_TRIVIA_TEST_GART5_SENTENCES,
'dpr-trivia-test-gar-t5-all': JTopics.DPR_TRIVIA_TEST_GART5_ALL,
'nq-test-gar-t5-answers': JTopics.NQ_TEST_GART5_ANSWERS,
'nq-test-gar-t5-titles': JTopics.NQ_TEST_GART5_TITLES,
'nq-test-gar-t5-sentences': JTopics.NQ_TEST_GART5_SENTENCES,
'nq-test-gar-t5-all': JTopics.NQ_TEST_GART5_ALL,
'nq-dev': JTopics.NQ_DEV,
'nq-test': JTopics.NQ_TEST,
'mrtydi-v1.1-arabic-train': JTopics.MRTYDI_V11_AR_TRAIN,
'mrtydi-v1.1-arabic-dev': JTopics.MRTYDI_V11_AR_DEV,
'mrtydi-v1.1-arabic-test': JTopics.MRTYDI_V11_AR_TEST,
'mrtydi-v1.1-bengali-train': JTopics.MRTYDI_V11_BN_TRAIN,
'mrtydi-v1.1-bengali-dev': JTopics.MRTYDI_V11_BN_DEV,
'mrtydi-v1.1-bengali-test': JTopics.MRTYDI_V11_BN_TEST,
'mrtydi-v1.1-english-train': JTopics.MRTYDI_V11_EN_TRAIN,
'mrtydi-v1.1-english-dev': JTopics.MRTYDI_V11_EN_DEV,
'mrtydi-v1.1-english-test': JTopics.MRTYDI_V11_EN_TEST,
'mrtydi-v1.1-finnish-train': JTopics.MRTYDI_V11_FI_TRAIN,
'mrtydi-v1.1-finnish-dev': JTopics.MRTYDI_V11_FI_DEV,
'mrtydi-v1.1-finnish-test': JTopics.MRTYDI_V11_FI_TEST,
'mrtydi-v1.1-indonesian-train': JTopics.MRTYDI_V11_ID_TRAIN,
'mrtydi-v1.1-indonesian-dev': JTopics.MRTYDI_V11_ID_DEV,
'mrtydi-v1.1-indonesian-test': JTopics.MRTYDI_V11_ID_TEST,
'mrtydi-v1.1-japanese-train': JTopics.MRTYDI_V11_JA_TRAIN,
'mrtydi-v1.1-japanese-dev': JTopics.MRTYDI_V11_JA_DEV,
'mrtydi-v1.1-japanese-test': JTopics.MRTYDI_V11_JA_TEST,
'mrtydi-v1.1-korean-train': JTopics.MRTYDI_V11_KO_TRAIN,
'mrtydi-v1.1-korean-dev': JTopics.MRTYDI_V11_KO_DEV,
'mrtydi-v1.1-korean-test': JTopics.MRTYDI_V11_KO_TEST,
'mrtydi-v1.1-russian-train': JTopics.MRTYDI_V11_RU_TRAIN,
'mrtydi-v1.1-russian-dev': JTopics.MRTYDI_V11_RU_DEV,
'mrtydi-v1.1-russian-test': JTopics.MRTYDI_V11_RU_TEST,
'mrtydi-v1.1-swahili-train': JTopics.MRTYDI_V11_SW_TRAIN,
'mrtydi-v1.1-swahili-dev': JTopics.MRTYDI_V11_SW_DEV,
'mrtydi-v1.1-swahili-test': JTopics.MRTYDI_V11_SW_TEST,
'mrtydi-v1.1-telugu-train': JTopics.MRTYDI_V11_TE_TRAIN,
'mrtydi-v1.1-telugu-dev': JTopics.MRTYDI_V11_TE_DEV,
'mrtydi-v1.1-telugu-test': JTopics.MRTYDI_V11_TE_TEST,
'mrtydi-v1.1-thai-train': JTopics.MRTYDI_V11_TH_TRAIN,
'mrtydi-v1.1-thai-dev': JTopics.MRTYDI_V11_TH_DEV,
'mrtydi-v1.1-thai-test': JTopics.MRTYDI_V11_TH_TEST,
'beir-v1.0.0-trec-covid-test': JTopics.BEIR_V1_0_0_TREC_COVID_TEST,
'beir-v1.0.0-bioasq-test': JTopics.BEIR_V1_0_0_BIOASQ_TEST,
'beir-v1.0.0-nfcorpus-test': JTopics.BEIR_V1_0_0_NFCORPUS_TEST,
'beir-v1.0.0-nq-test': JTopics.BEIR_V1_0_0_NQ_TEST,
'beir-v1.0.0-hotpotqa-test': JTopics.BEIR_V1_0_0_HOTPOTQA_TEST,
'beir-v1.0.0-fiqa-test': JTopics.BEIR_V1_0_0_FIQA_TEST,
'beir-v1.0.0-signal1m-test': JTopics.BEIR_V1_0_0_SIGNAL1M_TEST,
'beir-v1.0.0-trec-news-test': JTopics.BEIR_V1_0_0_TREC_NEWS_TEST,
'beir-v1.0.0-robust04-test': JTopics.BEIR_V1_0_0_ROBUST04_TEST,
'beir-v1.0.0-arguana-test': JTopics.BEIR_V1_0_0_ARGUANA_TEST,
'beir-v1.0.0-webis-touche2020-test': JTopics.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST,
'beir-v1.0.0-cqadupstack-android-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST,
'beir-v1.0.0-cqadupstack-english-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST,
'beir-v1.0.0-cqadupstack-gaming-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST,
'beir-v1.0.0-cqadupstack-gis-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST,
'beir-v1.0.0-cqadupstack-mathematica-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST,
'beir-v1.0.0-cqadupstack-physics-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST,
'beir-v1.0.0-cqadupstack-programmers-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST,
'beir-v1.0.0-cqadupstack-stats-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST,
'beir-v1.0.0-cqadupstack-tex-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST,
'beir-v1.0.0-cqadupstack-unix-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST,
'beir-v1.0.0-cqadupstack-webmasters-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST,
'beir-v1.0.0-cqadupstack-wordpress-test': JTopics.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST,
'beir-v1.0.0-quora-test': JTopics.BEIR_V1_0_0_QUORA_TEST,
'beir-v1.0.0-dbpedia-entity-test': JTopics.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST,
'beir-v1.0.0-scidocs-test': JTopics.BEIR_V1_0_0_SCIDOCS_TEST,
'beir-v1.0.0-fever-test': JTopics.BEIR_V1_0_0_FEVER_TEST,
'beir-v1.0.0-climate-fever-test': JTopics.BEIR_V1_0_0_CLIMATE_FEVER_TEST,
'beir-v1.0.0-scifact-test': JTopics.BEIR_V1_0_0_SCIFACT_TEST,
'beir-v1.0.0-trec-covid-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_TREC_COVID_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-bioasq-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_BIOASQ_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-nfcorpus-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_NFCORPUS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-nq-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_NQ_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-hotpotqa-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_HOTPOTQA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-fiqa-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_FIQA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-signal1m-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SIGNAL1M_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-trec-news-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_TREC_NEWS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-robust04-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_ROBUST04_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-arguana-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_ARGUANA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-webis-touche2020-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-android-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-english-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-gaming-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-gis-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-mathematica-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-physics-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-programmers-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-stats-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-tex-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-unix-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-webmasters-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-cqadupstack-wordpress-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-quora-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_QUORA_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-dbpedia-entity-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-scidocs-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SCIDOCS_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-fever-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_FEVER_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-climate-fever-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_CLIMATE_FEVER_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'beir-v1.0.0-scifact-test-splade_distil_cocodenser_medium': JTopics.BEIR_V1_0_0_SCIFACT_TEST_SPLADE_DISTILL_COCODENSER_MEDIUM,
'hc4-v1.0-fa-dev-title': JTopics.HC4_V1_0_FA_DEV_TITLE,
'hc4-v1.0-fa-dev-desc': JTopics.HC4_V1_0_FA_DEV_DESC,
'hc4-v1.0-fa-dev-desc-title': JTopics.HC4_V1_0_FA_DEV_DESC_TITLE,
'hc4-v1.0-fa-test-title': JTopics.HC4_V1_0_FA_TEST_TITLE,
'hc4-v1.0-fa-test-desc': JTopics.HC4_V1_0_FA_TEST_DESC,
'hc4-v1.0-fa-test-desc-title': JTopics.HC4_V1_0_FA_TEST_DESC_TITLE,
'hc4-v1.0-fa-en-test-title': JTopics.HC4_V1_0_FA_EN_TEST_TITLE,
'hc4-v1.0-fa-en-test-desc': JTopics.HC4_V1_0_FA_EN_TEST_DESC,
'hc4-v1.0-fa-en-test-desc-title': JTopics.HC4_V1_0_FA_EN_TEST_DESC_TITLE,
'hc4-v1.0-ru-dev-title': JTopics.HC4_V1_0_RU_DEV_TITLE,
'hc4-v1.0-ru-dev-desc': JTopics.HC4_V1_0_RU_DEV_DESC,
'hc4-v1.0-ru-dev-desc-title': JTopics.HC4_V1_0_RU_DEV_DESC_TITLE,
'hc4-v1.0-ru-test-title': JTopics.HC4_V1_0_RU_TEST_TITLE,
'hc4-v1.0-ru-test-desc': JTopics.HC4_V1_0_RU_TEST_DESC,
'hc4-v1.0-ru-test-desc-title': JTopics.HC4_V1_0_RU_TEST_DESC_TITLE,
'hc4-v1.0-ru-en-test-title': JTopics.HC4_V1_0_RU_EN_TEST_TITLE,
'hc4-v1.0-ru-en-test-desc': JTopics.HC4_V1_0_RU_EN_TEST_DESC,
'hc4-v1.0-ru-en-test-desc-title': JTopics.HC4_V1_0_RU_EN_TEST_DESC_TITLE,
'hc4-v1.0-zh-dev-title': JTopics.HC4_V1_0_ZH_DEV_TITLE,
'hc4-v1.0-zh-dev-desc': JTopics.HC4_V1_0_ZH_DEV_DESC,
'hc4-v1.0-zh-dev-desc-title': JTopics.HC4_V1_0_ZH_DEV_DESC_TITLE,
'hc4-v1.0-zh-test-title': JTopics.HC4_V1_0_ZH_TEST_TITLE,
'hc4-v1.0-zh-test-desc': JTopics.HC4_V1_0_ZH_TEST_DESC,
'hc4-v1.0-zh-test-desc-title': JTopics.HC4_V1_0_ZH_TEST_DESC_TITLE,
'hc4-v1.0-zh-en-test-title': JTopics.HC4_V1_0_ZH_EN_TEST_TITLE,
'hc4-v1.0-zh-en-test-desc': JTopics.HC4_V1_0_ZH_EN_TEST_DESC,
'hc4-v1.0-zh-en-test-desc-title': JTopics.HC4_V1_0_ZH_EN_TEST_DESC_TITLE,
# NeuCLIR 2022 topics
'neuclir22-en-title': JTopics.NEUCLIR22_EN_TITLE,
'neuclir22-en-desc': JTopics.NEUCLIR22_EN_DESC,
'neuclir22-en-desc-title': JTopics.NEUCLIR22_EN_DESC_TITLE,
'neuclir22-fa-ht-title': JTopics.NEUCLIR22_FA_HT_TITLE,
'neuclir22-fa-ht-desc': JTopics.NEUCLIR22_FA_HT_DESC,
'neuclir22-fa-ht-desc-title': JTopics.NEUCLIR22_FA_HT_DESC_TITLE,
'neuclir22-fa-mt-title': JTopics.NEUCLIR22_FA_MT_TITLE,
'neuclir22-fa-mt-desc': JTopics.NEUCLIR22_FA_MT_DESC,
'neuclir22-fa-mt-desc-title': JTopics.NEUCLIR22_FA_MT_DESC_TITLE,
'neuclir22-ru-ht-title': JTopics.NEUCLIR22_RU_HT_TITLE,
'neuclir22-ru-ht-desc': JTopics.NEUCLIR22_RU_HT_DESC,
'neuclir22-ru-ht-desc-title': JTopics.NEUCLIR22_RU_HT_DESC_TITLE,
'neuclir22-ru-mt-title': JTopics.NEUCLIR22_RU_MT_TITLE,
'neuclir22-ru-mt-desc': JTopics.NEUCLIR22_RU_MT_DESC,
'neuclir22-ru-mt-desc-title': JTopics.NEUCLIR22_RU_MT_DESC_TITLE,
'neuclir22-zh-ht-title': JTopics.NEUCLIR22_ZH_HT_TITLE,
'neuclir22-zh-ht-desc': JTopics.NEUCLIR22_ZH_HT_DESC,
'neuclir22-zh-ht-desc-title': JTopics.NEUCLIR22_ZH_HT_DESC_TITLE,
'neuclir22-zh-mt-title': JTopics.NEUCLIR22_ZH_MT_TITLE,
'neuclir22-zh-mt-desc': JTopics.NEUCLIR22_ZH_MT_DESC,
'neuclir22-zh-mt-desc-title': JTopics.NEUCLIR22_ZH_MT_DESC_TITLE,
# MIRACL topics
'miracl-v1.0-ar-dev': JTopics.MIRACL_V10_AR_DEV,
'miracl-v1.0-bn-dev': JTopics.MIRACL_V10_BN_DEV,
'miracl-v1.0-en-dev': JTopics.MIRACL_V10_EN_DEV,
'miracl-v1.0-es-dev': JTopics.MIRACL_V10_ES_DEV,
'miracl-v1.0-fa-dev': JTopics.MIRACL_V10_FA_DEV,
'miracl-v1.0-fi-dev': JTopics.MIRACL_V10_FI_DEV,
'miracl-v1.0-fr-dev': JTopics.MIRACL_V10_FR_DEV,
'miracl-v1.0-hi-dev': JTopics.MIRACL_V10_HI_DEV,
'miracl-v1.0-id-dev': JTopics.MIRACL_V10_ID_DEV,
'miracl-v1.0-ja-dev': JTopics.MIRACL_V10_JA_DEV,
'miracl-v1.0-ko-dev': JTopics.MIRACL_V10_KO_DEV,
'miracl-v1.0-ru-dev': JTopics.MIRACL_V10_RU_DEV,
'miracl-v1.0-sw-dev': JTopics.MIRACL_V10_SW_DEV,
'miracl-v1.0-te-dev': JTopics.MIRACL_V10_TE_DEV,
'miracl-v1.0-th-dev': JTopics.MIRACL_V10_TH_DEV,
'miracl-v1.0-zh-dev': JTopics.MIRACL_V10_ZH_DEV,
'miracl-v1.0-de-dev': JTopics.MIRACL_V10_DE_DEV,
'miracl-v1.0-yo-dev': JTopics.MIRACL_V10_YO_DEV,
}
qrels_mapping = {
'trec1-adhoc': JQrels.TREC1_ADHOC,
'trec2-adhoc': JQrels.TREC2_ADHOC,
'trec3-adhoc': JQrels.TREC3_ADHOC,
'robust04': JQrels.ROBUST04,
'robust05': JQrels.ROBUST05,
'core17': JQrels.CORE17,
'core18': JQrels.CORE18,
'wt10g': JQrels.WT10G,
'trec2004-terabyte': JQrels.TREC2004_TERABYTE,
'trec2005-terabyte': JQrels.TREC2005_TERABYTE,
'trec2006-terabyte': JQrels.TREC2006_TERABYTE,
'trec2011-web': JQrels.TREC2011_WEB,
'trec2012-web': JQrels.TREC2012_WEB,
'trec2013-web': JQrels.TREC2013_WEB,
'trec2014-web': JQrels.TREC2014_WEB,
'mb11': JQrels.MB11,
'mb12': JQrels.MB12,
'mb13': JQrels.MB13,
'mb14': JQrels.MB14,
'car17v1.5-benchmarkY1test': JQrels.CAR17V15_BENCHMARK_Y1_TEST,
'car17v2.0-benchmarkY1test': JQrels.CAR17V20_BENCHMARK_Y1_TEST,
'dl19-doc': JQrels.TREC2019_DL_DOC,
'dl19-passage': JQrels.TREC2019_DL_PASSAGE,
'dl20-doc': JQrels.TREC2020_DL_DOC,
'dl20-passage': JQrels.TREC2020_DL_PASSAGE,
'dl21-doc': JQrels.TREC2021_DL_DOC,
'dl21-passage': JQrels.TREC2021_DL_PASSAGE,
'msmarco-doc-dev': JQrels.MSMARCO_DOC_DEV,
'msmarco-passage-dev-subset': JQrels.MSMARCO_PASSAGE_DEV_SUBSET,
'msmarco-v2-doc-dev': JQrels.MSMARCO_V2_DOC_DEV,
'msmarco-v2-doc-dev2': JQrels.MSMARCO_V2_DOC_DEV2,
'msmarco-v2-passage-dev': JQrels.MSMARCO_V2_PASSAGE_DEV,
'msmarco-v2-passage-dev2': JQrels.MSMARCO_V2_PASSAGE_DEV2,
'ntcir8-zh': JQrels.NTCIR8_ZH,
'clef2006-fr': JQrels.CLEF2006_FR,
'trec2002-ar': JQrels.TREC2002_AR,
'fire2012-bn': JQrels.FIRE2012_BN,
'fire2012-hi': JQrels.FIRE2012_HI,
'fire2012-en': JQrels.FIRE2012_EN,
'covid-complete': JQrels.COVID_COMPLETE,
'covid-round1': JQrels.COVID_ROUND1,
'covid-round2': JQrels.COVID_ROUND2,
'covid-round3': JQrels.COVID_ROUND3,
'covid-round3-cumulative': JQrels.COVID_ROUND3_CUMULATIVE,
'covid-round4': JQrels.COVID_ROUND4,
'covid-round4-cumulative': JQrels.COVID_ROUND4_CUMULATIVE,
'covid-round5': JQrels.COVID_ROUND5,
'trec2018-bl': JQrels.TREC2018_BL,
'trec2019-bl': JQrels.TREC2019_BL,
'trec2020-bl': JQrels.TREC2020_BL,
'mrtydi-v1.1-arabic-train': JQrels.MRTYDI_V11_AR_TRAIN,
'mrtydi-v1.1-arabic-dev': JQrels.MRTYDI_V11_AR_DEV,
'mrtydi-v1.1-arabic-test': JQrels.MRTYDI_V11_AR_TEST,
'mrtydi-v1.1-bengali-train': JQrels.MRTYDI_V11_BN_TRAIN,
'mrtydi-v1.1-bengali-dev': JQrels.MRTYDI_V11_BN_DEV,
'mrtydi-v1.1-bengali-test': JQrels.MRTYDI_V11_BN_TEST,
'mrtydi-v1.1-english-train': JQrels.MRTYDI_V11_EN_TRAIN,
'mrtydi-v1.1-english-dev': JQrels.MRTYDI_V11_EN_DEV,
'mrtydi-v1.1-english-test': JQrels.MRTYDI_V11_EN_TEST,
'mrtydi-v1.1-finnish-train': JQrels.MRTYDI_V11_FI_TRAIN,
'mrtydi-v1.1-finnish-dev': JQrels.MRTYDI_V11_FI_DEV,
'mrtydi-v1.1-finnish-test': JQrels.MRTYDI_V11_FI_TEST,
'mrtydi-v1.1-indonesian-train': JQrels.MRTYDI_V11_ID_TRAIN,
'mrtydi-v1.1-indonesian-dev': JQrels.MRTYDI_V11_ID_DEV,
'mrtydi-v1.1-indonesian-test': JQrels.MRTYDI_V11_ID_TEST,
'mrtydi-v1.1-japanese-train': JQrels.MRTYDI_V11_JA_TRAIN,
'mrtydi-v1.1-japanese-dev': JQrels.MRTYDI_V11_JA_DEV,
'mrtydi-v1.1-japanese-test': JQrels.MRTYDI_V11_JA_TEST,
'mrtydi-v1.1-korean-train': JQrels.MRTYDI_V11_KO_TRAIN,
'mrtydi-v1.1-korean-dev': JQrels.MRTYDI_V11_KO_DEV,
'mrtydi-v1.1-korean-test': JQrels.MRTYDI_V11_KO_TEST,
'mrtydi-v1.1-russian-train': JQrels.MRTYDI_V11_RU_TRAIN,
'mrtydi-v1.1-russian-dev': JQrels.MRTYDI_V11_RU_DEV,
'mrtydi-v1.1-russian-test': JQrels.MRTYDI_V11_RU_TEST,
'mrtydi-v1.1-swahili-train': JQrels.MRTYDI_V11_SW_TRAIN,
'mrtydi-v1.1-swahili-dev': JQrels.MRTYDI_V11_SW_DEV,
'mrtydi-v1.1-swahili-test': JQrels.MRTYDI_V11_SW_TEST,
'mrtydi-v1.1-telugu-train': JQrels.MRTYDI_V11_TE_TRAIN,
'mrtydi-v1.1-telugu-dev': JQrels.MRTYDI_V11_TE_DEV,
'mrtydi-v1.1-telugu-test': JQrels.MRTYDI_V11_TE_TEST,
'mrtydi-v1.1-thai-train': JQrels.MRTYDI_V11_TH_TRAIN,
'mrtydi-v1.1-thai-dev': JQrels.MRTYDI_V11_TH_DEV,
'mrtydi-v1.1-thai-test': JQrels.MRTYDI_V11_TH_TEST,
'beir-v1.0.0-trec-covid-test': JQrels.BEIR_V1_0_0_TREC_COVID_TEST,
'beir-v1.0.0-bioasq-test': JQrels.BEIR_V1_0_0_BIOASQ_TEST,
'beir-v1.0.0-nfcorpus-test': JQrels.BEIR_V1_0_0_NFCORPUS_TEST,
'beir-v1.0.0-nq-test': JQrels.BEIR_V1_0_0_NQ_TEST,
'beir-v1.0.0-hotpotqa-test': JQrels.BEIR_V1_0_0_HOTPOTQA_TEST,
'beir-v1.0.0-fiqa-test': JQrels.BEIR_V1_0_0_FIQA_TEST,
'beir-v1.0.0-signal1m-test': JQrels.BEIR_V1_0_0_SIGNAL1M_TEST,
'beir-v1.0.0-trec-news-test': JQrels.BEIR_V1_0_0_TREC_NEWS_TEST,
'beir-v1.0.0-robust04-test': JQrels.BEIR_V1_0_0_ROBUST04_TEST,
'beir-v1.0.0-arguana-test': JQrels.BEIR_V1_0_0_ARGUANA_TEST,
'beir-v1.0.0-webis-touche2020-test': JQrels.BEIR_V1_0_0_WEBIS_TOUCHE2020_TEST,
'beir-v1.0.0-cqadupstack-android-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_ANDROID_TEST,
'beir-v1.0.0-cqadupstack-english-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_ENGLISH_TEST,
'beir-v1.0.0-cqadupstack-gaming-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_GAMING_TEST,
'beir-v1.0.0-cqadupstack-gis-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_GIS_TEST,
'beir-v1.0.0-cqadupstack-mathematica-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_MATHEMATICA_TEST,
'beir-v1.0.0-cqadupstack-physics-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_PHYSICS_TEST,
'beir-v1.0.0-cqadupstack-programmers-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_PROGRAMMERS_TEST,
'beir-v1.0.0-cqadupstack-stats-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_STATS_TEST,
'beir-v1.0.0-cqadupstack-tex-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_TEX_TEST,
'beir-v1.0.0-cqadupstack-unix-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_UNIX_TEST,
'beir-v1.0.0-cqadupstack-webmasters-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_WEBMASTERS_TEST,
'beir-v1.0.0-cqadupstack-wordpress-test': JQrels.BEIR_V1_0_0_CQADUPSTACK_WORDPRESS_TEST,
'beir-v1.0.0-quora-test': JQrels.BEIR_V1_0_0_QUORA_TEST,
'beir-v1.0.0-dbpedia-entity-test': JQrels.BEIR_V1_0_0_DBPEDIA_ENTITY_TEST,
'beir-v1.0.0-scidocs-test': JQrels.BEIR_V1_0_0_SCIDOCS_TEST,
'beir-v1.0.0-fever-test': JQrels.BEIR_V1_0_0_FEVER_TEST,
'beir-v1.0.0-climate-fever-test': JQrels.BEIR_V1_0_0_CLIMATE_FEVER_TEST,
'beir-v1.0.0-scifact-test': JQrels.BEIR_V1_0_0_SCIFACT_TEST,
'hc4-v1.0-fa-dev': JQrels.HC4_V1_0_FA_DEV,
'hc4-v1.0-fa-test': JQrels.HC4_V1_0_FA_TEST,
'hc4-v1.0-ru-dev': JQrels.HC4_V1_0_RU_DEV,
'hc4-v1.0-ru-test': JQrels.HC4_V1_0_RU_TEST,
'hc4-v1.0-zh-dev': JQrels.HC4_V1_0_ZH_DEV,
'hc4-v1.0-zh-test': JQrels.HC4_V1_0_ZH_TEST,
'hc4-neuclir22-fa-test': JQrels.HC4_NEUCLIR22_FA_TEST,
'hc4-neuclir22-ru-test': JQrels.HC4_NEUCLIR22_RU_TEST,
'hc4-neuclir22-zh-test': JQrels.HC4_NEUCLIR22_ZH_TEST,
'miracl-v1.0-ar-dev': JQrels.MIRACL_V10_AR_DEV,
'miracl-v1.0-bn-dev': JQrels.MIRACL_V10_BN_DEV,
'miracl-v1.0-en-dev': JQrels.MIRACL_V10_EN_DEV,
'miracl-v1.0-es-dev': JQrels.MIRACL_V10_ES_DEV,
'miracl-v1.0-fa-dev': JQrels.MIRACL_V10_FA_DEV,
'miracl-v1.0-fi-dev': JQrels.MIRACL_V10_FI_DEV,
'miracl-v1.0-fr-dev': JQrels.MIRACL_V10_FR_DEV,
'miracl-v1.0-hi-dev': JQrels.MIRACL_V10_HI_DEV,
'miracl-v1.0-id-dev': JQrels.MIRACL_V10_ID_DEV,
'miracl-v1.0-ja-dev': JQrels.MIRACL_V10_JA_DEV,
'miracl-v1.0-ko-dev': JQrels.MIRACL_V10_KO_DEV,
'miracl-v1.0-ru-dev': JQrels.MIRACL_V10_RU_DEV,
'miracl-v1.0-sw-dev': JQrels.MIRACL_V10_SW_DEV,
'miracl-v1.0-te-dev': JQrels.MIRACL_V10_TE_DEV,
'miracl-v1.0-th-dev': JQrels.MIRACL_V10_TH_DEV,
'miracl-v1.0-zh-dev': JQrels.MIRACL_V10_ZH_DEV,
'miracl-v1.0-de-dev': JQrels.MIRACL_V10_DE_DEV,
'miracl-v1.0-yo-dev': JQrels.MIRACL_V10_YO_DEV,
}
def get_topics(collection_name):
"""
Parameters
----------
collection_name : str
collection_name
Returns
-------
result : dictionary
Topics as a dictionary
"""
if collection_name not in topics_mapping:
raise ValueError(f'Topic {collection_name} Not Found')
topics = JTopicReader.getTopicsWithStringIds(topics_mapping[collection_name])
t = {}
for topic in topics.keySet().toArray():
if topic.isdigit():
# parse the keys into integers
topic_key = int(topic)
else:
topic_key = topic
t[topic_key] = {}
for key in topics.get(topic).keySet().toArray():
t[topic_key][key] = topics.get(topic).get(key)
return t
def get_topics_with_reader(reader_class, file):
# Yes, this is an insanely ridiculous method name.
topics = JTopicReader.getTopicsWithStringIdsFromFileWithTopicReaderClass(reader_class, file)
if topics is None:
raise ValueError(f'Unable to initialize TopicReader {reader_class} with file {file}!')
t = {}
for topic in topics.keySet().toArray():
if topic.isdigit():
# parse the keys into integers
topic_key = int(topic)
else:
topic_key = topic
t[topic_key] = {}
for key in topics.get(topic).keySet().toArray():
t[topic_key][key] = topics.get(topic).get(key)
return t
def get_qrels_file(collection_name):
"""
Parameters
----------
collection_name : str
collection_name
Returns
-------
path : str
path of the qrels file
"""
if collection_name in qrels_mapping:
qrels = qrels_mapping[collection_name]
target_path = os.path.join(get_cache_home(), qrels.path)
if os.path.exists(target_path):
return target_path
target_dir = os.path.split(target_path)[0]
if not os.path.exists(target_dir):
os.makedirs(target_dir)
with open(target_path, 'w') as file:
qrels_content = JRelevanceJudgments.getQrelsResource(qrels)
file.write(qrels_content)
return target_path
raise FileNotFoundError(f'no qrels file for {collection_name}')
def get_qrels(collection_name):
"""
Parameters
----------
collection_name : str
collection_name
Returns
-------
result : dictionary
qrels as a dictionary
"""
file_path = get_qrels_file(collection_name)
qrels = {}
with open(file_path, 'r') as f:
for line in f:
qid, _, docid, judgement = line.rstrip().split()
if qid.isdigit():
qrels_key = int(qid)
else:
qrels_key = qid
if docid.isdigit():
doc_key = int(docid)
else:
doc_key = docid
if qrels_key in qrels:
qrels[qrels_key][doc_key] = judgement
else:
qrels[qrels_key] = {doc_key: judgement}
return qrels