#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import sys

# Use Pyserini in this repo (as opposed to pip install)
sys.path.insert(0, './')

from pyserini.prebuilt_index_info import TF_INDEX_INFO_CURRENT, IMPACT_INDEX_INFO_CURRENT, FAISS_INDEX_INFO

# Markdown preamble printed verbatim before the generated index listings.
__boilerplate__ = '''
# Pyserini: Prebuilt Indexes

Pyserini provides a number of pre-built Lucene indexes.
To list what's available in code:

```python
from pyserini.search.lucene import LuceneSearcher
LuceneSearcher.list_prebuilt_indexes()

from pyserini.index.lucene import IndexReader
IndexReader.list_prebuilt_indexes()
```

It's easy initialize a searcher from a pre-built index:

```python
searcher = LuceneSearcher.from_prebuilt_index('robust04')
```

You can use this simple Python one-liner to download the pre-built index:

```
python -c "from pyserini.search.lucene import LuceneSearcher; LuceneSearcher.from_prebuilt_index('robust04')"
```

The downloaded index will be in `~/.cache/pyserini/indexes/`.

It's similarly easy initialize an index reader from a pre-built index:

```python
index_reader = IndexReader.from_prebuilt_index('robust04')
index_reader.stats()
```

The output will be:

```
{'total_terms': 174540872, 'documents': 528030, 'non_empty_documents': 528030, 'unique_terms': 923436}
```

Note that unless the underlying index was built with the `-optimize` option (i.e., merging all index segments into a single segment), `unique_terms` will show -1.
Nope, that's not a bug.

Below is a summary of the pre-built indexes that are currently available.

Detailed configuration information for the pre-built indexes are stored in [`pyserini/prebuilt_index_info.py`](../pyserini/prebuilt_index_info.py).
'''


def generate_prebuilt(index):
    """Print a documentation listing for a table of pre-built indexes.

    For every entry the name is printed, followed by a ``[readme]`` marker
    when the entry's metadata has a ``readme`` key, followed by the entry's
    description. Output goes to stdout.

    Args:
        index: Mapping from index name to a metadata mapping. Each metadata
            mapping must provide a ``description`` key; ``readme`` is
            optional.

    Raises:
        KeyError: If an entry's metadata has no ``description`` key.
    """
    # NOTE(review): in the reviewed copy these string literals were split
    # across physical lines, which is a syntax error for single-quoted
    # strings. The line breaks are kept here as explicit '\n' escapes.
    # The comment inside the loop suggests the strings originally carried
    # HTML markup that is not present in this copy -- TODO confirm against
    # version control and restore the tags if so.
    print('\n')
    for entry, info in index.items():
        # No, this is not an HTML bug. This is intentional to get GitHub formatting to not add italics to the entry.
        print(f'\n{entry}')
        if 'readme' in info:
            print('[readme]')
        print(f'\n{info["description"]}')
        print('\n')
    print('\n')


if __name__ == '__main__':
    print(__boilerplate__)
    print('\n\n## Standard Lucene Indexes')
    generate_prebuilt(TF_INDEX_INFO_CURRENT)
    print('\n\n## Lucene Impact Indexes')
    generate_prebuilt(IMPACT_INDEX_INFO_CURRENT)
    print('\n\n## Faiss Indexes')
    generate_prebuilt(FAISS_INDEX_INFO)