File size: 1,280 Bytes
d6585f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import logging
import os
from argparse import ArgumentParser

from transformers import AutoTokenizer

from pyserini.search import SimpleSearcher
from pyserini.analysis import JWhiteSpaceAnalyzer


# Name the logger after this script's file name rather than the module name,
# so log lines identify the script that emitted them. INFO and above are
# accepted by this logger.
logger = logging.getLogger(os.path.basename(__file__))
logger.setLevel(logging.INFO)


def main(args):
    """Search an index for a query and print the ranked hits.

    Args:
        args: Parsed command-line namespace providing ``query`` (the query
            string), ``index`` (passed to ``SimpleSearcher``) and
            ``do_tokenize`` (when truthy, the query is tokenized with the
            multilingual BERT tokenizer and re-joined with single spaces
            before searching).

    Prints one line per hit to stdout: 1-based rank, document id and score
    (five decimal places).
    """
    query = args.query
    index = args.index
    if args.do_tokenize:
        # The published Hugging Face checkpoint id is
        # 'bert-base-multilingual-uncased'; the previous spelling
        # ('bert-multilingual-base-uncased') does not resolve to a model.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
        query = " ".join(tokenizer.tokenize(query))

    logger.info(f'searching for: {query}')
    searcher = SimpleSearcher(index)
    # Whitespace analysis keeps the (possibly pre-tokenized) query tokens
    # exactly as they are separated by spaces.
    searcher.set_analyzer(JWhiteSpaceAnalyzer())
    hits = searcher.search(query, 1000)

    for rank, hit in enumerate(hits, start=1):
        print(f'{rank:2} {hit.docid:4} {hit.score:.5f}')


def build_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``ArgumentParser`` with the required ``--query``/``-q`` and
        ``--index``/``-i`` string options and the optional
        ``--do-tokenize``/``-t`` switch, which is ``False`` unless given.
    """
    parser = ArgumentParser()
    parser.add_argument('--query', '-q', type=str, required=True, help="The query to search in the index")
    parser.add_argument('--index', '-i', type=str, required=True, help="Path to the anserini index directory")
    # 'store_true' so that passing the flag ENABLES tokenization; the previous
    # 'store_false' made the flag disable it, contrary to its name and help.
    parser.add_argument('--do-tokenize', '-t', action='store_true', help="Whether to perform mbert tokenization on the query")
    return parser


if __name__ == "__main__":
    # Install a root handler so this script's INFO messages are actually
    # emitted; without any handler, logging's last-resort handler only shows
    # WARNING and above. The root level is left at its default.
    logging.basicConfig()
    main(build_parser().parse_args())