#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil
import unittest
from typing import List

from pyserini.index.lucene import LuceneIndexer, IndexReader, JacksonObjectMapper
from pyserini.search.lucene import JLuceneSearcherResult, LuceneSearcher


class TestSearch(unittest.TestCase):
    def setUp(self):
        self.docs = []
        self.tmp_dir = "temp_dir"

        # The current directory depends on whether you're running inside an IDE or from the command line.
        curdir = os.getcwd()
        if curdir.endswith('tests'):
            self.test_file = '../tests/resources/simple_cacm_corpus.json'
        else:
            self.test_file = 'tests/resources/simple_cacm_corpus.json'

    def test_indexer(self):
        indexer = LuceneIndexer(self.tmp_dir)
        with open(self.test_file) as f:
            for doc in f:
                indexer.add_doc_raw(doc)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

    def test_indexer_batch1(self):
        batch = []
        with open(self.test_file) as f:
            for doc in f:
                batch.append(doc)

        # Test different ways to initialize the indexer.
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Test different ways to initialize the indexer.
        indexer = LuceneIndexer(self.tmp_dir, threads=2)
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Test different ways to initialize the indexer.
        indexer = LuceneIndexer(self.tmp_dir, threads=4)
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Test different ways to initialize the indexer.
        indexer = LuceneIndexer(args=['-index', self.tmp_dir, '-threads', '4'])
        indexer.add_batch_raw(batch)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(1.53650, hits[0].score, places=5)

    def test_indexer_with_args(self):
        # With '-pretokenized', contents are indexed as whitespace-delimited tokens
        # (no stemming or stopword removal), so the expected BM25 score differs
        # from the tests above.
        indexer = LuceneIndexer(args=['-index', self.tmp_dir, '-pretokenized'])
        with open(self.test_file) as f:
            for doc in f:
                indexer.add_doc_raw(doc)
        indexer.close()

        searcher = LuceneSearcher(self.tmp_dir)
        self.assertEqual(3, searcher.num_docs)

        hits = searcher.search('semantic networks')
        self.assertTrue(isinstance(hits, List))
        self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
        self.assertEqual(1, len(hits))
        self.assertEqual('CACM-2274', hits[0].docid)
        self.assertAlmostEqual(0.62610, hits[0].score, places=5)

    def test_indexer_append1(self):
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_raw('{"id": "0", "contents": "Document 0"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(1, stats['documents'])
        self.assertIsNotNone(reader.doc('0'))

        indexer = LuceneIndexer(self.tmp_dir, append=True)
        indexer.add_doc_raw('{"id": "1", "contents": "Document 1"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('0'))
        self.assertIsNotNone(reader.doc('1'))

    def test_indexer_append2(self):
        # Make sure it's okay if we append to an empty index.
        indexer = LuceneIndexer(self.tmp_dir, append=True)
        indexer.add_doc_raw('{"id": "0", "contents": "Document 0"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(1, stats['documents'])
        self.assertIsNotNone(reader.doc('0'))

        # Confirm that we are overwriting.
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_raw('{"id": "1", "contents": "Document 1"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(1, stats['documents'])
        self.assertIsNone(reader.doc('0'))
        self.assertIsNotNone(reader.doc('1'))

        # Now we're appending.
        indexer = LuceneIndexer(self.tmp_dir, append=True)
        indexer.add_doc_raw('{"id": "x", "contents": "Document x"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNone(reader.doc('0'))
        self.assertIsNotNone(reader.doc('1'))
        self.assertIsNotNone(reader.doc('x'))

    def test_indexer_type_raw(self):
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_raw('{"id": "doc0", "contents": "document 0 contents"}')
        indexer.add_doc_raw('{"id": "doc1", "contents": "document 1 contents"}')
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_raw_batch(self):
        batch = ['{"id": "doc0", "contents": "document 0 contents"}',
                 '{"id": "doc1", "contents": "document 1 contents"}']
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_raw(batch)
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_dict(self):
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_dict({'id': 'doc0', 'contents': 'document 0 contents'})
        indexer.add_doc_dict({'id': 'doc1', 'contents': 'document 1 contents'})
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_dict_batch(self):
        batch = [{'id': 'doc0', 'contents': 'document 0 contents'},
                 {'id': 'doc1', 'contents': 'document 1 contents'}]
        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_dict(batch)
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_json(self):
        mapper = JacksonObjectMapper()

        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_doc_json(mapper.createObjectNode().put('id', 'doc0').put('contents', 'document 0 contents'))
        indexer.add_doc_json(mapper.createObjectNode().put('id', 'doc1').put('contents', 'document 1 contents'))
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_json_batch(self):
        mapper = JacksonObjectMapper()
        batch = [mapper.createObjectNode().put('id', 'doc0').put('contents', 'document 0 contents'),
                 mapper.createObjectNode().put('id', 'doc1').put('contents', 'document 1 contents')]

        indexer = LuceneIndexer(self.tmp_dir)
        indexer.add_batch_json(batch)
        indexer.close()

        reader = IndexReader(self.tmp_dir)
        stats = reader.stats()
        self.assertEqual(2, stats['documents'])
        self.assertIsNotNone(reader.doc('doc0'))
        self.assertIsNotNone(reader.doc('doc1'))

    def tearDown(self):
        shutil.rmtree(self.tmp_dir)
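
# Standard unittest entry point so this module can also be run directly
# from the command line, not only via a test runner.
if __name__ == '__main__':
    unittest.main()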