#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import shutil
import unittest
from typing import List

from pyserini.index.lucene import LuceneIndexer, IndexReader, JacksonObjectMapper
from pyserini.search.lucene import JLuceneSearcherResult, LuceneSearcher


class TestSearch(unittest.TestCase):
def setUp(self):
self.docs = []
self.tmp_dir = "temp_dir"
        # The current directory depends on whether you're running inside an IDE or from the command line.
curdir = os.getcwd()
if curdir.endswith('tests'):
self.test_file = '../tests/resources/simple_cacm_corpus.json'
else:
self.test_file = 'tests/resources/simple_cacm_corpus.json'

    def test_indexer(self):
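        # Index the three-document CACM sample one record at a time with
        # add_doc_raw(), then check the document count and a known BM25 hit.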
indexer = LuceneIndexer(self.tmp_dir)
with open(self.test_file) as f:
for doc in f:
indexer.add_doc_raw(doc)
indexer.close()
searcher = LuceneSearcher(self.tmp_dir)
self.assertEqual(3, searcher.num_docs)
hits = searcher.search('semantic networks')
self.assertTrue(isinstance(hits, List))
self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
self.assertEqual(1, len(hits))
self.assertEqual('CACM-2274', hits[0].docid)
self.assertAlmostEqual(1.53650, hits[0].score, places=5)

    def test_indexer_batch1(self):
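        # Index the same corpus with add_batch_raw(), while also exercising
        # different ways of constructing the indexer.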
batch = []
with open(self.test_file) as f:
for doc in f:
batch.append(doc)

        # Default initialization: just the index directory.
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_batch_raw(batch)
indexer.close()
searcher = LuceneSearcher(self.tmp_dir)
self.assertEqual(3, searcher.num_docs)
hits = searcher.search('semantic networks')
self.assertTrue(isinstance(hits, List))
self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
self.assertEqual(1, len(hits))
self.assertEqual('CACM-2274', hits[0].docid)
self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Initialization with an explicit thread count.
indexer = LuceneIndexer(self.tmp_dir, threads=2)
indexer.add_batch_raw(batch)
indexer.close()
searcher = LuceneSearcher(self.tmp_dir)
self.assertEqual(3, searcher.num_docs)
hits = searcher.search('semantic networks')
self.assertTrue(isinstance(hits, List))
self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
self.assertEqual(1, len(hits))
self.assertEqual('CACM-2274', hits[0].docid)
self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Initialization with a larger thread count.
indexer = LuceneIndexer(self.tmp_dir, threads=4)
indexer.add_batch_raw(batch)
indexer.close()
searcher = LuceneSearcher(self.tmp_dir)
self.assertEqual(3, searcher.num_docs)
hits = searcher.search('semantic networks')
self.assertTrue(isinstance(hits, List))
self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
self.assertEqual(1, len(hits))
self.assertEqual('CACM-2274', hits[0].docid)
self.assertAlmostEqual(1.53650, hits[0].score, places=5)

        # Initialization via raw command-line args.
indexer = LuceneIndexer(args=['-index', self.tmp_dir, '-threads', '4'])
indexer.add_batch_raw(batch)
indexer.close()
searcher = LuceneSearcher(self.tmp_dir)
self.assertEqual(3, searcher.num_docs)
hits = searcher.search('semantic networks')
self.assertTrue(isinstance(hits, List))
self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
self.assertEqual(1, len(hits))
self.assertEqual('CACM-2274', hits[0].docid)
self.assertAlmostEqual(1.53650, hits[0].score, places=5)

    def test_indexer_with_args(self):
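        # Pass extra indexing options as raw args; -pretokenized treats the
        # contents as already tokenized (no stemming or stopword removal),
        # which is why the expected BM25 score differs from the tests above.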
indexer = LuceneIndexer(args=['-index', self.tmp_dir, '-pretokenized'])
with open(self.test_file) as f:
for doc in f:
indexer.add_doc_raw(doc)
indexer.close()
searcher = LuceneSearcher(self.tmp_dir)
self.assertEqual(3, searcher.num_docs)
hits = searcher.search('semantic networks')
self.assertTrue(isinstance(hits, List))
self.assertTrue(isinstance(hits[0], JLuceneSearcherResult))
self.assertEqual(1, len(hits))
self.assertEqual('CACM-2274', hits[0].docid)
self.assertAlmostEqual(0.62610, hits[0].score, places=5)

    def test_indexer_append1(self):
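        # Build an index with one document, then reopen it with append=True
        # and check that a second document is added instead of overwriting.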
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_doc_raw('{"id": "0", "contents": "Document 0"}')
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(1, stats['documents'])
self.assertIsNotNone(reader.doc('0'))
indexer = LuceneIndexer(self.tmp_dir, append=True)
indexer.add_doc_raw('{"id": "1", "contents": "Document 1"}')
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('0'))
self.assertIsNotNone(reader.doc('1'))

    def test_indexer_append2(self):
# Make sure it's okay if we append to an empty index.
indexer = LuceneIndexer(self.tmp_dir, append=True)
indexer.add_doc_raw('{"id": "0", "contents": "Document 0"}')
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(1, stats['documents'])
self.assertIsNotNone(reader.doc('0'))

        # Opening without append=True should overwrite the existing index.
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_doc_raw('{"id": "1", "contents": "Document 1"}')
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(1, stats['documents'])
self.assertIsNone(reader.doc('0'))
self.assertIsNotNone(reader.doc('1'))

        # Now we're appending.
indexer = LuceneIndexer(self.tmp_dir, append=True)
indexer.add_doc_raw('{"id": "x", "contents": "Document x"}')
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNone(reader.doc('0'))
self.assertIsNotNone(reader.doc('1'))
self.assertIsNotNone(reader.doc('x'))

    def test_indexer_type_raw(self):
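        # The test_indexer_type_* tests feed the same two documents through
        # each input type the indexer accepts; this one uses raw JSON strings.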
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_doc_raw('{"id": "doc0", "contents": "document 0 contents"}')
indexer.add_doc_raw('{"id": "doc1", "contents": "document 1 contents"}')
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('doc0'))
self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_raw_batch(self):
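        # Raw JSON strings again, submitted as a single batch.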
batch = ['{"id": "doc0", "contents": "document 0 contents"}',
'{"id": "doc1", "contents": "document 1 contents"}']
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_batch_raw(batch)
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('doc0'))
self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_dict(self):
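        # Python dicts, handed to the indexer via add_doc_dict().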
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_doc_dict({'id': 'doc0', 'contents': 'document 0 contents'})
indexer.add_doc_dict({'id': 'doc1', 'contents': 'document 1 contents'})
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('doc0'))
self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_dict_batch(self):
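        # Python dicts submitted as a single batch.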
batch = [{'id': 'doc0', 'contents': 'document 0 contents'},
{'id': 'doc1', 'contents': 'document 1 contents'}]
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_batch_dict(batch)
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('doc0'))
self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_json(self):
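        # Jackson ObjectNode instances built through the Java bridge.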
mapper = JacksonObjectMapper()
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_doc_json(mapper.createObjectNode().put('id', 'doc0').put('contents', 'document 0 contents'))
indexer.add_doc_json(mapper.createObjectNode().put('id', 'doc1').put('contents', 'document 1 contents'))
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('doc0'))
self.assertIsNotNone(reader.doc('doc1'))

    def test_indexer_type_json_batch(self):
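        # Jackson ObjectNode instances submitted as a single batch.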
mapper = JacksonObjectMapper()
batch = [mapper.createObjectNode().put('id', 'doc0').put('contents', 'document 0 contents'),
mapper.createObjectNode().put('id', 'doc1').put('contents', 'document 1 contents')]
indexer = LuceneIndexer(self.tmp_dir)
indexer.add_batch_json(batch)
indexer.close()
reader = IndexReader(self.tmp_dir)
stats = reader.stats()
self.assertEqual(2, stats['documents'])
self.assertIsNotNone(reader.doc('doc0'))
self.assertIsNotNone(reader.doc('doc1'))

    def tearDown(self):
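        # Remove the temporary index directory after each test.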
shutil.rmtree(self.tmp_dir)
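

if __name__ == '__main__':
    unittest.main()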