|
import os |
|
# Process-wide runtime switches. They are applied ahead of the colbert imports
# further down, presumably so they are already in place when those libraries
# initialise.
os.environ.update(
    {
        # Hugging Face tokenizers: switch off its internal parallelism.
        "TOKENIZERS_PARALLELISM": "false",
        # Intel OpenMP runtime: tolerate more than one copy of the library
        # being loaded into the process instead of aborting.
        "KMP_DUPLICATE_LIB_OK": "TRUE",
    }
)
|
|
|
import json |
|
from colbert import Indexer, Searcher |
|
from colbert.infra import Run, RunConfig, ColBERTConfig |
|
|
|
# --- Index name and input files ---------------------------------------------
# Name handed to both the indexing and the search step by the __main__ block.
INDEX_NAME = "index"
# Not referenced by any code visible in this file.
ANTHOLOGY_PATH = "anthology.bib"
# JSON file parsed in the __main__ block and used as the collection.
COLLECTION_PATH = "collection.json"
# JSON file parsed in the __main__ block.
DATASET_PATH = "dataset.json"

# --- ColBERT settings ---------------------------------------------------------
# Forwarded to ColBERTConfig(nbits=...) when indexing.
nbits = 2
# Forwarded to ColBERTConfig(doc_maxlen=...) when indexing.
doc_maxlen = 300
# Model checkpoint identifier given to Indexer.
checkpoint = "colbert-ir/colbertv2.0"
|
|
|
|
|
def index_anthology(collection, index_name='index'):
    """Build a ColBERT index called ``index_name`` over ``collection``.

    Args:
        collection: Passages to index; handed unchanged to ``Indexer.index``.
        index_name: Name the index is stored under. ``search_anthology``
            passes the same name to ``Searcher`` to open it again.

    Returns:
        None. The index is written to disk by ColBERT; an existing index with
        the same name is overwritten (``overwrite=True``).
    """
    with Run().context(RunConfig(nranks=1, experiment='notebook')):
        # Deliberately no ``index_path`` override. A fixed ``index_path``
        # pinned every index to one directory regardless of ``index_name``
        # and, per ColBERT's path resolution (confirm against the installed
        # version), placed it outside the run's index root, which is where
        # ``Searcher(index=index_name)`` looks the index up by name.
        config = ColBERTConfig(
            doc_maxlen=doc_maxlen,
            nbits=nbits,
            kmeans_niters=4,
        )
        indexer = Indexer(checkpoint=checkpoint, config=config)
        indexer.index(name=index_name, collection=collection, overwrite=True)
|
|
|
|
|
def search_anthology(collection, index_name=INDEX_NAME, queries=None, k=3):
    """Query an existing ColBERT index and print the top hits per query.

    Args:
        collection: Passed to ``Searcher``; the matching entries are printed
            next to each hit via ``searcher.collection``.
        index_name: Name of the index to open.
        queries: Query string or iterable of query strings. ``None`` (the
            default) runs the single built-in sample question.
        k: Number of results requested per query.

    Returns:
        None. Results are written to stdout.
    """
    if queries is None:
        queries = ["What are some recent examples of grammar checkers?"]
    elif isinstance(queries, str):
        # A bare string would otherwise be iterated character by character.
        queries = [queries]

    # NOTE(review): nranks=0 is kept as found; index_anthology uses nranks=1.
    # Confirm this value is intentional.
    with Run().context(RunConfig(nranks=0, experiment='notebook')):
        searcher = Searcher(index=index_name, collection=collection)

    for query in queries:
        print(f"#> {query}")
        results = searcher.search(query, k=k)

        # zip(*results) yields one (id, rank, score) triple per hit.
        for passage_id, passage_rank, passage_score in zip(*results):
            print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")

        # Raw search result, printed as returned by the searcher.
        print(results)
|
|
|
|
|
if __name__ == '__main__':
    # Parse both JSON inputs up front.
    with open(COLLECTION_PATH, mode='r', encoding='utf-8') as collection_file:
        collection = json.load(collection_file)
    with open(DATASET_PATH, mode='r', encoding='utf-8') as dataset_file:
        # Loaded here but not used by the two calls below.
        dataset = json.load(dataset_file)

    # Index the collection, then search that same index.
    index_anthology(collection, index_name=INDEX_NAME)
    search_anthology(collection, index_name=INDEX_NAME)