import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevents deadlocks in ColBERT tokenization
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # Allows duplicate OpenMP runtime libraries to be loaded. This can cause unexpected behavior, but allows ColBERT to work

import json

from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig

INDEX_NAME = 'index'
ANTHOLOGY_PATH = 'anthology.bib'
COLLECTION_PATH = 'collection.json'
DATASET_PATH = 'dataset.json'

nbits = 2  # encode each dimension with 2 bits
doc_maxlen = 300  # truncate passages at 300 tokens
checkpoint = 'colbert-ir/colbertv2.0'  # ColBERT model to use


def index_anthology(collection, index_name='index'):
    # nranks specifies the number of GPUs to use
    with Run().context(RunConfig(nranks=1, experiment='notebook')):
        config = ColBERTConfig(
            doc_maxlen=doc_maxlen,
            nbits=nbits,
            kmeans_niters=4,  # number of k-means clustering iterations; 4 is a good and fast default
            index_path=INDEX_NAME,
        )
        indexer = Indexer(
            checkpoint=checkpoint,
            config=config,
        )
        indexer.index(
            name=index_name,
            collection=collection,
            overwrite=True,
        )


def search_anthology(collection, index_name=INDEX_NAME):
    with Run().context(RunConfig(nranks=0, experiment='notebook')):
        searcher = Searcher(index=index_name, collection=collection)

        queries = ["What are some recent examples of grammar checkers?"]
        for query in queries:
            print(f"#> {query}")
            results = searcher.search(query, k=3)  # Find the top-3 passages for this query

            # Print out the top-k retrieved passages
            for passage_id, passage_rank, passage_score in zip(*results):
                print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")
            print(results)


if __name__ == '__main__':
    # Load the parsed anthology
    with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
        collection = json.loads(f.read())

    with open(DATASET_PATH, 'r', encoding='utf-8') as f:
        dataset = json.loads(f.read())

    index_anthology(collection, index_name=INDEX_NAME)
    search_anthology(collection, index_name=INDEX_NAME)
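
# ---------------------------------------------------------------------------
# Note on the input format (an assumption, not part of the original script):
# ColBERT's Indexer and Searcher accept the collection as a plain list of
# passage strings, so collection.json is assumed here to be a flat JSON array
# of strings. A minimal file for a quick smoke test could be written like this:
#
#     import json
#     passages = [
#         "Paper title. Abstract of the first anthology entry ...",
#         "Paper title. Abstract of the second anthology entry ...",
#     ]
#     with open('collection.json', 'w', encoding='utf-8') as f:
#         json.dump(passages, f)
# ---------------------------------------------------------------------------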