davidheineman
/

colbert-acl

Model card Files and versions Community

davidheineman committed on Apr 14

Commit

d2f9318

•

1 Parent(s): 8b805bb

fix indexing path

Files changed (1) hide show

index.py +17 -7

index.py CHANGED Viewed

@@ -6,23 +6,33 @@ import json
 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 INDEX_NAME      = 'index'
 ANTHOLOGY_PATH  = 'anthology.bib'
 COLLECTION_PATH = 'collection.json'
 DATASET_PATH    = 'dataset.json'
-nbits = 2          # encode each dimension with 2 bits
 doc_maxlen = 300   # truncate passages at 300 tokens
 checkpoint = 'colbert-ir/colbertv2.0' # ColBERT model to use
 def index_anthology(collection, index_name='index'):
-    with Run().context(RunConfig(nranks=1, experiment='notebook')):                 # nranks specifies the number of GPUs to use
-        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4) # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
-        indexer = Indexer(checkpoint=checkpoint, config=config)
-        indexer.index(name=index_name, collection=collection, overwrite=True)
 def search_anthology(collection, index_name=INDEX_NAME):

 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 INDEX_NAME      = 'index'
 ANTHOLOGY_PATH  = 'anthology.bib'
 COLLECTION_PATH = 'collection.json'
 DATASET_PATH    = 'dataset.json'
+nbits      = 2     # encode each dimension with 2 bits
 doc_maxlen = 300   # truncate passages at 300 tokens
 checkpoint = 'colbert-ir/colbertv2.0' # ColBERT model to use
 def index_anthology(collection, index_name='index'):
+    with Run().context(RunConfig(nranks=1, experiment='notebook')): # nranks specifies the number of GPUs to use
+        config = ColBERTConfig(
+            doc_maxlen=doc_maxlen,
+            nbits=nbits,
+            kmeans_niters=4, # specifies the number of iterations of k-means clustering; 4 is a good and fast default.
+            index_path=INDEX_NAME
+        )
+        indexer = Indexer(
+            checkpoint=checkpoint,
+            config=config
+        )
+        indexer.index(
+            name=index_name,
+            collection=collection,
+            overwrite=True
+        )
 def search_anthology(collection, index_name=INDEX_NAME):