davidheineman committed on
Commit
d2f9318
1 Parent(s): 8b805bb

fix indexing path

Browse files
Files changed (1) hide show
  1. index.py +17 -7
index.py CHANGED
@@ -6,23 +6,33 @@ import json
6
  from colbert import Indexer, Searcher
7
  from colbert.infra import Run, RunConfig, ColBERTConfig
8
 
9
-
10
  INDEX_NAME = 'index'
11
  ANTHOLOGY_PATH = 'anthology.bib'
12
  COLLECTION_PATH = 'collection.json'
13
  DATASET_PATH = 'dataset.json'
14
 
15
-
16
- nbits = 2 # encode each dimension with 2 bits
17
  doc_maxlen = 300 # truncate passages at 300 tokens
18
  checkpoint = 'colbert-ir/colbertv2.0' # ColBERT model to use
19
 
20
 
21
  def index_anthology(collection, index_name='index'):
22
- with Run().context(RunConfig(nranks=1, experiment='notebook')): # nranks specifies the number of GPUs to use
23
- config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4) # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
24
- indexer = Indexer(checkpoint=checkpoint, config=config)
25
- indexer.index(name=index_name, collection=collection, overwrite=True)
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def search_anthology(collection, index_name=INDEX_NAME):
 
6
  from colbert import Indexer, Searcher
7
  from colbert.infra import Run, RunConfig, ColBERTConfig
8
 
 
9
  INDEX_NAME = 'index'
10
  ANTHOLOGY_PATH = 'anthology.bib'
11
  COLLECTION_PATH = 'collection.json'
12
  DATASET_PATH = 'dataset.json'
13
 
14
+ nbits = 2 # encode each dimension with 2 bits
 
15
  doc_maxlen = 300 # truncate passages at 300 tokens
16
  checkpoint = 'colbert-ir/colbertv2.0' # ColBERT model to use
17
 
18
 
19
  def index_anthology(collection, index_name='index'):
20
+ with Run().context(RunConfig(nranks=1, experiment='notebook')): # nranks specifies the number of GPUs to use
21
+ config = ColBERTConfig(
22
+ doc_maxlen=doc_maxlen,
23
+ nbits=nbits,
24
+ kmeans_niters=4, # specifies the number of iterations of k-means clustering; 4 is a good and fast default.
25
+ index_path=INDEX_NAME
26
+ )
27
+ indexer = Indexer(
28
+ checkpoint=checkpoint,
29
+ config=config
30
+ )
31
+ indexer.index(
32
+ name=index_name,
33
+ collection=collection,
34
+ overwrite=True
35
+ )
36
 
37
 
38
  def search_anthology(collection, index_name=INDEX_NAME):