davidheineman committed
Commit
8b805bb
1 Parent(s): bed1667

fix filepaths

Files changed (4)
  1. README.md +13 -2
  2. index.py +10 -9
  3. parse.py +3 -3
  4. search.py +2 -4
README.md CHANGED
@@ -6,6 +6,7 @@ license: apache-2.0
 First, clone this repo and create a conda environment and install the dependencies:
 ```sh
 git clone https://huggingface.co/davidheineman/colbert-acl
+# torch==1.13.1 required (conda install -y -n [env] python=3.10)
 pip install bibtexparser colbert-ir[torch,faiss-gpu]
 ```
 
@@ -43,7 +44,17 @@ Then, to test, visit:
 http://localhost:8893/api/search?k=25&query=How to extend context windows?
 ```
 
-### Example notebooks
+## Example notebooks
 
 To see an example of search, visit:
-[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
+[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
+
+## Notes
+- It's possible to update the index without re-computing the whole dataset: the IVF table is updated, but the centroids are not re-computed. This requires a large dataset to already exist (in our case it does).
+- We'll need someone to manage the storage/saving of the index, so it can be updated in real time.
+- See:
+  - https://github.com/stanford-futuredata/ColBERT/blob/main/colbert/index_updater.py
+  - https://github.com/stanford-futuredata/ColBERT/issues/111
+- We also need a MySQL database which can take a document ID and return its metadata, so the ColBERT index only stores the passage encodings, not the full text (right now it just loads the whole JSON into memory).
+- We may be able to offload the centroid calculation to a vector DB (check on this).
+- Should have 2 people on UI, 1 on MySQL, 1 on VectorDB, 1 on ColBERT.
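The update-in-place workflow described in the Notes is handled by the `IndexUpdater` in the `index_updater.py` file linked above. Below is a minimal sketch of how it might be wired up against the index built by `index.py`; the constructor and method names follow the upstream ColBERT repo and may differ between versions, and the example passage is made up.

```python
# Hedged sketch: incrementally adding passages to an existing ColBERT index.
# Assumes the 'index' built by index.py already exists on disk.
from colbert import Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.index_updater import IndexUpdater

with Run().context(RunConfig(nranks=1, experiment='notebook')):
    config = ColBERTConfig(doc_maxlen=300, nbits=2)
    searcher = Searcher(index='index', config=config)

    # Updates the IVF lists against the existing centroids; no full re-index.
    updater = IndexUpdater(config=config, searcher=searcher,
                           checkpoint='colbert-ir/colbertv2.0')
    new_pids = updater.add(['A newly published ACL abstract to make searchable ...'])
    updater.persist_to_disk()   # write the updated index back to disk
    print(new_pids)             # passage ids assigned to the new documents
```

This is exactly the trade-off the first Note describes: new passages are routed to existing centroids, so adding documents is cheap, but the centroids themselves only improve when the index is rebuilt from scratch.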
index.py CHANGED
@@ -6,18 +6,19 @@ import json
 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 
-INDEX_NAME = 'index'
-ANTHOLOGY_PATH = 'anthology.bib'
-COLLECTION_PATH = 'acl/collection.json'
-DATASET_PATH = 'acl/dataset.json'
+INDEX_NAME = 'index'
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'collection.json'
+DATASET_PATH = 'dataset.json'
 
 
-def index_anthology(collection, index_name='index'):
-    nbits = 2           # encode each dimension with 2 bits
-    doc_maxlen = 300    # truncate passages at 300 tokens
-
-    checkpoint = 'colbert-ir/colbertv2.0'
+nbits = 2           # encode each dimension with 2 bits
+doc_maxlen = 300    # truncate passages at 300 tokens
+checkpoint = 'colbert-ir/colbertv2.0'   # ColBERT model to use
+
+
+def index_anthology(collection, index_name='index'):
     with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use
         config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)  # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
         indexer = Indexer(checkpoint=checkpoint, config=config)
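The hunk ends right after the `Indexer` is constructed, so the actual indexing call and the loading of `collection.json` are not shown in this diff. A sketch of how the rest of `index.py` presumably looks, based on ColBERT's standard `indexer.index(...)` API; the collection format (a JSON list of passage strings) is an assumption:

```python
# Hedged sketch of the remainder of index.py; reconstructed from ColBERT's
# public API, not from this commit.
import json

from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig

INDEX_NAME = 'index'
COLLECTION_PATH = 'collection.json'

nbits = 2           # encode each dimension with 2 bits
doc_maxlen = 300    # truncate passages at 300 tokens
checkpoint = 'colbert-ir/colbertv2.0'   # ColBERT model to use

def index_anthology(collection, index_name='index'):
    with Run().context(RunConfig(nranks=1, experiment='notebook')):
        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)
        indexer = Indexer(checkpoint=checkpoint, config=config)
        indexer.index(name=index_name, collection=collection, overwrite=True)

if __name__ == '__main__':
    # collection.json is assumed to hold one passage string per paper abstract
    with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
        collection = json.load(f)
    index_anthology(collection, index_name=INDEX_NAME)
```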
parse.py CHANGED
@@ -1,8 +1,8 @@
 import bibtexparser, json
 
-ANTHOLOGY_PATH = 'anthology.bib'
-COLLECTION_PATH = 'acl/collection.json'
-DATASET_PATH = 'acl/dataset.json'
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'collection.json'
+DATASET_PATH = 'dataset.json'
 
 def parse_anthology_bibtex(anthology_path):
     with open(anthology_path, 'r', encoding='utf-8') as f:
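For context, a minimal sketch of what `parse.py` plausibly does with these paths: load the Anthology BibTeX with `bibtexparser`, keep entries that have an abstract, and write the passage list (`collection.json`) and metadata (`dataset.json`). The field names and filtering below are assumptions, not taken from this commit:

```python
# Sketch only: 'abstract' and the write-out steps are assumed, not confirmed.
import bibtexparser, json

ANTHOLOGY_PATH = 'anthology.bib'
COLLECTION_PATH = 'collection.json'
DATASET_PATH = 'dataset.json'

def parse_anthology_bibtex(anthology_path):
    with open(anthology_path, 'r', encoding='utf-8') as f:
        bib = bibtexparser.load(f)   # BibDatabase; .entries is a list of field dicts

    dataset = [e for e in bib.entries if 'abstract' in e]   # keep papers with abstracts
    collection = [e['abstract'] for e in dataset]           # passages ColBERT will encode
    return collection, dataset

if __name__ == '__main__':
    collection, dataset = parse_anthology_bibtex(ANTHOLOGY_PATH)
    with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
        json.dump(collection, f)
    with open(DATASET_PATH, 'w', encoding='utf-8') as f:
        json.dump(dataset, f)
```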
search.py CHANGED
@@ -149,10 +149,8 @@ def generate_candidates(Q):
 
 def search_colbert(query, k):
     # Add the appropriate [Q], [D] tokens and encode with ColBERT
-    Q = searcher.encode(query)
-
-    # Cut off query to maxlen tokens
-    Q = Q[:, :searcher.config.query_maxlen]
+    Q = searcher.encode(query)
+    Q = Q[:, :searcher.config.query_maxlen]  # Cut off query to maxlen tokens
 
     # Find the passage candidates (i.e., closest candidates to the Q centroid)
     pids, centroid_scores = generate_candidates(Q)
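Once the server from the README is running, the `/api/search` endpoint can be exercised from Python as well as from the browser. A small usage sketch; the response is assumed to be JSON, and the exact keys depend on what `search.py` returns:

```python
# Hypothetical client for the HTTP endpoint shown in the README.
import requests

resp = requests.get(
    'http://localhost:8893/api/search',
    params={'k': 25, 'query': 'How to extend context windows?'},
)
resp.raise_for_status()
results = resp.json()   # assumed: a JSON list/dict of scored passages
print(results)
```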