Commit 8b805bb by davidheineman
Parent(s): bed1667

fix filepaths
README.md CHANGED

````diff
@@ -6,6 +6,7 @@ license: apache-2.0
 First, clone this repo and create a conda environment and install the dependencies:
 ```sh
 git clone https://huggingface.co/davidheineman/colbert-acl
+# torch==1.13.1 required (conda install -y -n [env] python=3.10)
 pip install bibtexparser colbert-ir[torch,faiss-gpu]
 ```
 
@@ -43,7 +44,17 @@ Then, to test, visit:
 http://localhost:8893/api/search?k=25&query=How to extend context windows?
 ```
 
-
+## Example notebooks
 
 To see an example of search, visit:
-[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
+[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
+
+## Notes
+- It's possible to update the index without re-computing the whole dataset. Basically the IVF table is updated, but the centroids are not re-computed. This requires a large dataset to already exist (in our case it does).
+- We'll need someone to manage the storage/saving of the index, so it can be updated in real-time.
+- See:
+  - https://github.com/stanford-futuredata/ColBERT/blob/main/colbert/index_updater.py
+  - https://github.com/stanford-futuredata/ColBERT/issues/111
+- We also need a MySQL database which can take in a document ID and return its metadata, so the ColBERT database only stores the passage encodings, not the full text (right now it just loads the whole json into memory).
+- We may be able to offload the centroids calculation to a vector DB (check on this)
+- Should have 2 people on UI, 1 on MySQL, 1 on VectorDB, 1 on ColBERT
````
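The notes added above reference ColBERT's `index_updater.py`. For orientation, here is a rough sketch of how that incremental-update flow could look; the constructor arguments and method names are assumptions to verify against the linked file, since none of this appears in the commit itself:

```python
# Sketch only, not part of this commit: incremental updates with the
# IndexUpdater linked in the notes above. Exact signatures are assumptions
# to verify against colbert/index_updater.py.
from colbert import Searcher
from colbert.infra import Run, RunConfig
from colbert.index_updater import IndexUpdater

with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index='index')  # the existing, already-built index
    updater = IndexUpdater(config=searcher.config, searcher=searcher,
                           checkpoint='colbert-ir/colbertv2.0')

    new_pids = updater.add(['A newly published ACL abstract ...'])  # route new passages to existing centroids
    updater.remove(new_pids)                                        # or drop passages by pid
    updater.persist_to_disk()                                       # write the updated IVF/index back to disk
```

This matches the first note: new passages are assigned to the existing centroids, so the k-means step is not re-run.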
index.py CHANGED

```diff
@@ -6,18 +6,19 @@ import json
 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 
-INDEX_NAME = 'index'
-ANTHOLOGY_PATH = 'anthology.bib'
-COLLECTION_PATH = 'acl/collection.json'
-DATASET_PATH = 'acl/dataset.json'
 
+INDEX_NAME = 'index'
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'collection.json'
+DATASET_PATH = 'dataset.json'
 
-def index_anthology(collection, index_name='index'):
-    nbits = 2  # encode each dimension with 2 bits
-    doc_maxlen = 300  # truncate passages at 300 tokens
-
-    checkpoint = 'colbert-ir/colbertv2.0'
 
+nbits = 2  # encode each dimension with 2 bits
+doc_maxlen = 300  # truncate passages at 300 tokens
+checkpoint = 'colbert-ir/colbertv2.0'  # ColBERT model to use
+
+
+def index_anthology(collection, index_name='index'):
     with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use
         config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)  # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
         indexer = Indexer(checkpoint=checkpoint, config=config)
```
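The hunk above ends right after the `Indexer` is constructed. For orientation, a minimal sketch of how such a function is typically finished and queried, using ColBERT's documented `Indexer.index` and `Searcher.search` calls together with the module-level constants from the new version of the file; the repo's actual continuation is not shown in this diff and may differ:

```python
# Sketch only: assumes the imports and the nbits / doc_maxlen / checkpoint
# constants defined in the new index.py above. The index(...) call and the
# Searcher usage follow ColBERT's public API, not lines from this diff.
def index_anthology(collection, index_name='index'):
    with Run().context(RunConfig(nranks=1, experiment='notebook')):
        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)
        indexer = Indexer(checkpoint=checkpoint, config=config)
        indexer.index(name=index_name, collection=collection, overwrite=True)  # builds centroids + compressed residuals


def search_anthology(query, index_name='index', k=25):
    with Run().context(RunConfig(experiment='notebook')):
        searcher = Searcher(index=index_name)  # loads the index built above
        return searcher.search(query, k=k)     # returns (passage_ids, ranks, scores)
```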
parse.py CHANGED

```diff
@@ -1,8 +1,8 @@
 import bibtexparser, json
 
-ANTHOLOGY_PATH
-COLLECTION_PATH = '
-DATASET_PATH
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'collection.json'
+DATASET_PATH = 'dataset.json'
 
 def parse_anthology_bibtex(anthology_path):
     with open(anthology_path, 'r', encoding='utf-8') as f:
```
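The body of `parse_anthology_bibtex` is cut off in the hunk above. As a rough sketch of what a `bibtexparser` pass over `anthology.bib` usually looks like (the field names and the collection/dataset split are assumptions, not this repo's actual code):

```python
# Sketch only: a plausible shape for parse.py's parser. Field names such as
# 'abstract' and the collection/dataset layout are assumptions.
import bibtexparser, json

def parse_anthology_bibtex(anthology_path):
    with open(anthology_path, 'r', encoding='utf-8') as f:
        bib = bibtexparser.load(f)      # bibtexparser 1.x: returns a BibDatabase

    dataset, collection = [], []
    for entry in bib.entries:           # each entry is a dict of BibTeX fields
        abstract = entry.get('abstract')
        if not abstract:
            continue                    # only abstracts get indexed as passages
        collection.append(abstract)     # text passages for the ColBERT indexer
        dataset.append(entry)           # full metadata, aligned with passage ids
    return dataset, collection

if __name__ == '__main__':
    dataset, collection = parse_anthology_bibtex('anthology.bib')
    with open('collection.json', 'w') as f:
        json.dump(collection, f)
    with open('dataset.json', 'w') as f:
        json.dump(dataset, f)
```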
search.py CHANGED

```diff
@@ -149,10 +149,8 @@ def generate_candidates(Q):
 
 def search_colbert(query, k):
     # Add the appropriate [Q], [D] tokens and encode with ColBERT
-    Q = searcher.encode(query)
-
-    # Cut off query to maxlen tokens
-    Q = Q[:, :searcher.config.query_maxlen]
+    Q = searcher.encode(query)
+    Q = Q[:, :searcher.config.query_maxlen]  # Cut off query to maxlen tokens
 
     # Find the passage candidates (i.e., closest candidates to the Q centroid)
     pids, centroid_scores = generate_candidates(Q)
```
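Lastly, the README hunk earlier advertises a search endpoint on port 8893. A small client-side sketch for exercising it once the server is running (the JSON response shape is an assumption; check the server code for the actual fields):

```python
# Sketch only: query the local search API shown in the README. The response
# schema (and whether it is JSON at all) is an assumption.
import requests

resp = requests.get(
    'http://localhost:8893/api/search',
    params={'k': 25, 'query': 'How to extend context windows?'},
)
resp.raise_for_status()
print(resp.json())  # expected: a ranked list of passages with scores/metadata
```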