Commit 8b805bb by davidheineman
Parent(s): bed1667

fix filepaths
README.md CHANGED

````diff
@@ -6,6 +6,7 @@ license: apache-2.0
 First, clone this repo and create a conda environment and install the dependencies:
 ```sh
 git clone https://huggingface.co/davidheineman/colbert-acl
+# torch==1.13.1 required (conda install -y -n [env] python=3.10)
 pip install bibtexparser colbert-ir[torch,faiss-gpu]
 ```
 
@@ -43,7 +44,17 @@ Then, to test, visit:
 http://localhost:8893/api/search?k=25&query=How to extend context windows?
 ```
 
-
+## Example notebooks
 
 To see an example of search, visit:
-[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
+[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
+
+## Notes
+- It's possible to update the index without re-computing the whole dataset. Basically the IVF table is updated, but the centroids are not re-computed. This requires a large dataset to already exist (in our case it does).
+- We'll need someone to manage the storage/saving of the index, so it can be updated in real-time.
+- See:
+  - https://github.com/stanford-futuredata/ColBERT/blob/main/colbert/index_updater.py
+  - https://github.com/stanford-futuredata/ColBERT/issues/111
+- We also need a MySQL database which can take in a document ID and return its metadata, so the ColBERT database only stores the passage encodings, not the full text (right now it just loads the whole json into memory).
+- We may be able to offload the centroids calculation to a vector DB (check on this)
+- Should have 2 people on UI, 1 on MySQL, 1 on VectorDB, 1 on ColBERT
````
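The notes added above reference ColBERT's `index_updater.py`. For orientation, here is a rough sketch of how that incremental-update flow could look; the constructor arguments and method names are assumptions to verify against the linked file, since none of this appears in the commit itself:

```python
# Sketch only, not part of this commit: incremental updates with the
# IndexUpdater linked in the notes above. Exact signatures are assumptions
# to verify against colbert/index_updater.py.
from colbert import Searcher
from colbert.infra import Run, RunConfig
from colbert.index_updater import IndexUpdater

with Run().context(RunConfig(experiment='notebook')):
    searcher = Searcher(index='index')  # the existing, already-built index
    updater = IndexUpdater(config=searcher.config, searcher=searcher,
                           checkpoint='colbert-ir/colbertv2.0')

    new_pids = updater.add(['A newly published ACL abstract ...'])  # route new passages to existing centroids
    updater.remove(new_pids)                                        # or drop passages by pid
    updater.persist_to_disk()                                       # write the updated IVF/index back to disk
```

This matches the first note: new passages are assigned to the existing centroids, so the k-means step is not re-run.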
index.py CHANGED

```diff
@@ -6,18 +6,19 @@ import json
 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 
-INDEX_NAME = 'index'
-ANTHOLOGY_PATH = 'anthology.bib'
-COLLECTION_PATH = 'acl/collection.json'
-DATASET_PATH = 'acl/dataset.json'
 
+INDEX_NAME = 'index'
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'collection.json'
+DATASET_PATH = 'dataset.json'
 
-def index_anthology(collection, index_name='index'):
-    nbits = 2  # encode each dimension with 2 bits
-    doc_maxlen = 300  # truncate passages at 300 tokens
-
-    checkpoint = 'colbert-ir/colbertv2.0'
 
+nbits = 2  # encode each dimension with 2 bits
+doc_maxlen = 300  # truncate passages at 300 tokens
+checkpoint = 'colbert-ir/colbertv2.0'  # ColBERT model to use
+
+
+def index_anthology(collection, index_name='index'):
     with Run().context(RunConfig(nranks=1, experiment='notebook')):  # nranks specifies the number of GPUs to use
         config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)  # kmeans_niters specifies the number of iterations of k-means clustering; 4 is a good and fast default.
         indexer = Indexer(checkpoint=checkpoint, config=config)
```
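The hunk above ends right after the `Indexer` is constructed. For orientation, a minimal sketch of how such a function is typically finished and queried, using ColBERT's documented `Indexer.index` and `Searcher.search` calls together with the module-level constants from the new version of the file; the repo's actual continuation is not shown in this diff and may differ:

```python
# Sketch only: assumes the imports and the nbits / doc_maxlen / checkpoint
# constants defined in the new index.py above. The index(...) call and the
# Searcher usage follow ColBERT's public API, not lines from this diff.
def index_anthology(collection, index_name='index'):
    with Run().context(RunConfig(nranks=1, experiment='notebook')):
        config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)
        indexer = Indexer(checkpoint=checkpoint, config=config)
        indexer.index(name=index_name, collection=collection, overwrite=True)  # builds centroids + compressed residuals


def search_anthology(query, index_name='index', k=25):
    with Run().context(RunConfig(experiment='notebook')):
        searcher = Searcher(index=index_name)  # loads the index built above
        return searcher.search(query, k=k)     # returns (passage_ids, ranks, scores)
```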
parse.py CHANGED

```diff
@@ -1,8 +1,8 @@
 import bibtexparser, json
 
-ANTHOLOGY_PATH
-COLLECTION_PATH = '
-DATASET_PATH
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'collection.json'
+DATASET_PATH = 'dataset.json'
 
 def parse_anthology_bibtex(anthology_path):
     with open(anthology_path, 'r', encoding='utf-8') as f:
```
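The body of `parse_anthology_bibtex` is cut off in the hunk above. As a rough sketch of what a `bibtexparser` pass over `anthology.bib` usually looks like (the field names and the collection/dataset split are assumptions, not this repo's actual code):

```python
# Sketch only: a plausible shape for parse.py's parser. Field names such as
# 'abstract' and the collection/dataset layout are assumptions.
import bibtexparser, json

def parse_anthology_bibtex(anthology_path):
    with open(anthology_path, 'r', encoding='utf-8') as f:
        bib = bibtexparser.load(f)      # bibtexparser 1.x: returns a BibDatabase

    dataset, collection = [], []
    for entry in bib.entries:           # each entry is a dict of BibTeX fields
        abstract = entry.get('abstract')
        if not abstract:
            continue                    # only abstracts get indexed as passages
        collection.append(abstract)     # text passages for the ColBERT indexer
        dataset.append(entry)           # full metadata, aligned with passage ids
    return dataset, collection

if __name__ == '__main__':
    dataset, collection = parse_anthology_bibtex('anthology.bib')
    with open('collection.json', 'w') as f:
        json.dump(collection, f)
    with open('dataset.json', 'w') as f:
        json.dump(dataset, f)
```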
search.py CHANGED

```diff
@@ -149,10 +149,8 @@ def generate_candidates(Q):
 
 def search_colbert(query, k):
     # Add the appropriate [Q], [D] tokens and encode with ColBERT
-    Q = searcher.encode(query)
-
-    # Cut off query to maxlen tokens
-    Q = Q[:, :searcher.config.query_maxlen]
+    Q = searcher.encode(query)
+    Q = Q[:, :searcher.config.query_maxlen]  # Cut off query to maxlen tokens
 
     # Find the passage candidates (i.e., closest candidates to the Q centroid)
     pids, centroid_scores = generate_candidates(Q)
```
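Lastly, the README hunk earlier advertises a search endpoint on port 8893. A small client-side sketch for exercising it once the server is running (the JSON response shape is an assumption; check the server code for the actual fields):

```python
# Sketch only: query the local search API shown in the README. The response
# schema (and whether it is JSON at all) is an assumption.
import requests

resp = requests.get(
    'http://localhost:8893/api/search',
    params={'k': 25, 'query': 'How to extend context windows?'},
)
resp.raise_for_status()
print(resp.json())  # expected: a ranked list of passages with scores/metadata
```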