davidheineman committed
Commit f12459e · Parent(s): ece17e8

refactor into src
.gitignore CHANGED
@@ -1,6 +1,3 @@
  __pycache__
  experiments
- .openai-secret
- .mongodb-secret
- demo.mov
  .DS_Store
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ FROM python:3.10
+
+ WORKDIR /app
+
+ RUN apt-get update && \
+     apt-get install -y mysql-server git && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN git clone https://huggingface.co/davidheineman/colbert-acl
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ # CMD ["python", "db.py"]
+ CMD ["python", "server.py"]
README.md CHANGED
@@ -12,7 +12,7 @@ git clone https://huggingface.co/davidheineman/colbert-acl
 
  # install dependencies
  # torch==1.13.1 required (conda install -y -n [env] python=3.10)
- pip install colbert-ir[torch,faiss-gpu] bibtexparser mysql-connector-python flask
+ pip install -r requirements.txt
  brew install mysql
  ```
 
@@ -31,7 +31,11 @@ python parse.py
 
  # index with ColBERT
  python index.py
+ ```
+
+ ### Setup MySQL
 
+ ```sh
  # initalize database service
  python db.py
  ```
@@ -53,6 +57,12 @@ or for an interface:
  http://localhost:8893
  ```
 
+ ### Deploy as a Docker App
+ ```
+ docker build -t acl-colbert .
+ docker run -d -p 5000:5000 acl-colbert
+ ```
+
  ## Example notebooks
 
  To see an example of search, visit:
@@ -64,7 +74,11 @@ To see an example of search, visit:
  - https://github.com/stanford-futuredata/ColBERT/issues/111
 
  - TODO:
-   - Scrape: https://proceedings.neurips.cc/
-   - https://dblp.uni-trier.de/db/conf/iclr/index.html
-   - openreview
+   - Profile bibtexparser.load(f)
+   - Add UI
+   - Ship as a containerized service
+   - Scrape:
+     - https://proceedings.neurips.cc/
+     - https://dblp.uni-trier.de/db/conf/iclr/index.html
+     - openreview
  -->
collection.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:275476456de56b2812f96e44158ef04780c9067aa9d8828bce3f342769334227
- size 45377196
dataset.json CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b11f4537583604993033ccf736e16401f5ca787f07c0d0dfcb20d38b42641f57
- size 114098738
+ oid sha256:c66c5a82d479e8c0604d59aec68c7786e00d91bf8b4b44a9d59d1c3c265661d5
+ size 88851239
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch==1.13.1
+ colbert-ir[torch] # faiss-gpu
+ bibtexparser
+ mysql-connector-python
+ flask
src/constants.py ADDED
@@ -0,0 +1,10 @@
+ import os
+
+ INDEX_NAME = os.getenv("INDEX_NAME", 'index')
+ INDEX_ROOT = os.getenv("INDEX_ROOT", os.path.dirname(os.path.abspath(__file__)))
+
+ INDEX_PATH = os.path.join(INDEX_ROOT, INDEX_NAME)
+ ANTHOLOGY_PATH = os.path.join(INDEX_ROOT, 'anthology.bib')
+ DATASET_PATH = os.path.join(INDEX_ROOT, 'dataset.json')
+
+ DB_NAME = 'acl_anthology'
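Since `INDEX_NAME` and `INDEX_ROOT` are read through `os.getenv`, the index location can be redirected without editing the module. A minimal sketch of overriding them before the first import (the values `my_index` and `/data/colbert-acl` are hypothetical):

```python
import os

# Hypothetical overrides; set these before constants.py is first imported
os.environ["INDEX_NAME"] = "my_index"
os.environ["INDEX_ROOT"] = "/data/colbert-acl"

from constants import INDEX_PATH, DATASET_PATH

print(INDEX_PATH)    # /data/colbert-acl/my_index
print(DATASET_PATH)  # /data/colbert-acl/dataset.json
```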
db.py β†’ src/db.py RENAMED
@@ -1,8 +1,7 @@
  import mysql.connector
  import json
 
- DB_NAME = 'acl_anthology'
- DATASET_PATH = 'dataset.json'
+ from constants import DATASET_PATH, DB_NAME
 
  PAPER_QUERY = """
  SELECT *
@@ -130,7 +129,7 @@ def query_paper_metadata(colbert_response, year):
      host = "localhost",
      user = "root",
      password = "",
-     database= "acl_anthology"
+     database = DB_NAME
  )
 
  cursor = db.cursor()
index.py β†’ src/index.py RENAMED
@@ -3,19 +3,19 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" # Prevents deadlocks in ColBERT
  os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Allows multiple libraries in OpenMP runtime. This can cause unexected behavior, but allows ColBERT to work
 
  import json
+
+ from constants import INDEX_NAME, DATASET_PATH
+
  from colbert import Indexer, Searcher
  from colbert.infra import Run, RunConfig, ColBERTConfig
 
- INDEX_NAME = 'index'
- ANTHOLOGY_PATH = 'anthology.bib'
- DATASET_PATH = 'dataset.json'
 
  nbits = 2 # encode each dimension with 2 bits
  doc_maxlen = 512 # truncate passages
  checkpoint = 'colbert-ir/colbertv2.0' # ColBERT model to use
 
 
- def index_anthology(collection, index_name='index'):
+ def index_anthology(collection, index_name):
      with Run().context(RunConfig(nranks=1, experiment='notebook')): # nranks specifies the number of GPUs to use
          config = ColBERTConfig(
              doc_maxlen=doc_maxlen,
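For orientation, a minimal sketch of driving the refactored `index_anthology(collection, index_name)`, assuming `src/` is on the import path and that the collection is the list of abstracts from `dataset.json` (the exact collection format used elsewhere in `index.py` is not shown in this hunk):

```python
import json

from constants import INDEX_NAME, DATASET_PATH
from index import index_anthology

with open(DATASET_PATH, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Assumption: the indexer consumes one passage (the abstract) per paper
collection = [paper['abstract'] for paper in dataset]

index_anthology(collection, index_name=INDEX_NAME)
```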
parse.py β†’ src/parse.py RENAMED
@@ -1,28 +1,31 @@
  import bibtexparser, json
 
- ANTHOLOGY_PATH = 'anthology.bib'
- DATASET_PATH = 'dataset.json'
+ from constants import ANTHOLOGY_PATH, DATASET_PATH
 
 
- def parse_bibtex(anthology_path):
+ def parse_bibtex(anthology_path, dataset_path):
      with open(anthology_path, 'r', encoding='utf-8') as f:
-         acl_bib = bibtexparser.load(f)
+         bib = bibtexparser.load(f)
+     dataset = bib.entries
 
-     print(f'Found {len(acl_bib.entries)} articles with keys: {acl_bib.entries[0].keys()}')
-     for entry in acl_bib.entries[:2]:
-         print(entry.get('author'))
-         print(entry.get('title'))
-         print(entry.get('url') + '\n')
-
-     dataset = acl_bib.entries
+     print(f'Found {len(dataset)} articles with keys: {dataset[0].keys()}')
+     paper: dict
+     for paper in dataset[:2]:
+         print(f"{paper.get('author')}\n{paper.get('title')}\n{paper.get('url')}\n")
 
      # Remove any entries without abstracts, since we index on abstracts
-     dataset = [entry for entry in dataset if 'abstract' in entry.keys()]
+     dataset = [paper for paper in dataset if 'abstract' in paper.keys()]
+
+     with open(dataset_path, 'w', encoding='utf-8') as f:
+         f.write(json.dumps(dataset, indent=4))
 
      return dataset
 
 
- def preprocess_acl_entries(dataset):
+ def preprocess_acl_entries(dataset_path):
+     with open(dataset_path, 'r', encoding='utf-8') as f:
+         dataset = json.loads(f.read())
+
      venues = []
      for id, paper in enumerate(dataset):
          url = paper['url']
@@ -91,24 +94,18 @@ def preprocess_acl_entries(dataset):
 
      # print(set(venues))
 
+     with open(DATASET_PATH, 'w', encoding='utf-8') as f:
+         f.write(json.dumps(dataset, indent=4))
+
      return dataset
 
 
  def main():
      # 1) Parse and save the anthology dataset
-     dataset = parse_bibtex(ANTHOLOGY_PATH)
-
-     with open(DATASET_PATH, 'w', encoding='utf-8') as f:
-         f.write(json.dumps(dataset, indent=4))
+     dataset = parse_bibtex(ANTHOLOGY_PATH, DATASET_PATH)
 
      # 2) Pre-process the ACL anthology
-     with open(DATASET_PATH, 'r', encoding='utf-8') as f:
-         dataset = json.loads(f.read())
-
-     dataset = preprocess_acl_entries(dataset)
-
-     with open(DATASET_PATH, 'w', encoding='utf-8') as f:
-         f.write(json.dumps(dataset, indent=4))
+     dataset = preprocess_acl_entries(DATASET_PATH)
 
 
  if __name__ == '__main__': main()
search.py β†’ src/search.py RENAMED
@@ -1,44 +1,45 @@
- import os, shutil, json, ujson, tqdm
+ import os, shutil, ujson, tqdm
  import torch
  import torch.nn.functional as F
 
- from colbert import Searcher
+ from constants import INDEX_NAME, INDEX_ROOT, INDEX_PATH
+
+ from colbert import Checkpoint
+ from colbert.infra.config import ColBERTConfig
  from colbert.search.index_storage import IndexScorer
  from colbert.search.strided_tensor import StridedTensor
  from colbert.indexing.codecs.residual_embeddings_strided import ResidualEmbeddingsStrided
  from colbert.indexing.codecs.residual import ResidualCodec
 
- INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
- INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
-
- INDEX_PATH = os.path.join(INDEX_ROOT, INDEX_NAME)
- COLLECTION_PATH = os.path.join(INDEX_ROOT, 'collection.json')
-
- # Move index to ColBERT experiment path
- src_path = os.path.join(INDEX_ROOT, INDEX_NAME)
- dest_path = os.path.join(INDEX_ROOT, 'experiments', 'default', 'indexes', INDEX_NAME)
- if not os.path.exists(dest_path):
-     print(f'Copying {src_path} -> {dest_path}')
-     os.makedirs(dest_path)
-     shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
-
- # Load abstracts as a collection
- with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
-     collection = json.load(f)
-
- searcher = Searcher(index=INDEX_NAME, collection=collection)
 
- QUERY_MAX_LEN = searcher.config.query_maxlen
- NCELLS = 1
+ NCELLS = 1 # Number of centroids to use in PLAID
  CENTROID_SCORE_THRESHOLD = 0.5 # How close a document has to be to a centroid to be considered
  NDOCS = 512 # Number of closest documents to consider
 
 
+ def move_index(index_root, index_name):
+     """ Move the index to the root dir (required for ColBERT) """
+     src_path = os.path.join(index_root, index_name)
+     dest_path = os.path.join(index_root, 'experiments', 'default', 'indexes', index_name)
+     if not os.path.exists(dest_path):
+         print(f'Copying {src_path} -> {dest_path}')
+         os.makedirs(dest_path)
+         shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
+
+
  def init_colbert(index_path=INDEX_PATH, load_index_with_mmap=False):
      """
      Load all tensors necessary for running ColBERT
      """
-     global centroids, embeddings, ivf, doclens, nbits, bucket_weights, codec, offsets
+     global index_checkpoint, centroids, embeddings, ivf, doclens, nbits, bucket_weights, codec, offsets
+
+     # index_checkpoint: Checkpoint
+
+     index_config = ColBERTConfig.load_from_index(INDEX_NAME)
+     index_checkpoint = index_config.checkpoint
+
+     # Move index to ColBERT experiment path
+     move_index(INDEX_ROOT, INDEX_NAME)
 
      with open(os.path.join(index_path, 'metadata.json')) as f:
          metadata = ujson.load(f)
@@ -109,7 +110,7 @@ def get_candidates(Q: torch.Tensor, ivf: StridedTensor) -> torch.Tensor:
      Q = Q.squeeze(0)
 
      # Get the closest centroids via a matrix multiplication + argmax
-     centroid_scores = (centroids @ Q.T)
+     centroid_scores: torch.Tensor = (centroids @ Q.T)
      if NCELLS == 1:
          cells = centroid_scores.argmax(dim=0, keepdim=True).permute(1, 0)
      else:
@@ -165,13 +166,29 @@ def _calculate_colbert(Q: torch.Tensor):
      return scores, pids
 
 
+ def encode(text, full_length_search=False) -> torch.Tensor:
+     queries = text if isinstance(text, list) else [text]
+     bsize = 128 if len(queries) > 128 else None
+
+     Q = index_checkpoint.queryFromText(
+         queries,
+         bsize=bsize,
+         to_cpu=True,
+         full_length_search=full_length_search
+     )
+
+     QUERY_MAX_LEN = index_checkpoint.query_tokenizer.query_maxlen
+     Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
+
+     return Q
+
+
  def search_colbert(query):
      """
      ColBERT search with a query.
      """
      # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
-     Q = searcher.encode(query)
-     Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
+     Q = encode(query)
 
      scores, pids = _calculate_colbert(Q)
 
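For reference, a minimal sketch of how the refactored search module might be exercised once an index exists, assuming `src/` is on the import path; `init_colbert()` fills the module-level globals (including `index_checkpoint`) that `encode` and `search_colbert` rely on:

```python
from search import init_colbert, search_colbert

# Load index tensors and the query-encoder checkpoint into module globals
init_colbert()

# Encode the query via encode(), then score it against the indexed abstracts;
# this hunk only shows scores/pids being computed, so the return shape is assumed
results = search_colbert('efficient late-interaction retrieval')
print(results)
```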
server.py β†’ src/server.py RENAMED
@@ -40,9 +40,9 @@ def api_search():
  @app.route('/api/search', methods=['POST', 'GET'])
  def query():
      if request.method == "POST":
-         query, year = request.form['query'], int(request.form['year'])
+         query, year = str(request.form['query']), int(request.form['year'])
      elif request.method == "GET":
-         query, year = request.args.get('query'), int(request.args.get('year'))
+         query, year = str(request.args.get('query')), int(request.args.get('year'))
 
      # Get top passage IDs from ColBERT
      colbert_response = api_search_query(query)
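A hedged client-side sketch for the `/api/search` route, which now coerces `query` to `str` and `year` to `int`; the host and port are assumptions (the README points a browser at http://localhost:8893, while the Docker example maps port 5000):

```python
import requests

# Assumed base URL; adjust to wherever server.py is actually listening
resp = requests.get(
    'http://localhost:8893/api/search',
    params={'query': 'late interaction retrieval', 'year': 2020},
)
print(resp.status_code)
print(resp.text)
```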
utils.py β†’ src/utils.py RENAMED
File without changes