davidheineman
/

colbert-acl

Model card Files Files and versions Community

davidheineman committed on Apr 14

Commit

7563fd5

•

1 Parent(s): 0b3c38e

add readme

Browse files

Files changed (4) hide show

README.md +45 -0
index.py +1 -24
parse.py +27 -0
server.py +3 -2

README.md CHANGED Viewed

@@ -1,3 +1,48 @@
 ---
 license: apache-2.0
 ---

 ---
 license: apache-2.0
 ---
+## Setup
+First, create a conda environment and install the dependencies:
+```sh
+pip install bibtexparser "colbert-ir[torch,faiss-gpu]"
+```
+To grab the up-to-date abstracts:
+```sh
+curl -O https://aclanthology.org/anthology+abstracts.bib.gz
+gunzip anthology+abstracts.bib.gz
+mv anthology+abstracts.bib anthology.bib
+```
+### (Optional) Step 1: Parse the Anthology
+Feel free to skip steps 1 and 2, since the parsed/indexed anthology is contained in this repo. To parse the `.bib` file into `.json`:
+```sh
+python parse.py
+```
+### (Optional) Step 2: Index with ColBERT
+```sh
+python index.py
+```
+### Step 3: Search with ColBERT
+To start a Flask server that serves search results, run:
+```sh
+INDEX_ROOT=[YOUR_PATH]/colbert-acl INDEX_NAME=index python server.py
+```
+Then, to test, visit:
+```
+http://localhost:8893/api/search?k=25&query=How to extend context windows?
+```
+### Example notebooks
+To see an example of search, visit:
+[colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)

index.py CHANGED Viewed

@@ -2,32 +2,16 @@ import os
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevents deadlocks in ColBERT tokenization
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"     # Allows multiple libraries in OpenMP runtime. This can cause unexpected behavior, but allows ColBERT to work
-import json, bibtexparser
 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 INDEX_NAME = 'index'
 ANTHOLOGY_PATH = 'anthology.bib'
 COLLECTION_PATH = 'acl/collection.json'
 DATASET_PATH = 'acl/dataset.json'
-def parse_anthology_bibtex(anthology_path):
-    with open(anthology_path, 'r', encoding='utf-8') as f:
-        acl_bib = bibtexparser.load(f)
-    print(f'Found {len(acl_bib.entries)} articles with keys: {acl_bib.entries[0].keys()}')
-    for entry in acl_bib.entries[:2]:
-        print(entry.get('author'))
-        print(entry.get('title'))
-        print(entry.get('url') + '\n')
-    dataset = acl_bib.entries
-    collection = [e['abstract'] for e in dataset]
-    return dataset, collection
 def index_anthology(collection, index_name='index'):
     nbits = 2          # encode each dimension with 2 bits
     doc_maxlen = 10    # truncate passages at 300 tokens
@@ -58,13 +42,6 @@ def search_anthology(collection, index_name=INDEX_NAME):
 if __name__ == '__main__':
-    # Parse and save the anthology dataset
-    # dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)
-    # with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
-    #     f.write(json.dumps(collection, indent=4))
-    # with open(DATASET_PATH, 'w', encoding='utf-8') as f:
-    #     f.write(json.dumps(dataset, indent=4))
     # Load the parsed anthology
     with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
         collection = json.loads(f.read())

 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Prevents deadlocks in ColBERT tokenization
 os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"     # Allows multiple libraries in OpenMP runtime. This can cause unexpected behavior, but allows ColBERT to work
+import json
 from colbert import Indexer, Searcher
 from colbert.infra import Run, RunConfig, ColBERTConfig
 INDEX_NAME = 'index'
 ANTHOLOGY_PATH = 'anthology.bib'
 COLLECTION_PATH = 'acl/collection.json'
 DATASET_PATH = 'acl/dataset.json'
 def index_anthology(collection, index_name='index'):
     nbits = 2          # encode each dimension with 2 bits
     doc_maxlen = 10    # truncate passages at 300 tokens
 if __name__ == '__main__':
     # Load the parsed anthology
     with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
         collection = json.loads(f.read())

parse.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import bibtexparser, json
+ANTHOLOGY_PATH = 'anthology.bib'
+COLLECTION_PATH = 'acl/collection.json'
+DATASET_PATH = 'acl/dataset.json'
+def parse_anthology_bibtex(anthology_path):
+    with open(anthology_path, 'r', encoding='utf-8') as f:
+        acl_bib = bibtexparser.load(f)
+    print(f'Found {len(acl_bib.entries)} articles with keys: {acl_bib.entries[0].keys()}')
+    for entry in acl_bib.entries[:2]:
+        print(entry.get('author'))
+        print(entry.get('title'))
+        print(entry.get('url') + '\n')
+    dataset = acl_bib.entries
+    collection = [e['abstract'] for e in dataset]
+    return dataset, collection
+if __name__ == '__main__':
+    # Parse and save the anthology dataset
+    dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)
+    with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
+        f.write(json.dumps(collection, indent=4))
+    with open(DATASET_PATH, 'w', encoding='utf-8') as f:
+        f.write(json.dumps(dataset, indent=4))

server.py CHANGED Viewed

@@ -11,9 +11,10 @@ load_dotenv()
 INDEX_NAME = os.getenv("INDEX_NAME")
 INDEX_ROOT = os.getenv("INDEX_ROOT")
 app = Flask(__name__)
-searcher = Searcher(index=INDEX_NAME) # index_root=INDEX_ROOT
 counter = {"api" : 0}
 @lru_cache(maxsize=1000000)
@@ -49,5 +50,5 @@ if __name__ == "__main__":
     INDEX_ROOT=/Users/dhei/personal/4440/project/colbert-acl INDEX_NAME=index python server.py
     http://localhost:8893/api/search?k=25&query=How to extend context windows?
     """
-    app.run("0.0.0.0", int(os.getenv("PORT")))

 INDEX_NAME = os.getenv("INDEX_NAME")
 INDEX_ROOT = os.getenv("INDEX_ROOT")
+PORT = int(os.getenv("PORT", 8893))
 app = Flask(__name__)
+searcher = Searcher(index_root=INDEX_ROOT, index=INDEX_NAME)
 counter = {"api" : 0}
 @lru_cache(maxsize=1000000)
     INDEX_ROOT=/Users/dhei/personal/4440/project/colbert-acl INDEX_NAME=index python server.py
     http://localhost:8893/api/search?k=25&query=How to extend context windows?
     """
+    app.run("0.0.0.0", PORT)