davidheineman committed on
Commit
7563fd5
1 Parent(s): 0b3c38e

add readme

Browse files
Files changed (4) hide show
  1. README.md +45 -0
  2. index.py +1 -24
  3. parse.py +27 -0
  4. server.py +3 -2
README.md CHANGED
@@ -1,3 +1,48 @@
1
  ---
2
  license: apache-2.0
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: apache-2.0
3
  ---
4
+
5
+ ## Setup
6
+ First, create a conda environment and install the dependencies:
7
+ ```sh
8
+ pip install bibtexparser colbert-ir[torch,faiss-gpu]
9
+ ```
10
+
11
+ To grab the up-to-date abstracts:
12
+ ```sh
13
+ curl -O https://aclanthology.org/anthology+abstracts.bib.gz
14
+ gunzip anthology+abstracts.bib.gz
15
+ mv anthology+abstracts.bib anthology.bib
16
+ ```
17
+
18
+ ### (Optional) Step 1: Parse the Anthology
19
+
20
+ Feel free to skip steps 1 and 2, since the parsed/indexed anthology is contained in this repo. To parse the `.bib` file into `.json`:
21
+
22
+ ```sh
23
+ python parse.py
24
+ ```
25
+
26
+ ### (Optional) Step 2: Index with ColBERT
27
+
28
+ ```sh
29
+ python index.py
30
+ ```
31
+
32
+ ### Step 3: Search with ColBERT
33
+
34
+ To create a Flask server capable of serving outputs, run:
35
+
36
+ ```sh
37
+ INDEX_ROOT=[YOUR_PATH]/colbert-acl INDEX_NAME=index python server.py
38
+ ```
39
+
40
+ Then, to test, visit:
41
+ ```
42
+ http://localhost:8893/api/search?k=25&query=How to extend context windows?
43
+ ```
44
+
45
+ ### Example notebooks
46
+
47
+ To see an example of search, visit:
48
+ [colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs](https://colab.research.google.com/drive/1-b90_8YSAK17KQ6C7nqKRYbCWEXQ9FGs?usp=sharing)
index.py CHANGED
@@ -2,32 +2,16 @@ import os
2
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # Prevents deadlocks in ColBERT tokenization
3
  os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Allows multiple libraries in OpenMP runtime. This can cause unexpected behavior, but allows ColBERT to work
4
 
5
- import json, bibtexparser
6
  from colbert import Indexer, Searcher
7
  from colbert.infra import Run, RunConfig, ColBERTConfig
8
 
9
  INDEX_NAME = 'index'
10
  ANTHOLOGY_PATH = 'anthology.bib'
11
-
12
  COLLECTION_PATH = 'acl/collection.json'
13
  DATASET_PATH = 'acl/dataset.json'
14
 
15
 
16
- def parse_anthology_bibtex(anthology_path):
17
- with open(anthology_path, 'r', encoding='utf-8') as f:
18
- acl_bib = bibtexparser.load(f)
19
-
20
- print(f'Found {len(acl_bib.entries)} articles with keys: {acl_bib.entries[0].keys()}')
21
- for entry in acl_bib.entries[:2]:
22
- print(entry.get('author'))
23
- print(entry.get('title'))
24
- print(entry.get('url') + '\n')
25
-
26
- dataset = acl_bib.entries
27
- collection = [e['abstract'] for e in dataset]
28
- return dataset, collection
29
-
30
-
31
  def index_anthology(collection, index_name='index'):
32
  nbits = 2 # encode each dimension with 2 bits
33
  doc_maxlen = 10 # truncate passages at 10 tokens (NOTE: comment previously said 300 — confirm the intended limit)
@@ -58,13 +42,6 @@ def search_anthology(collection, index_name=INDEX_NAME):
58
 
59
 
60
  if __name__ == '__main__':
61
- # Parse and save the anthology dataset
62
- # dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)
63
- # with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
64
- # f.write(json.dumps(collection, indent=4))
65
- # with open(DATASET_PATH, 'w', encoding='utf-8') as f:
66
- # f.write(json.dumps(dataset, indent=4))
67
-
68
  # Load the parsed anthology
69
  with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
70
  collection = json.loads(f.read())
 
2
  os.environ["TOKENIZERS_PARALLELISM"] = "false" # Prevents deadlocks in ColBERT tokenization
3
  os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" # Allows multiple libraries in OpenMP runtime. This can cause unexpected behavior, but allows ColBERT to work
4
 
5
+ import json
6
  from colbert import Indexer, Searcher
7
  from colbert.infra import Run, RunConfig, ColBERTConfig
8
 
9
  INDEX_NAME = 'index'
10
  ANTHOLOGY_PATH = 'anthology.bib'
 
11
  COLLECTION_PATH = 'acl/collection.json'
12
  DATASET_PATH = 'acl/dataset.json'
13
 
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  def index_anthology(collection, index_name='index'):
16
  nbits = 2 # encode each dimension with 2 bits
17
  doc_maxlen = 10 # truncate passages at 10 tokens (NOTE: comment previously said 300 — confirm the intended limit)
 
42
 
43
 
44
  if __name__ == '__main__':
 
 
 
 
 
 
 
45
  # Load the parsed anthology
46
  with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
47
  collection = json.loads(f.read())
parse.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import bibtexparser, json
2
+
3
+ ANTHOLOGY_PATH = 'anthology.bib'
4
+ COLLECTION_PATH = 'acl/collection.json'
5
+ DATASET_PATH = 'acl/dataset.json'
6
+
7
+ def parse_anthology_bibtex(anthology_path):
8
+ with open(anthology_path, 'r', encoding='utf-8') as f:
9
+ acl_bib = bibtexparser.load(f)
10
+
11
+ print(f'Found {len(acl_bib.entries)} articles with keys: {acl_bib.entries[0].keys()}')
12
+ for entry in acl_bib.entries[:2]:
13
+ print(entry.get('author'))
14
+ print(entry.get('title'))
15
+ print(entry.get('url') + '\n')
16
+
17
+ dataset = acl_bib.entries
18
+ collection = [e['abstract'] for e in dataset]
19
+ return dataset, collection
20
+
21
+ if __name__ == '__main__':
22
+ # Parse and save the anthology dataset
23
+ dataset, collection = parse_anthology_bibtex(ANTHOLOGY_PATH)
24
+ with open(COLLECTION_PATH, 'w', encoding='utf-8') as f:
25
+ f.write(json.dumps(collection, indent=4))
26
+ with open(DATASET_PATH, 'w', encoding='utf-8') as f:
27
+ f.write(json.dumps(dataset, indent=4))
server.py CHANGED
@@ -11,9 +11,10 @@ load_dotenv()
11
 
12
  INDEX_NAME = os.getenv("INDEX_NAME")
13
  INDEX_ROOT = os.getenv("INDEX_ROOT")
 
14
  app = Flask(__name__)
15
 
16
- searcher = Searcher(index=INDEX_NAME) # index_root=INDEX_ROOT
17
  counter = {"api" : 0}
18
 
19
  @lru_cache(maxsize=1000000)
@@ -49,5 +50,5 @@ if __name__ == "__main__":
49
  INDEX_ROOT=/Users/dhei/personal/4440/project/colbert-acl INDEX_NAME=index python server.py
50
  http://localhost:8893/api/search?k=25&query=How to extend context windows?
51
  """
52
- app.run("0.0.0.0", int(os.getenv("PORT")))
53
 
 
11
 
12
  INDEX_NAME = os.getenv("INDEX_NAME")
13
  INDEX_ROOT = os.getenv("INDEX_ROOT")
14
+ PORT = int(os.getenv("PORT", 8893))
15
  app = Flask(__name__)
16
 
17
+ searcher = Searcher(index_root=INDEX_ROOT, index=INDEX_NAME)
18
  counter = {"api" : 0}
19
 
20
  @lru_cache(maxsize=1000000)
 
50
  INDEX_ROOT=/Users/dhei/personal/4440/project/colbert-acl INDEX_NAME=index python server.py
51
  http://localhost:8893/api/search?k=25&query=How to extend context windows?
52
  """
53
+ app.run("0.0.0.0", PORT)
54