davidheineman committed on
Commit
335b0ad
1 Parent(s): 7502d6f

add knn db

Browse files
Files changed (6) hide show
  1. .gitignore +3 -1
  2. README.md +1 -1
  3. knn_db_access.py +69 -0
  4. openai_embed.py +18 -0
  5. search.py +14 -0
  6. templates/index.html +2 -2
.gitignore CHANGED
@@ -1,2 +1,4 @@
1
  __pycache__
2
- experiments
 
 
 
1
  __pycache__
2
+ experiments
3
+ .openai-secret
4
+ .mongodb-secret
README.md CHANGED
@@ -13,7 +13,7 @@ pip install bibtexparser colbert-ir[torch,faiss-gpu]
13
  ## Setup server
14
  Install pip dependencies
15
  ```sh
16
- pip install mysql-connector-python flask
17
  ```
18
 
19
  Set up a local MySQL server:
 
13
  ## Setup server
14
  Install pip dependencies
15
  ```sh
16
+ pip install mysql-connector-python flask openai pymongo[srv]
17
  ```
18
 
19
  Set up a local MySQL server:
knn_db_access.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pymongo.mongo_client import MongoClient
2
+ from pymongo.server_api import ServerApi
3
+
4
+
5
# Connection settings for the MongoDB Atlas deployment. The password is read
# from the local `.mongodb-secret` file rather than being hard-coded here.
USER = "test"
SERVER = "dbbackend.c9tcfpp"
with open('.mongodb-secret', 'r') as f:
    # Strip surrounding whitespace: a secret file saved by an editor usually
    # ends with a newline, which would otherwise be embedded in the URI.
    PASS = f.read().strip()
9
+
10
+
11
class MongoDBAccess:
    """Access layer for the ``papers`` collection of the ``ColBERTPapers`` database.

    Provides a connectivity check, metadata lookup by paper id, and a
    vector (KNN) search through the ``vector_index`` search index.
    """

    def __init__(self) -> None:
        """Build the connection URI and open the client, database and collection handles."""
        # Defensive strip: a trailing newline read from the secret file must
        # not end up inside the URI.
        # NOTE(review): pymongo expects the user name and password in a URI to
        # be percent-escaped; the secret is used as stored -- confirm it is
        # already escaped if it contains reserved characters.
        password = PASS.strip()
        self.uri = f"mongodb+srv://{USER}:{password}@{SERVER}.mongodb.net/?retryWrites=true&w=majority&appName=DBBackend"
        self.client = MongoClient(self.uri, server_api=ServerApi('1'))
        self.database = self.client["ColBERTPapers"]
        self.col = self.database["papers"]

    def ping(self) -> None:
        """Send a ``ping`` command and print the outcome.

        Best-effort diagnostic: any failure is printed instead of raised.
        """
        try:
            self.client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

    def article_info_from_id_list(self, id_list: list):
        """Return the stored metadata for every paper whose ``id`` is in *id_list*.

        Args:
            id_list: Paper ids to look up. Any iterable is accepted; it is
                converted to a list before being sent in the ``$in`` query.

        Returns:
            A list of matching documents, projected to ``id``, ``title``,
            ``year``, ``author`` and ``abstract`` (``_id`` is not excluded by
            the projection).
        """
        query = {"id": {'$in': list(id_list)}}
        projection = {"id": 1, "title": 1, "year": 1, "author": 1, "abstract": 1}
        return list(self.col.find(query, projection))

    def vector_knn_search(self, query_embed, year, limit=1000, num_candidates=1000):
        """Run a ``$vectorSearch`` for *query_embed* and keep papers from *year* onwards.

        Args:
            query_embed: Query embedding vector, compared against the
                ``embed`` field through the ``vector_index`` index.
            year: Minimum publication year (inclusive). Documents without a
                ``year`` field are dropped.
            limit: Maximum number of documents requested from the search.
            num_candidates: Number of nearest-neighbour candidates to consider.
                Raised to *limit* when smaller, since the candidate pool
                cannot be smaller than the number of requested results.

        Returns:
            A list of dicts with ``id``, ``title``, ``year`` and ``score``,
            in the order returned by the aggregation.
        """
        pipeline = [
            {
                '$vectorSearch': {
                    'index': 'vector_index',
                    'path': 'embed',
                    'queryVector': query_embed,
                    'numCandidates': max(num_candidates, limit),
                    'limit': limit,
                }
            },
            {
                "$project": {
                    '_id': 0,
                    'id': 1,
                    'title': 1,
                    'year': 1,
                    'score': {
                        '$meta': 'vectorSearchScore'
                    },
                }
            },
        ]
        # Use the collection handle created in __init__ instead of resolving
        # the database and collection again through the client.
        cursor = self.col.aggregate(pipeline)
        # The year filter is applied client-side, after the search itself.
        return [
            doc for doc in cursor
            if doc.get('year') is not None and doc['year'] >= year
        ]
64
+
65
+
66
if __name__ == "__main__":
    # Manual smoke test: check connectivity, then run a single KNN query.
    # Imported here so that merely importing this module does not require
    # the OpenAI secret file to be present.
    from openai_embed import QueryEmbedder

    db = MongoDBAccess()
    db.ping()
    # The vector search takes an embedding, not raw text, so embed the
    # sample question before querying.
    query_vector = QueryEmbedder().embed_query("What is text simplification?")
    hits = db.vector_knn_search(query_vector, 1900)
    print(f"{len(hits)} results")
openai_embed.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+
3
+
4
# The OpenAI API key is read from the local `.openai-secret` file.
with open('.openai-secret', 'r') as f:
    # Strip surrounding whitespace so a trailing newline in the file does not
    # become part of the key.
    OPENAI_API_KEY = f.read().strip()
6
+
7
+
8
class QueryEmbedder:
    """Embeds search queries with the ``text-embedding-3-small`` model."""

    def __init__(self) -> None:
        """Create the OpenAI client from the module-level API key."""
        # Defensive strip: a trailing newline read from the secret file must
        # not be sent as part of the key.
        self.client = OpenAI(api_key=OPENAI_API_KEY.strip())

    def embed_query(self, query):
        """Return the embedding vector for *query*.

        Args:
            query: Query text. Only the first 8000 characters are sent.

        Returns:
            The embedding of the (possibly truncated) query, taken from the
            first item of the response.
        """
        res = self.client.embeddings.create(
            model="text-embedding-3-small",
            input=query[0:8000],
            encoding_format="float",
        )
        # Read the typed response directly instead of serialising the whole
        # response to a dict through the deprecated ``.dict()`` method.
        return res.data[0].embedding
search.py CHANGED
@@ -10,6 +10,9 @@ from colbert.indexing.codecs.residual import ResidualCodec
10
 
11
  from utils import filter_pids, decompress_residuals
12
 
 
 
 
13
  INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
14
  INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
15
 
@@ -182,6 +185,17 @@ def search_colbert(query, k):
182
  """
183
  ColBERT search with a query.
184
  """
 
 
 
 
 
 
 
 
 
 
 
185
  # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
186
  Q = searcher.encode(query)
187
  Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
 
10
 
11
  from utils import filter_pids, decompress_residuals
12
 
13
+ from openai_embed import QueryEmbedder
14
+ from knn_db_access import MongoDBAccess
15
+
16
  INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
17
  INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
18
 
 
185
  """
186
  ColBERT search with a query.
187
  """
188
+ # Embed query
189
+ queryEmbed = QueryEmbedder()
190
+ query_embed = queryEmbed.embed_query(query)
191
+
192
+ # Get KNN From MongoDB
193
+ mongoDB = MongoDBAccess()
194
+ knn_results = mongoDB.vector_knn_search(query_embed, 1900)
195
+
196
+ print(knn_results)
197
+
198
+
199
  # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
200
  Q = searcher.encode(query)
201
  Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
templates/index.html CHANGED
@@ -6,8 +6,8 @@
6
  </head>
7
  <body>
8
  <div id="welcome-message" class="welcome-message">
9
- <h1>Welcome!</h1>
10
- <p>Please enter your search terms below</p>
11
  </div>
12
 
13
  <form class='welcome-form' action="/query" method="post">
 
6
  </head>
7
  <body>
8
  <div id="welcome-message" class="welcome-message">
9
+ <h1>ColBERT Article Search Engine</h1>
10
+ <p>By Team 12</p>
11
  </div>
12
 
13
  <form class='welcome-form' action="/query" method="post">