davidheineman
committed on
Commit
•
335b0ad
1
Parent(s):
7502d6f
add knn db
Browse files- .gitignore +3 -1
- README.md +1 -1
- knn_db_access.py +69 -0
- openai_embed.py +18 -0
- search.py +14 -0
- templates/index.html +2 -2
.gitignore
CHANGED
@@ -1,2 +1,4 @@
|
|
1 |
__pycache__
|
2 |
-
experiments
|
|
|
|
|
|
1 |
__pycache__
|
2 |
+
experiments
|
3 |
+
.openai-secret
|
4 |
+
.mongodb-secret
|
README.md
CHANGED
@@ -13,7 +13,7 @@ pip install bibtexparser colbert-ir[torch,faiss-gpu]
|
|
13 |
## Setup server
|
14 |
Install pip dependencies
|
15 |
```sh
|
16 |
-
pip install mysql-connector-python flask
|
17 |
```
|
18 |
|
19 |
Set up a local MySQL server:
|
|
|
13 |
## Setup server
|
14 |
Install pip dependencies
|
15 |
```sh
|
16 |
+
pip install mysql-connector-python flask openai pymongo[srv]
|
17 |
```
|
18 |
|
19 |
Set up a local MySQL server:
|
knn_db_access.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pymongo.mongo_client import MongoClient
|
2 |
+
from pymongo.server_api import ServerApi
|
3 |
+
|
4 |
+
|
5 |
+
# MongoDB connection settings. USER and SERVER are hard-coded for this
# deployment; the password is read from the git-ignored `.mongodb-secret` file.
USER = "test"
SERVER = "dbbackend.c9tcfpp"
with open('.mongodb-secret', 'r') as f:
    # Strip surrounding whitespace: secret files normally end with a newline,
    # which would otherwise be embedded in the connection URI built from PASS.
    PASS = f.read().strip()
|
9 |
+
|
10 |
+
|
11 |
+
class MongoDBAccess:
    """Access layer for the ``papers`` collection of the ``ColBERTPapers`` database."""

    def __init__(self) -> None:
        """Build the SRV connection URI from the module-level credentials and connect."""
        self.uri = f"mongodb+srv://{USER}:{PASS}@{SERVER}.mongodb.net/?retryWrites=true&w=majority&appName=DBBackend"
        self.client = MongoClient(self.uri, server_api=ServerApi('1'))
        self.database = self.client["ColBERTPapers"]
        self.col = self.database["papers"]

    def ping(self) -> None:
        """Send a ``ping`` admin command and print the outcome.

        This is a best-effort diagnostic: any failure is printed rather than
        raised, so the method always returns ``None``.
        """
        try:
            self.client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

    def article_info_from_id_list(self, id_list: list):
        """Return the stored documents whose ``id`` is in ``id_list``.

        Args:
            id_list: Iterable of article ids to look up.

        Returns:
            A list of documents restricted to the ``id``, ``title``, ``year``,
            ``author`` and ``abstract`` fields (``_id`` is not excluded by the
            projection, so it is returned as well).
        """
        # `$in` needs an array; list() also lets callers pass tuples or sets.
        query = {"id": {'$in': list(id_list)}}
        projection = {"id": 1, "title": 1, "year": 1, "author": 1, "abstract": 1}
        return list(self.col.find(query, projection))

    def vector_knn_search(self, query_embed, year):
        """Run a ``$vectorSearch`` over the ``embed`` field and filter by year.

        Args:
            query_embed: Query vector forwarded as ``queryVector``.
            year: Minimum ``year`` (inclusive) a hit must have to be kept.

        Returns:
            A list of dicts with ``id``, ``title``, ``year`` and ``score``
            (the ``vectorSearchScore`` metadata), in the order the pipeline
            yields them. Documents without a ``year`` are skipped.
        """
        pipeline = [
            {
                '$vectorSearch': {
                    'index': 'vector_index',
                    'path': 'embed',
                    'queryVector': query_embed,
                    # NOTE(review): numCandidates equals limit here; confirm
                    # against the $vectorSearch docs whether a larger
                    # candidate pool is wanted for recall.
                    'numCandidates': 1000,
                    'limit': 1000
                }
            },
            {
                "$project": {
                    '_id': 0,
                    'id': 1,
                    'title': 1,
                    'year': 1,
                    'score': {
                        '$meta': 'vectorSearchScore'
                    }
                }
            }
        ]
        # self.col is the same ColBERTPapers.papers collection set up in __init__.
        hits = self.col.aggregate(pipeline)
        # The year filter runs client-side; a document lacking `year` is
        # dropped instead of aborting the whole search with a KeyError.
        return [
            hit for hit in hits
            if hit.get('year') is not None and hit['year'] >= year
        ]
|
64 |
+
|
65 |
+
|
66 |
+
if __name__ == "__main__":
    # Manual smoke test: check connectivity, then run one KNN query.
    # vector_knn_search forwards its first argument as the $vectorSearch
    # `queryVector`, so the text is embedded first (the same flow search.py
    # uses) instead of being passed as a raw string.
    from openai_embed import QueryEmbedder

    db = MongoDBAccess()
    db.ping()
    query_embed = QueryEmbedder().embed_query("What is text simplification?")
    # Print the hits so the smoke test shows something.
    print(db.vector_knn_search(query_embed, 1900))
|
openai_embed.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from openai import OpenAI
|
2 |
+
|
3 |
+
|
4 |
+
# The API key is read from the git-ignored `.openai-secret` file.
with open('.openai-secret', 'r') as f:
    # Strip surrounding whitespace: a trailing newline from the file would
    # otherwise become part of the key handed to the OpenAI client.
    OPENAI_API_KEY = f.read().strip()
|
6 |
+
|
7 |
+
|
8 |
+
class QueryEmbedder:
    """Embed query text with OpenAI's ``text-embedding-3-small`` model."""

    def __init__(self) -> None:
        """Create an OpenAI client using the module-level ``OPENAI_API_KEY``."""
        self.client = OpenAI(api_key=OPENAI_API_KEY)

    def embed_query(self, query):
        """Return the embedding vector for ``query``.

        Args:
            query: Query text; only its first 8000 characters are sent.

        Returns:
            The ``embedding`` of the first (and only) item in the response.
        """
        res = self.client.embeddings.create(
            model="text-embedding-3-small",
            # NOTE(review): this is a character cut, not a token cut; confirm
            # it keeps inputs within the model's token limit.
            input=query[0:8000],
            encoding_format="float"
        )
        # Read the field directly rather than serialising the whole response
        # with `.dict()`, which pydantic v2 deprecates.
        return res.data[0].embedding
|
search.py
CHANGED
@@ -10,6 +10,9 @@ from colbert.indexing.codecs.residual import ResidualCodec
|
|
10 |
|
11 |
from utils import filter_pids, decompress_residuals
|
12 |
|
|
|
|
|
|
|
13 |
INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
|
14 |
INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
|
15 |
|
@@ -182,6 +185,17 @@ def search_colbert(query, k):
|
|
182 |
"""
|
183 |
ColBERT search with a query.
|
184 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
# Encode query using ColBERT model, using the appropriate [Q], [D] tokens
|
186 |
Q = searcher.encode(query)
|
187 |
Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
|
|
|
10 |
|
11 |
from utils import filter_pids, decompress_residuals
|
12 |
|
13 |
+
from openai_embed import QueryEmbedder
|
14 |
+
from knn_db_access import MongoDBAccess
|
15 |
+
|
16 |
INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
|
17 |
INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
|
18 |
|
|
|
185 |
"""
|
186 |
ColBERT search with a query.
|
187 |
"""
|
188 |
+
# Embed query
|
189 |
+
queryEmbed = QueryEmbedder()
|
190 |
+
query_embed = queryEmbed.embed_query(query)
|
191 |
+
|
192 |
+
# Get KNN From MongoDB
|
193 |
+
mongoDB = MongoDBAccess()
|
194 |
+
knn_results = mongoDB.vector_knn_search(query_embed, 1900)
|
195 |
+
|
196 |
+
print(knn_results)
|
197 |
+
|
198 |
+
|
199 |
# Encode query using ColBERT model, using the appropriate [Q], [D] tokens
|
200 |
Q = searcher.encode(query)
|
201 |
Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
|
templates/index.html
CHANGED
@@ -6,8 +6,8 @@
|
|
6 |
</head>
|
7 |
<body>
|
8 |
<div id="welcome-message" class="welcome-message">
|
9 |
-
<h1>
|
10 |
-
<p>
|
11 |
</div>
|
12 |
|
13 |
<form class='welcome-form' action="/query" method="post">
|
|
|
6 |
</head>
|
7 |
<body>
|
8 |
<div id="welcome-message" class="welcome-message">
|
9 |
+
<h1>ColBERT Article Search Engine</h1>
|
10 |
+
<p>By Team 12</p>
|
11 |
</div>
|
12 |
|
13 |
<form class='welcome-form' action="/query" method="post">
|