davidheineman
/

colbert-acl

Model card Files Files and versions Community

davidheineman committed on Apr 26

Commit

335b0ad

•

1 Parent(s): 7502d6f

add knn db

Browse files

Files changed (6) hide show

.gitignore +3 -1
README.md +1 -1
knn_db_access.py +69 -0
openai_embed.py +18 -0
search.py +14 -0
templates/index.html +2 -2

.gitignore CHANGED Viewed

@@ -1,2 +1,4 @@
 __pycache__
-experiments

 __pycache__
+experiments
+.openai-secret
+.mongodb-secret

README.md CHANGED Viewed

@@ -13,7 +13,7 @@ pip install bibtexparser colbert-ir[torch,faiss-gpu]
 ## Setup server
 Install pip dependencies
 ```sh
-pip install mysql-connector-python flask
 ```
 Set up a local MySQL server:

 ## Setup server
 Install pip dependencies
 ```sh
+pip install mysql-connector-python flask openai pymongo[srv]
 ```
 Set up a local MySQL server:

knn_db_access.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from pymongo.mongo_client import MongoClient
+from pymongo.server_api import ServerApi
+USER = "test"
+SERVER = "dbbackend.c9tcfpp"
+with open('.mongodb-secret', 'r') as f:
+    PASS = f.read()
+class MongoDBAccess:
+    def __init__(self) -> None:
+        self.uri = f"mongodb+srv://{USER}:{PASS}@{SERVER}.mongodb.net/?retryWrites=true&w=majority&appName=DBBackend"
+        self.client = MongoClient(self.uri, server_api=ServerApi('1'))
+        self.database = self.client["ColBERTPapers"]
+        self.col = self.database["papers"]
+    def ping(self) -> None:
+        try:
+            self.client.admin.command('ping')
+            print("Pinged your deployment. You successfully connected to MongoDB!")
+        except Exception as e:
+            print(e)
+    def article_info_from_id_list(self, id_list:int):
+        query = {"id": {'$in': id_list}}
+        doc = self.col.find(query, {"id": 1, "title": 1, "year": 1, "author": 1, "abstract": 1})
+        res = []
+        for x in doc:
+            res.append(x)
+        return res
+    def vector_knn_search(self, query_embed, year):
+        pipeline = [
+            {
+                '$vectorSearch': {
+                    'index': 'vector_index',
+                    'path': 'embed',
+                    'queryVector': query_embed,
+                    'numCandidates': 1000,
+                    'limit': 1000
+                }
+            },
+            {
+                "$project": {
+                    '_id': 0,
+                    'id': 1,
+                    'title': 1,
+                    'year': 1,
+                    #'author': 1,
+                    #'abstract': 1,
+                    'score': {
+                        '$meta': 'vectorSearchScore'
+                    }
+                }
+            }
+        ]
+        res = self.client["ColBERTPapers"]["papers"].aggregate(pipeline)
+        res_list = []
+        for i in res:
+            if (i['year'] >= year):
+                res_list.append(i)
+        return res_list
+if __name__ == "__main__":
+    # Manual smoke test: connect, ping the deployment, then issue one search.
+    db = MongoDBAccess()
+    db.ping()
+    # NOTE(review): this passes the raw query text, which vector_knn_search forwards
+    # unchanged as 'queryVector'; presumably an embedding (list of floats) is expected
+    # there — confirm. The returned list is also discarded.
+    db.vector_knn_search("What is text simplification?", 1900)

openai_embed.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from openai import OpenAI
+with open('.openai-secret', 'r') as f:
+    OPENAI_API_KEY = f.read()
+class QueryEmbedder:
+    def __init__(self) -> None:
+        self.client = OpenAI(api_key=OPENAI_API_KEY)
+    def embed_query(self, query):
+        res = self.client.embeddings.create(
+            model="text-embedding-3-small",
+            input=query[0:8000],
+            encoding_format="float"
+        )
+        return res.dict()['data'][0]['embedding']

search.py CHANGED Viewed

@@ -10,6 +10,9 @@ from colbert.indexing.codecs.residual import ResidualCodec
 from utils import filter_pids, decompress_residuals
 INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
 INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
@@ -182,6 +185,17 @@ def search_colbert(query, k):
     """
     ColBERT search with a query.
     """
     # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
     Q = searcher.encode(query)
     Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens

 from utils import filter_pids, decompress_residuals
+from openai_embed import QueryEmbedder
+from knn_db_access import MongoDBAccess
 INDEX_NAME = os.getenv("INDEX_NAME", 'index_large')
 INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
     """
     ColBERT search with a query.
     """
+    # Embed query
+    queryEmbed = QueryEmbedder()
+    query_embed = queryEmbed.embed_query(query)
+    # Get KNN From MongoDB
+    mongoDB = MongoDBAccess()
+    knn_results = mongoDB.vector_knn_search(query_embed, 1900)
+    print(knn_results)
     # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
     Q = searcher.encode(query)
     Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens

templates/index.html CHANGED Viewed

@@ -6,8 +6,8 @@
     </head>
     <body>
         <div id="welcome-message" class="welcome-message">
-            <h1>Welcome!</h1>
-            <p>Please enter your search terms below</p>
         </div>
         <form class='welcome-form' action="/query" method="post">

     </head>
     <body>
         <div id="welcome-message" class="welcome-message">
+            <h1>ColBERT Article Search Engine</h1>
+            <p>By Team 12</p>
         </div>
         <form class='welcome-form' action="/query" method="post">