import json

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


class VectorDB:
    """In-memory vector store: documents are embedded with sentence-transformers
    and ranked against a query by cosine similarity."""

    def __init__(self):
        # Embedding model used for both documents and queries.
        self.model = SentenceTransformer("BAAI/bge-small-en-v1.5")
        self.documents = []   # raw records loaded from documents.json
        self.embeddings = []  # becomes an (n_docs, dim) numpy array after load_documents()
        self.dimension = self.model.get_sentence_embedding_dimension()

    def load_documents(self, path="documents.json"):
        """
        Load documents from a JSON file and create embeddings.

        Expected schema:
        [
            {
                "id": "...",
                "data": "text content"
            }
        ]
        """
        with open(path, "r", encoding="utf-8") as f:
            self.documents = json.load(f)

        # Embed all document texts in one batch; row i corresponds to self.documents[i].
        texts = [doc["data"] for doc in self.documents]
        self.embeddings = self.model.encode(texts, convert_to_numpy=True)

    def search(self, query):
        """
        Search documents using cosine similarity.

        Returns a list of:
        {
            id,
            score,
            metadata: { text }
        }
        """
        if not self.documents:
            return []

        # Embed the query and score it against every stored document embedding.
        query_embedding = self.model.encode([query], convert_to_numpy=True)
        scores = cosine_similarity(query_embedding, self.embeddings)[0]

        results = []
        for idx, score in enumerate(scores):
            results.append({
                "id": self.documents[idx]["id"],
                "score": float(score),
                "metadata": {
                    "text": self.documents[idx]["data"]
                }
            })

        results.sort(key=lambda x: x["score"], reverse=True)

        return results
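

# Minimal usage sketch (assumptions: a "documents.json" file exists in the working
# directory and matches the schema documented in load_documents(); the query string
# below is purely illustrative).
if __name__ == "__main__":
    db = VectorDB()
    db.load_documents("documents.json")
    for hit in db.search("example query")[:3]:
        print(f"{hit['score']:.3f}  {hit['id']}  {hit['metadata']['text'][:80]}")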