from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import uuid
import os
import shutil


class VectorDatabase:
    def __init__(self):
        # Local sentence-transformers model used to embed text chunks
        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        # Maps each store's unique ID to its Chroma instance and source URL
        self.vector_stores = {}

    def _create_unique_db_path(self, url):
        """Create a unique directory for each scraped URL"""
        unique_id = str(uuid.uuid4())
        db_path = os.path.join("./chroma_db", unique_id)
        os.makedirs(db_path, exist_ok=True)
        return db_path, unique_id

    def process_and_store(self, scraped_data):
        """Process documents and store in vector database"""
        # Evict the oldest stores first so disk usage stays bounded
        self._cleanup_old_vector_stores()

        document = Document(
            page_content=scraped_data['content'],
            metadata={
                "source": scraped_data['url'],
                "domain": scraped_data['domain']
            }
        )

        # Split the page into overlapping chunks before embedding
        chunks = self.text_splitter.split_documents([document])

        db_path, unique_id = self._create_unique_db_path(scraped_data['url'])

        # Embed the chunks and persist them in a per-URL Chroma directory
        vector_store = Chroma.from_documents(
            chunks,
            self.embedding,
            persist_directory=db_path
        )

        self.vector_stores[unique_id] = {
            'store': vector_store,
            'url': scraped_data['url']
        }

        return len(chunks), unique_id

    def search(self, query, url=None, k=3):
        """Search for relevant documents"""
        if not self.vector_stores:
            return []

        if url is None:
            # Default to the most recently added store (dicts preserve
            # insertion order in Python 3.7+)
            vector_store_info = list(self.vector_stores.values())[-1]
            vector_store = vector_store_info['store']
        else:
            # Otherwise look up the store that was built for this URL
            matching_stores = [
                info['store'] for info in self.vector_stores.values()
                if info['url'] == url
            ]
            if not matching_stores:
                return []
            vector_store = matching_stores[0]

        return vector_store.similarity_search(query, k=k)

    def _cleanup_old_vector_stores(self, max_stores=5):
        """Clean up old vector stores to prevent resource exhaustion"""
        if len(self.vector_stores) > max_stores:
            # Insertion order means the first keys are the oldest stores
            oldest_keys = list(self.vector_stores.keys())[:len(self.vector_stores) - max_stores]
            for key in oldest_keys:
                self.vector_stores.pop(key)
                # Remove the persisted Chroma directory for this store
                db_path = os.path.join("./chroma_db", key)
                try:
                    shutil.rmtree(db_path)
                except Exception as e:
                    print(f"Error cleaning up vector store: {e}")