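"""Vector database layer for scraped web content.

Splits each scraped page into chunks, embeds them with a HuggingFace
sentence-transformer model, and persists one Chroma store per URL under
./chroma_db/, pruning the oldest stores once a fixed cap is exceeded.
"""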
import os
import shutil
import uuid

from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

class VectorDatabase:
    def __init__(self):
        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vector_stores = {}  # Vector stores keyed by unique id, one per URL
    def _create_unique_db_path(self, url):
        """Create a unique directory for each scraped URL"""
        # Use a UUID to ensure unique directory names
        unique_id = str(uuid.uuid4())
        db_path = os.path.join("./chroma_db", unique_id)
        os.makedirs(db_path, exist_ok=True)
        return db_path, unique_id
    def process_and_store(self, scraped_data):
        """Process a scraped page and store it in the vector database"""
        # Prune old vector stores before adding a new one
        self._cleanup_old_vector_stores()

        # Wrap the scraped content in a LangChain Document
        document = Document(
            page_content=scraped_data['content'],
            metadata={
                "source": scraped_data['url'],
                "domain": scraped_data['domain']
            }
        )

        # Split the document into overlapping chunks
        chunks = self.text_splitter.split_documents([document])

        # Create a unique path for this URL's vector store
        db_path, unique_id = self._create_unique_db_path(scraped_data['url'])

        # Create and persist the vector store
        vector_store = Chroma.from_documents(
            chunks,
            self.embedding,
            persist_directory=db_path
        )

        # Keep a reference to this vector store
        self.vector_stores[unique_id] = {
            'store': vector_store,
            'url': scraped_data['url']
        }

        return len(chunks), unique_id
    def search(self, query, url=None, k=3):
        """Search for relevant document chunks"""
        if not self.vector_stores:
            return []

        if url is None:
            # No URL given: fall back to the most recently added store
            # (dicts preserve insertion order in Python 3.7+)
            vector_store = list(self.vector_stores.values())[-1]['store']
        else:
            # Find the vector store for the specific URL
            matching_stores = [
                info['store'] for info in self.vector_stores.values()
                if info['url'] == url
            ]
            if not matching_stores:
                return []
            vector_store = matching_stores[0]

        return vector_store.similarity_search(query, k=k)
    def _cleanup_old_vector_stores(self, max_stores=5):
        """Remove the oldest vector stores to prevent resource exhaustion"""
        if len(self.vector_stores) > max_stores:
            # Dict keys are in insertion order, so the front of the list is oldest
            oldest_keys = list(self.vector_stores.keys())[:len(self.vector_stores) - max_stores]
            for key in oldest_keys:
                # Drop the in-memory reference
                self.vector_stores.pop(key)
                # Remove the persisted Chroma directory from disk
                db_path = os.path.join("./chroma_db", key)
                try:
                    shutil.rmtree(db_path)
                except Exception as e:
                    print(f"Error cleaning up vector store at {db_path}: {e}")