from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import uuid
import os
import shutil


class VectorDatabase:
    def __init__(self):
        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vector_stores = {}  # Store vector stores per URL

    def _create_unique_db_path(self, url):
        """Create a unique directory for each scraped URL"""
        # Use UUID to ensure unique directory names
        unique_id = str(uuid.uuid4())
        db_path = os.path.join("./chroma_db", unique_id)
        os.makedirs(db_path, exist_ok=True)
        return db_path, unique_id

    def process_and_store(self, scraped_data):
        """Process documents and store in vector database"""
        # Clean up any existing vector stores if they exist
        self._cleanup_old_vector_stores()

        # Create a LangChain Document object
        document = Document(
            page_content=scraped_data['content'],
            metadata={
                "source": scraped_data['url'],
                "domain": scraped_data['domain']
            }
        )

        # Split the document into chunks
        chunks = self.text_splitter.split_documents([document])

        # Create a unique path for this URL's vector store
        db_path, unique_id = self._create_unique_db_path(scraped_data['url'])

        # Create and persist the vector store
        vector_store = Chroma.from_documents(
            chunks,
            self.embedding,
            persist_directory=db_path
        )

        # Store reference to this vector store
        self.vector_stores[unique_id] = {
            'store': vector_store,
            'url': scraped_data['url']
        }

        return len(chunks), unique_id

    def search(self, query, url=None, k=3):
        """Search for relevant documents"""
        if not self.vector_stores:
            return []

        # If no specific URL provided, use the most recently added vector store
        if url is None:
            # Get the last added vector store
            vector_store_info = list(self.vector_stores.values())[-1]
            vector_store = vector_store_info['store']
        else:
            # Find vector store for specific URL
            matching_stores = [
                info['store'] for info in self.vector_stores.values()
                if info['url'] == url
            ]
            if not matching_stores:
                return []
            vector_store = matching_stores[0]

        return vector_store.similarity_search(query, k=k)

    def _cleanup_old_vector_stores(self, max_stores=5):
        """Clean up old vector stores to prevent resource exhaustion"""
        if len(self.vector_stores) > max_stores:
            # Remove the oldest vector stores (dicts preserve insertion order)
            oldest_keys = list(self.vector_stores.keys())[:len(self.vector_stores) - max_stores]
            for key in oldest_keys:
                # Remove from dictionary
                self.vector_stores.pop(key)

                # Remove the physical directory
                db_path = os.path.join("./chroma_db", key)
                try:
                    shutil.rmtree(db_path)
                except Exception as e:
                    print(f"Error cleaning up vector store: {e}")
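

# A minimal usage sketch, not part of the class above: the shape of the
# scraped_data dict ({'content', 'url', 'domain'}) is inferred from
# process_and_store, and the example URL and content are placeholders
# standing in for whatever the scraper actually returns.
if __name__ == "__main__":
    db = VectorDatabase()

    scraped_data = {
        'content': "Example page text gathered by a scraper...",
        'url': "https://example.com/article",
        'domain': "example.com",
    }

    # Chunk, embed, and persist the page, then query the store just created
    num_chunks, store_id = db.process_and_store(scraped_data)
    print(f"Stored {num_chunks} chunks under id {store_id}")

    # Passing the URL routes the query to that page's vector store;
    # omitting it falls back to the most recently added store.
    results = db.search("What is this page about?", url=scraped_data['url'])
    for doc in results:
        print(doc.metadata['source'], doc.page_content[:80])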