from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import uuid
import os
import shutil


class VectorDatabase:
    def __init__(self):
        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        self.vector_stores = {}  # Store vector stores per URL

    def _create_unique_db_path(self, url):
        """Create a unique directory for each scraped URL"""
        # Use UUID to ensure unique directory names
        unique_id = str(uuid.uuid4())
        db_path = os.path.join("./chroma_db", unique_id)
        os.makedirs(db_path, exist_ok=True)
        return db_path, unique_id

    def process_and_store(self, scraped_data):
        """Process documents and store in vector database"""
        # Clean up any existing vector stores if they exist
        self._cleanup_old_vector_stores()

        # Create a LangChain Document object
        document = Document(
            page_content=scraped_data['content'],
            metadata={
                "source": scraped_data['url'],
                "domain": scraped_data['domain']
            }
        )

        # Split the document into chunks
        chunks = self.text_splitter.split_documents([document])

        # Create a unique path for this URL's vector store
        db_path, unique_id = self._create_unique_db_path(scraped_data['url'])

        # Create and persist the vector store
        vector_store = Chroma.from_documents(
            chunks,
            self.embedding,
            persist_directory=db_path
        )

        # Store reference to this vector store
        self.vector_stores[unique_id] = {
            'store': vector_store,
            'url': scraped_data['url']
        }

        return len(chunks), unique_id

    def search(self, query, url=None, k=3):
        """Search for relevant documents"""
        if not self.vector_stores:
            return []

        # If no specific URL provided, use the most recently added vector store
        if url is None:
            # Get the last added vector store
            vector_store_info = list(self.vector_stores.values())[-1]
            vector_store = vector_store_info['store']
        else:
            # Find vector store for specific URL
            matching_stores = [
                info['store'] for info in self.vector_stores.values()
                if info['url'] == url
            ]
            if not matching_stores:
                return []
            vector_store = matching_stores[0]

        return vector_store.similarity_search(query, k=k)

    def _cleanup_old_vector_stores(self, max_stores=5):
        """Clean up old vector stores to prevent resource exhaustion"""
        if len(self.vector_stores) > max_stores:
            # Remove the oldest vector stores (dicts preserve insertion order)
            oldest_keys = list(self.vector_stores.keys())[:len(self.vector_stores) - max_stores]
            for key in oldest_keys:
                # Remove from dictionary
                self.vector_stores.pop(key)

                # Remove the physical directory
                db_path = os.path.join("./chroma_db", key)
                try:
                    shutil.rmtree(db_path)
                except Exception as e:
                    print(f"Error cleaning up vector store: {e}")
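

# A minimal usage sketch, not part of the class above: the shape of the
# scraped_data dict ({'content', 'url', 'domain'}) is inferred from
# process_and_store, and the example URL and content are placeholders
# standing in for whatever the scraper actually returns.
if __name__ == "__main__":
    db = VectorDatabase()

    scraped_data = {
        'content': "Example page text gathered by a scraper...",
        'url': "https://example.com/article",
        'domain': "example.com",
    }

    # Chunk, embed, and persist the page, then query the store just created
    num_chunks, store_id = db.process_and_store(scraped_data)
    print(f"Stored {num_chunks} chunks under id {store_id}")

    # Passing the URL routes the query to that page's vector store;
    # omitting it falls back to the most recently added store.
    results = db.search("What is this page about?", url=scraped_data['url'])
    for doc in results:
        print(doc.metadata['source'], doc.page_content[:80])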