File size: 3,691 Bytes
5f315ec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100



from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document
import uuid
import os
import shutil

class VectorDatabase:
    """Chunk scraped pages, embed them, and keep one Chroma store per scrape.

    Each call to ``process_and_store`` persists a new Chroma collection in
    its own sub-directory of ``DB_ROOT`` and registers it in
    ``self.vector_stores``. Only the most recent stores are retained; older
    ones are dropped from the registry and their directories deleted.
    """

    # Root directory under which every per-scrape store is persisted.
    # Kept in one place so creation and cleanup can never disagree.
    DB_ROOT = "./chroma_db"

    def __init__(self):
        self.embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
        )
        # unique_id -> {'store': <vector store>, 'url': <source url>}.
        # Dicts preserve insertion order, so iteration order is oldest-first.
        self.vector_stores = {}

    def _create_unique_db_path(self, url):
        """Create a fresh, uniquely named directory for one vector store.

        Args:
            url: The scraped URL. Currently unused (the directory name is a
                random UUID); kept so existing callers keep working.

        Returns:
            A ``(db_path, unique_id)`` tuple: the created directory and the
            UUID string that names it.
        """
        unique_id = str(uuid.uuid4())
        db_path = os.path.join(self.DB_ROOT, unique_id)
        os.makedirs(db_path, exist_ok=True)
        return db_path, unique_id

    def process_and_store(self, scraped_data):
        """Split scraped content into chunks and persist them in a new store.

        Args:
            scraped_data: Mapping with the keys ``'content'`` (page text),
                ``'url'`` and ``'domain'``; the latter two are stored as the
                chunks' ``source`` and ``domain`` metadata.

        Returns:
            A ``(chunk_count, unique_id)`` tuple, where ``unique_id`` is the
            key of the new entry in ``self.vector_stores``.

        Raises:
            KeyError: If ``scraped_data`` lacks one of the required keys.
            Exception: Whatever the vector store raises while building the
                collection; the freshly created directory is removed first.
        """
        source_url = scraped_data['url']

        # Wrap the raw page in a LangChain Document so the splitter carries
        # the metadata over to every chunk.
        document = Document(
            page_content=scraped_data['content'],
            metadata={
                "source": source_url,
                "domain": scraped_data['domain'],
            },
        )
        chunks = self.text_splitter.split_documents([document])

        db_path, unique_id = self._create_unique_db_path(source_url)

        try:
            vector_store = Chroma.from_documents(
                chunks,
                self.embedding,
                persist_directory=db_path,
            )
        except Exception:
            # Do not leave an orphaned directory behind when indexing fails.
            shutil.rmtree(db_path, ignore_errors=True)
            raise

        self.vector_stores[unique_id] = {
            'store': vector_store,
            'url': source_url,
        }

        # Evict only after registering the new store, so the retention limit
        # counts it and the limit is not exceeded by one.
        self._cleanup_old_vector_stores()

        return len(chunks), unique_id

    def search(self, query, url=None, k=3):
        """Run a similarity search against one of the registered stores.

        Args:
            query: The text to search for.
            url: If given, search the most recently added store whose source
                URL equals this value; otherwise search the most recently
                added store overall.
            k: Maximum number of results to request from the store.

        Returns:
            Whatever the store's ``similarity_search`` returns, or an empty
            list when no store is registered or none matches ``url``.
        """
        if not self.vector_stores:
            return []

        # Walk newest-first so a re-scraped URL resolves to its latest
        # snapshot instead of a stale earlier one.
        for info in reversed(list(self.vector_stores.values())):
            if url is None or info['url'] == url:
                return info['store'].similarity_search(query, k=k)

        return []

    def _cleanup_old_vector_stores(self, max_stores=5):
        """Drop the oldest stores so that at most ``max_stores`` remain.

        Evicted stores are removed from ``self.vector_stores`` and their
        directories under ``DB_ROOT`` are deleted. Deletion is best-effort:
        a filesystem error is reported and the cleanup continues.

        Args:
            max_stores: Number of most recent stores to keep.
        """
        excess = len(self.vector_stores) - max_stores
        if excess <= 0:
            return

        # Snapshot the keys first: the dict is mutated inside the loop.
        for key in list(self.vector_stores)[:excess]:
            del self.vector_stores[key]

            db_path = os.path.join(self.DB_ROOT, key)
            try:
                shutil.rmtree(db_path)
            except OSError as e:
                print(f"Error cleaning up vector store: {e}")