"""
Vector database handler for storing and retrieving text chunks using embeddings.

TODO:
- Persist the FAISS index to disk or swap in a managed backend (e.g., Pinecone).
- Customize embedding provider as needed.
"""

from langchain_community.vectorstores import FAISS
from backend.core.embeddings import EmbeddingProvider

class VectorDatabase:
    """
    Handles the creation and querying of a vector database using text embeddings.
    """

    def __init__(self):
        self.db = None
        self.embedding_provider = EmbeddingProvider()

    async def abuild_from_list(self, chunks):
        """
        Build the vector database from a list of text chunks.

        Parameters
        ----------
        chunks : list of str
            The list of preprocessed text segments.
        """
        # Use FAISS's async constructor so this coroutine actually awaits the
        # embedding work (assumes the embedding model supports LangChain's async
        # embedding interface, which the base Embeddings class provides).
        self.db = await FAISS.afrom_texts(texts=chunks, embedding=self.embedding_provider.model)

    def search_by_text(self, query, k=4):
        """
        Search the vector database for the most relevant chunks based on the query.

        Parameters
        ----------
        query : str
            The user's input question or topic.
        k : int, optional
            The number of top matches to return (default is 4).

        Returns
        -------
        list of tuple
            (Document, score) pairs for the top matches; with the default
            FAISS L2 index, a lower score means a closer match.
        """
        if self.db is None:
            raise ValueError("Vector database is not initialized.")
        return self.db.similarity_search_with_score(query, k=k)
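

# Minimal usage sketch: builds an index from a couple of placeholder chunks and
# runs a query. It assumes EmbeddingProvider.model is a LangChain-compatible
# embeddings object (as the calls above imply); the sample texts, the query,
# and the "faiss_index" path mentioned below are illustrative only.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        vdb = VectorDatabase()
        await vdb.abuild_from_list([
            "FAISS stores dense vectors for fast similarity search.",
            "Embeddings map text chunks into a shared vector space.",
        ])
        # similarity_search_with_score returns (Document, score) pairs.
        for doc, score in vdb.search_by_text("How is text searched?", k=2):
            print(f"{score:.4f}  {doc.page_content}")

        # The persistence TODO at the top of this module could be tackled with
        # FAISS's own save_local / load_local helpers, roughly:
        #     vdb.db.save_local("faiss_index")
        #     vdb.db = FAISS.load_local(
        #         "faiss_index",
        #         vdb.embedding_provider.model,
        #         allow_dangerous_deserialization=True,  # required by recent releases
        #     )

    asyncio.run(_demo())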