from pathlib import Path
import os
import time

import requests
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.vectorstores import VectorStoreRetriever
from langchain_text_splitters import RecursiveCharacterTextSplitter

# embedding functions
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# vectorization functions
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.retrievers import BM25Retriever

from chains import generate_document_summary_prompt
from rag_app.utils.generate_summary import generate_description, generate_keywords
from config import EMBEDDING_MODEL, FAISS_INDEX_PATH, SEVEN_B_LLM_MODEL
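# NOTE: the config constants above are assumed to look roughly like the
# following (hypothetical values, adjust to the actual config module):
#   EMBEDDING_MODEL   = "sentence-transformers/all-MiniLM-L6-v2"
#   FAISS_INDEX_PATH  = "./vectorstore/faiss-insurance-agent-500"
#   SEVEN_B_LLM_MODEL = a 7B chat model exposed as a `BaseChatModel`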

def create_embeddings(
        docs: list[Document],
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        embedding_model: str = EMBEDDING_MODEL,
        ):
    """Generate an embedding function and chunked documents for a list of
    `Document` objects.

    Args:
        docs (list[Document]): the documents to chunk and embed
        chunk_size (int): size of the chunks the documents are split into, defaults to 500
        chunk_overlap (int): number of characters shared between consecutive chunks, defaults to 50
        embedding_model (str): the Hugging Face model used to embed the documents

    Returns:
        Tuple of the embedding function and the list of chunks
    """

    text_splitter = RecursiveCharacterTextSplitter(
        # is_separator_regex=True is required for the sentence-boundary
        # lookbehind "(?<=\. )" to be treated as a regex rather than a literal
        separators=["\n\n", "\n", "(?<=\. )", " ", ""],
        is_separator_regex=True,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )

    # Stage one: read all the docs and split them into chunks.
    st = time.time()
    print('Loading documents and creating chunks ...')

    # Split each document into chunks, carrying its source metadata over
    chunks = text_splitter.create_documents(
        [doc.page_content for doc in docs],
        metadatas=[doc.metadata for doc in docs],
    )
    et = time.time() - st
    print(f'Time taken to chunk {len(docs)} documents: {et} seconds.')

    # Stage two: build the embedding function used to embed the chunks.
    embeddings = SentenceTransformerEmbeddings(model_name=embedding_model)
    print(f"created a total of {len(chunks)} chunks")

    return embeddings, chunks
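
# A minimal usage sketch (illustrative only; the sample document is an
# assumption, not part of the module API):
#
#   docs = [Document(page_content="Some policy text ...", metadata={"source": "faq.md"})]
#   embeddings, chunks = create_embeddings(docs, chunk_size=500, chunk_overlap=50)
#   # `embeddings` is the embedding *function*; vectors are computed lazily
#   # when the chunks are loaded into a vector store.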


def generate_document_summaries(
        docs: list[Document],
        llm:BaseChatModel= SEVEN_B_LLM_MODEL,
    ) -> list[Document]:
    """
    Generates summaries for a list of Document objects and updates their metadata with the summaries.

    Args:
        docs (list[Document]): A list of Document objects to generate summaries for.
        llm (BaseChatModel): The chat model used to write the summaries, defaults to SEVEN_B_LLM_MODEL.

    Returns:
        list[Document]: A new list of Document objects with updated metadata containing the summaries.
    Example:
        docs = [Document(metadata={"title": "Doc1"}), Document(metadata={"title": "Doc2"})]
        updated_docs = generate_document_summaries(docs)
        for doc in updated_docs:
            print(doc.metadata["summary"])

    """
    
    # Build the summary chain once and reuse it for every document
    generate_summary_chain = generate_document_summary_prompt | llm

    new_docs = []
    for doc in docs:
        summary = generate_summary_chain.invoke(
            {"document": str(doc.metadata)}
        )
        # Chat models return a message object; store its text content
        summary_text = getattr(summary, "content", summary)

        # Create a fresh Document so the caller's metadata is not mutated in place
        new_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={**doc.metadata, "summary": summary_text},
            )
        )

    return new_docs


def build_vector_store(
        docs: list,
        embedding_model: str,
        new_db: bool = False,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        ):
    """Chunk the documents, embed them, and load them into FAISS and Chroma stores.

    Args:
        docs (list[Document]): the documents to index
        embedding_model (str): the Hugging Face model used to embed the documents
        new_db (bool): if True, build a fresh FAISS index; otherwise extend the index at FAISS_INDEX_PATH
        chunk_size (int): size of the chunks the documents are split into, defaults to 500
        chunk_overlap (int): number of characters shared between consecutive chunks, defaults to 50

    Returns:
        A status string naming the FAISS index location.
    """

    embeddings, chunks = create_embeddings(
        docs,
        chunk_size,
        chunk_overlap,
        embedding_model,
    )

    # load chunks into the FAISS vector store
    print('Loading chunks into faiss vector store ...')

    st = time.time()
    if new_db:
        db_faiss = FAISS.from_documents(chunks, embeddings)
    else:
        # `add_documents` is an instance method: load the existing index
        # first, then append the new chunks to it
        db_faiss = FAISS.load_local(
            FAISS_INDEX_PATH, embeddings, allow_dangerous_deserialization=True
        )
        db_faiss.add_documents(chunks)

    # BM25 is a purely lexical index, so it is rebuilt from the chunks each time
    bm25_retriever = BM25Retriever.from_documents(chunks)

    db_faiss.save_local(FAISS_INDEX_PATH)
    et = time.time() - st
    print(f'Time taken: {et} seconds.')

    # load chunks into the Chroma vector store
    print('Loading chunks into chroma vector store ...')

    st = time.time()
    persist_directory = './vectorstore/chroma-insurance-agent-1500'
    db_chroma = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
    et = time.time() - st

    print(f'Time taken: {et} seconds.')
    result = f"built vector store at {FAISS_INDEX_PATH}"
    return result
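

# A minimal end-to-end sketch (illustrative only; the sample document and the
# `new_db=True` choice are assumptions, not part of the module API):
if __name__ == "__main__":
    sample_docs = [
        Document(
            page_content="Example insurance policy text for a smoke test.",
            metadata={"source": "example.txt", "title": "Example policy"},
        )
    ]
    # Summarize first, then chunk, embed, and index the documents
    sample_docs = generate_document_summaries(sample_docs)
    print(build_vector_store(sample_docs, EMBEDDING_MODEL, new_db=True))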