from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
import os


curr_dir = os.getcwd()
db_path = 'chroma_db_v2'


class QuestionRetriever:

    def load_documents(self, file_name):
        # Load a plain-text question file from the local "data" directory.
        current_directory = os.getcwd()
        data_directory = os.path.join(current_directory, "data")
        file_path = os.path.join(data_directory, file_name)
        loader = TextLoader(file_path)
        documents = loader.load()
        return documents

    def store_data_in_vector_db(self, documents):
        # Split on newlines so each chunk holds roughly one question.
        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=0, separator="\n")
        docs = text_splitter.split_documents(documents)

        embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

        db = Chroma.from_documents(docs, embedding_function)
        return db

    def get_response(self, user_query, documents):
        db = self.store_data_in_vector_db(documents)

        # Return the closest stored question; if it is identical to the query,
        # fall back to the second-closest match instead.
        docs = db.similarity_search(user_query)
        most_similar_question = docs[0].page_content.split("\n")[0]
        if user_query == most_similar_question:
            most_similar_question = docs[1].page_content.split("\n")[0]

        print(most_similar_question)
        return most_similar_question
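
# A minimal usage sketch (illustrative only): "questions.txt" is a placeholder
# file name, assumed to live in the local ./data directory.
#
#   retriever = QuestionRetriever()
#   documents = retriever.load_documents("questions.txt")
#   retriever.get_response("example user question", documents)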


def process_pdf_document(file_path, parent_chunk_size=2000, child_chunk_size=500):
    '''
    Process a PDF document and return the documents and text splitters.

    Args:
        file_path (str): The path to the PDF document
        parent_chunk_size (int): The size of the parent chunks
        child_chunk_size (int): The size of the child chunks

    Returns:
        documents (list): The list of documents
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents
    '''
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()

    parent_splitter = RecursiveCharacterTextSplitter(chunk_size=parent_chunk_size)
    child_splitter = RecursiveCharacterTextSplitter(chunk_size=child_chunk_size)

    return documents, parent_splitter, child_splitter


def create_vectorstore(embeddings_model="all-MiniLM-L6-v2"):
    '''
    Create the vectorstore and docstore for the documents.

    Args:
        embeddings_model (str): The name of the HuggingFace embeddings model

    Returns:
        vectorstore (Chroma): The vectorstore holding the child chunks
        store (InMemoryStore): The in-memory store holding the parent documents
    '''
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)

    # Persist the child-chunk collection to disk so it can be reloaded later.
    vectorstore = Chroma(
        collection_name="split_parents",
        embedding_function=embeddings,
        persist_directory=db_path,
    )

    # The docstore keeps the full parent documents in memory.
    store = InMemoryStore()

    return vectorstore, store



def rag_retriever(vectorstore, store, parent_splitter, child_splitter):
    '''
    Create the retriever for the RAG model.

    Args:
        vectorstore (Chroma): The vectorstore
        store (InMemoryStore): The store
        parent_splitter (RecursiveCharacterTextSplitter): The text splitter for the parent documents
        child_splitter (RecursiveCharacterTextSplitter): The text splitter for the child documents

    Returns:
        retriever (ParentDocumentRetriever): The retriever
    '''
    # Index the small child chunks in the vectorstore while serving the larger
    # parent chunks from the docstore at query time.
    retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )

    return retriever
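
# End-to-end sketch (illustrative only): "data/handbook.pdf" is a placeholder
# path and the query string is an example. The retriever must be fed the
# parent documents via add_documents before it can answer queries.
#
#   documents, parent_splitter, child_splitter = process_pdf_document("data/handbook.pdf")
#   vectorstore, store = create_vectorstore()
#   retriever = rag_retriever(vectorstore, store, parent_splitter, child_splitter)
#   retriever.add_documents(documents)
#   relevant_docs = retriever.invoke("example question about the document")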