import os

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
# Import the source datasets into the vector DB
# Reference: https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
# df_mental_health = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="98_row_Mental_Health_FAQs")
# df_counsellor_chats = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="Counsellor_Chats")
# df_human_therapist = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="99_rows_Human_&_Therapist")
# Get the directory path of the current script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Load the source PDF into LangChain documents
loader = PyMuPDFLoader(os.path.join(script_dir, 'Data', 'PDFs', 'DepressionGuide-web.pdf'))
documents = loader.load()
# Create the open-source embedding function
# Docs: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embedding_function
)

# The storage layer for the parent documents
store = InMemoryStore()
def instantiate_rag():
    """Build a ParentDocumentRetriever over the loaded documents and return it."""
    rag_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    # Index the loaded PDF: child chunks are embedded into Chroma for similarity
    # search, while the full parent chunks are kept in the in-memory docstore.
    rag_retriever.add_documents(documents)
    return rag_retriever
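

# A minimal usage sketch for running this module directly. The query string is
# illustrative only (an assumption, not taken from the project data), and
# `get_relevant_documents` is the standard LangChain retriever call.
if __name__ == "__main__":
    retriever = instantiate_rag()
    # Similarity search runs over the small child chunks; the retriever then
    # returns the larger parent chunks they belong to.
    results = retriever.get_relevant_documents("What are common symptoms of depression?")
    for doc in results:
        print(doc.page_content[:200])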