import os

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
# Import the source datasets into the vector DB
# Reference: https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
# df_mental_health = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="98_row_Mental_Health_FAQs")
# df_counsellor_chats = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="Counsellor_Chats")
# df_human_therapist = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="99_rows_Human_&_Therapist")
# Get the directory path of the current script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Load the source PDF into LangChain documents
loader = PyMuPDFLoader(os.path.join(script_dir, 'Data', 'PDFs', 'DepressionGuide-web.pdf'))
documents = loader.load()
# Create the open-source embedding function
# Docs: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)

# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# The vectorstore to use to index the child chunks
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=embedding_function
)

# The storage layer for the parent documents
store = InMemoryStore()
def instantiate_rag():
    """Build a ParentDocumentRetriever over the loaded documents and return it."""
    rag_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    # Index the loaded PDF: child chunks are embedded into Chroma for similarity
    # search, while the full parent chunks are kept in the in-memory docstore.
    rag_retriever.add_documents(documents)
    return rag_retriever
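

# A minimal usage sketch for running this module directly. The query string is
# illustrative only (an assumption, not taken from the project data), and
# `get_relevant_documents` is the standard LangChain retriever call.
if __name__ == "__main__":
    retriever = instantiate_rag()
    # Similarity search runs over the small child chunks; the retriever then
    # returns the larger parent chunks they belong to.
    results = retriever.get_relevant_documents("What are common symptoms of depression?")
    for doc in results:
        print(doc.page_content[:200])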