kishoregajjala committed on
Commit
db7706f
1 Parent(s): 30dbaa8

Upload 8 files

Files changed (8)
  1. README.md +2 -13
  2. app.py +39 -0
  3. llm_generator.py +161 -0
  4. nlp_models.py +39 -0
  5. rag_pipeline.py +55 -0
  6. rag_pipeline_vectordb.py +92 -0
  7. requirements.txt +17 -0
  8. test_vectordb.ipynb +218 -0
README.md CHANGED
@@ -1,13 +1,2 @@
- ---
- title: Mental Health Chatbot
- emoji: 🐠
- colorFrom: purple
- colorTo: blue
- sdk: streamlit
- sdk_version: 1.32.2
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Mental_Health_Chatbot_Integrated
+ Mental_Health_Chatbot_Integrated
app.py ADDED
@@ -0,0 +1,39 @@
+ import streamlit as st
+ import llm_generator
+ from llm_generator import llm_generation
+
+ import time
+
+ # ST: https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
+
+ st.title('Mental Health Therapist')
+
+ def response_generator(response):
+     '''
+     Streams the response text with a typewriter effect.
+     '''
+     response_buffer = response.strip()
+     for word in response_buffer.split():
+         yield word + " "
+         time.sleep(0.1)
+
+ # Initialize chat history
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # Accept user input
+ if user_prompt := st.chat_input("Hello, how are you doing today?"):
+     st.session_state.messages.append({"role": "user", "content": user_prompt})
+     with st.chat_message("user"):
+         st.markdown(user_prompt)
+
+     with st.chat_message("assistant"):
+         response = llm_generation(user_prompt)
+         time.sleep(1)
+         st.write_stream(response_generator(response))
+
+     st.session_state.messages.append({"role": "assistant", "content": response})
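
A minimal standalone sketch of the typewriter pattern used above (standard library only; the sample text is illustrative). The app itself is launched with streamlit run app.py:

    import time

    def response_generator(response: str):
        # yield one word at a time so the caller can render it incrementally
        for word in response.strip().split():
            yield word + " "
            time.sleep(0.1)

    for chunk in response_generator("Hello, I'm here to listen."):
        print(chunk, end="", flush=True)
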
llm_generator.py ADDED
@@ -0,0 +1,161 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_community.llms import HuggingFaceEndpoint
+ from langchain.vectorstores import Chroma
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ import os
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+     SystemMessagePromptTemplate,
+ )  # Docs: https://python.langchain.com/docs/modules/model_io/prompts/message_prompts
+
+ #import chromadb
+
+ # LLM Generator
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ChatMessageHistory, ConversationSummaryBufferMemory, ConversationBufferMemory
+
+ from langchain_experimental.chat_models import Llama2Chat
+ # Docs: https://python.langchain.com/docs/integrations/chat/llama2_chat
+
+
+ # Read the token from the environment instead of referencing an undefined name
+ HUGGINGFACEHUB_API_TOKEN = os.environ.get("HF_ACCESS_TOKEN")
+ #os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
+
+ # TODO: implement another function to pass an array of PDFs / CSVs / Excels
+ from rag_pipeline import instantiate_rag
+ retriever = instantiate_rag()
+
+ #persist_directory="Data/chroma"
+ #chroma_client = chromadb.PersistentClient(persist_directory=persist_directory)
+ #embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+ #vectors = Chroma(persist_directory = persist_directory, embedding_function = embedding_function)
+ #retriever = vectors.as_retriever() #(k=6)
+
+
+ # Set the URL of your Inference Endpoint below
+ #your_endpoint_url = "https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud"
+
+ # How to access the HuggingFaceEndpoint integration of the free Serverless Endpoints API
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+ llm = HuggingFaceEndpoint(
+     #endpoint_url=f"{your_endpoint_url}",
+     repo_id=repo_id,
+     #max_length=128,
+     max_new_tokens=512,
+     token=HUGGINGFACEHUB_API_TOKEN,
+     temperature=0.1,
+     repetition_penalty=1.1,
+     #context_length: 4096, # set to max for chat summary; Llama-2 has a max context length of 4096
+     stream=True,
+     callbacks=[StreamingStdOutCallbackHandler()],
+     #top_k=10,
+     #top_p=0.95,
+ )
+
+
+ model = Llama2Chat(llm=llm)
+ memory = ConversationBufferMemory(
+     llm=llm,
+     memory_key="chat_history",
+     return_messages=True,
+     output_key='answer',
+     input_key='question')
+
+
+ # Prompt context reference: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF , https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ/discussions/5#64b81e9b15ebeb44419a2b9e
+ # Reference: https://github.com/langchain-ai/langchain/issues/5462
+
+ system_message_template = """You're a Mental Health Specialist. Support those with Depressive Disorder.
+ Listen compassionately, respond helpfully. For casual talk, be friendly. For facts, use context.
+ If unsure, say, 'Out of my knowledge.' Always stay direct.
+ If you cannot find the answer in the pieces of context, just say that you don't know; don't try to make up an answer.
+ ----------------
+ {context}"""
+
+ messages = [
+     SystemMessagePromptTemplate.from_template(system_message_template),
+     HumanMessagePromptTemplate.from_template("{question}")
+ ]
+ qa_prompt = ChatPromptTemplate.from_messages(messages)
+ qa_prompt.pretty_print()
+
+ condense_question = """Given the following conversation and a follow-up message,
+ rephrase the follow-up message into a standalone question or instruction that
+ represents the user's intent precisely. Add context if necessary to produce a complete and
+ unambiguous question, based only on the follow-up input and the chat history; don't make up messages.
+ Maintain the same intent as the follow-up input message.\n
+ Chat History:
+ {chat_history}\n
+ Follow Up Input: {question}
+ Standalone question:"""
+
+ condense_question_prompt = PromptTemplate.from_template(condense_question)
+ condense_question_prompt.pretty_print()
+
+ retrieval_chain = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=retriever,
+     memory=memory,
+     return_source_documents=False,
+     verbose=True,
+     #condense_question_prompt=condense_question_prompt,
+     #chain_type="stuff",
+     combine_docs_chain_kwargs={'prompt': qa_prompt},  # https://github.com/langchain-ai/langchain/issues/6879
+ )
+
+
+ human_inputs = ['Nothing logged yet']
+ ai_responses = ['Nothing logged yet']
+
+ history = ChatMessageHistory()
+
+ def llm_generation(question: str):
+     # the 'answer' key holds the latest response from the AI
+     llm_answer = retrieval_chain.invoke({'question': question, 'chat_history': history.messages})['answer']
+     history.add_user_message(question)
+     history.add_ai_message(llm_answer)
+     return llm_answer
+
+
+
+ # Decide whether to place this in app.py
+ # or make a new post_process.py and import that into the Streamlit app
+ def extract_dialogues(text):
+     '''
+     Returns two lists, one of human and one of AI dialogues.
+     '''
+     human_dialogues = []
+     ai_dialogues = []
+     lines = text.split('\n')
+
+     # Iterate through each line
+     for line in lines:
+         # Remove leading and trailing whitespace
+         line = line.strip()
+
+         # Check if the line starts with 'Human:' or 'AI:'
+         if line.startswith('Human:'):
+             # Extract the text after 'Human:'
+             human_dialogues.append(line[len('Human:'):].strip())
+         elif line.startswith('AI:'):
+             # Extract the text after 'AI:'
+             ai_dialogues.append(line[len('AI:'):].strip())
+     return human_dialogues, ai_dialogues
+
+ def update_list():
+     global human_inputs, ai_responses
+     # assign to the globals declared above (the original bound a new local name instead)
+     human_inputs, ai_responses = extract_dialogues(memory.buffer_as_str)
+     return 'responses updated'
+
+
+ def is_depressed():
+     '''
+     Returns whether, according to the human inputs, the person is depressed.
+     '''
+     # Implement classification
+     all_user_inputs = ''.join(human_inputs)
+     from nlp_models import sentiment_class, pattern_classification, corelation_analysis
+     prediction = sentiment_class(all_user_inputs)
+     return 'Not so depressed' if prediction[0][1] > 0.5 else 'is_depressed'
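
A usage sketch for llm_generation (assumes HF_ACCESS_TOKEN is set in the environment and the RAG source files are in place; the prompts are illustrative). The second call exercises the chain's memory: the follow-up is condensed with the chat history into a standalone question before retrieval:

    from llm_generator import llm_generation

    print(llm_generation("I have been sleeping badly for weeks."))
    # resolved against the previous turn via chat_history
    print(llm_generation("What can I do about it?"))
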
nlp_models.py ADDED
@@ -0,0 +1,39 @@
+ import torch
+ from transformers import DistilBertForSequenceClassification
+ import os
+ # # Get the directory path of the current script
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
+ # model = DistilBertForSequenceClassification.from_pretrained("model.safetensors")
+
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ tokenizer = AutoTokenizer.from_pretrained("lxs1/DistilBertForSequenceClassification_6h_768dim")
+ model = AutoModelForSequenceClassification.from_pretrained("lxs1/DistilBertForSequenceClassification_6h_768dim")
+
+
+ # from transformers import DistilBertTokenizerFast
+ # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+ # Move the model to the GPU if available
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model.to(device)
+
+ def sentiment_class(summarized_text):
+     '''
+     1 = non-depressed
+     0 = depressed
+     returns, for example: array([[0.00493283, 0.9950671]], dtype=float32)
+     '''
+     # move inputs to the selected device (not a hard-coded 'cuda') so CPU-only machines work
+     inputs = tokenizer(summarized_text, padding=True, truncation=True, return_tensors='pt').to(device)
+     outputs = model(**inputs)
+
+     predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+     predictions = predictions.cpu().detach().numpy()
+     return predictions
+
+ def pattern_classification():
+     raise NotImplementedError  # TODO: implement; the original returned an undefined `result`
+
+ def corelation_analysis():
+     raise NotImplementedError  # TODO: implement; the original returned an undefined `result`
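
A sketch of how the sentiment_class output is read (the input text is illustrative): the softmax row has index 0 = depressed and index 1 = non-depressed, matching the check in llm_generator.is_depressed:

    from nlp_models import sentiment_class

    probs = sentiment_class("I feel hopeless and tired all the time.")
    # probs has shape (1, 2), e.g. array([[0.0049, 0.9951]], dtype=float32)
    label = "non-depressed" if probs[0][1] > 0.5 else "depressed"
    print(probs, label)
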
rag_pipeline.py ADDED
@@ -0,0 +1,55 @@
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.embeddings.sentence_transformer import (
+     SentenceTransformerEmbeddings,
+ )
+ import os
+ from langchain.storage import InMemoryStore
+ from langchain_community.document_loaders import TextLoader
+
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.retrievers import ParentDocumentRetriever
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+
+ # Import CSV files to the VectorDB
+ # Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
+
+ # df_mental_health = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="98_row_Mental_Health_FAQs")
+ # df_counsellor_chats = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="Counsellor_Chats")
+ # df_human_therapist = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="99_rows_Human_&_Therapist")
+
+ # Get the directory path of the current script
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+
+ loader = PyMuPDFLoader(os.path.join(script_dir, 'Data', 'pdf', 'Depression Help Guide.pdf'))
+ documents = loader.load()
+
+ # create the open-source embedding function
+ # Docs: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+ embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ # https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
+
+ parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
+
+ # This text splitter is used to create the child documents.
+ # It should create documents smaller than the parent.
+ child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+
+ # The vectorstore to use to index the child chunks
+ vectorstore = Chroma(
+     collection_name="split_parents", embedding_function=embedding_function)
+
+ # The storage layer for the parent documents
+ store = InMemoryStore()
+
+ def instantiate_rag():
+     rag_retriever = ParentDocumentRetriever(
+         vectorstore=vectorstore,
+         docstore=store,
+         child_splitter=child_splitter,
+         parent_splitter=parent_splitter,
+     )
+     rag_retriever.add_documents(documents)
+     return rag_retriever
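
A query sketch for the parent-document retriever (assumes Data/pdf/Depression Help Guide.pdf exists next to the script; the question is illustrative). Child chunks are matched in Chroma, but the larger parent chunks are returned:

    from rag_pipeline import instantiate_rag

    retriever = instantiate_rag()
    docs = retriever.get_relevant_documents("What are common symptoms of depression?")
    for doc in docs:
        print(doc.page_content[:200])
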
rag_pipeline_vectordb.py ADDED
@@ -0,0 +1,92 @@
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ from langchain.storage import InMemoryStore
+ from langchain_community.document_loaders import TextLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.retrievers import ParentDocumentRetriever
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders.csv_loader import CSVLoader
+ import chromadb
+ from chromadb.utils import embedding_functions
+ import os
+
+ # Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
+
+
+ embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ persist_directory = "Data/chroma"
+ chroma_client = chromadb.PersistentClient(path=persist_directory)
+
+
+ # https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
+ parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
+
+ # This text splitter is used to create the child documents.
+ # It should create documents smaller than the parent.
+ child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+
+ def get_file_paths_recursively(folder_path):
+     file_paths = []
+     for root, directories, files in os.walk(folder_path):
+         for file in files:
+             file_paths.append(os.path.join(root, file))
+     return file_paths
+
+ def vdb_csv_loader(file_paths):
+     for file_path in file_paths:
+         loader = CSVLoader(file_path=file_path, encoding="latin-1")
+         # params: documents, embedding function, collection name, persist path
+         db = Chroma.from_documents(documents=loader.load(), embedding=embedding_function, collection_name="mental_health_csv_collection", persist_directory=persist_directory)
+
+ ###
+ def generate_csv_vector_db() -> None:
+
+     # Get the directory path of the current script
+     #script_dir = os.path.dirname(os.path.abspath(__file__))
+     #folder_path = os.path.join(script_dir, 'Data/csv')
+     folder_path = "Data/csv"
+     file_paths = get_file_paths_recursively(folder_path)
+
+     # load all the files
+     vdb_csv_loader(file_paths)
+
+ ###
+ pdf_collection = Chroma(collection_name="mental_health_pdf_collection", embedding_function=embedding_function, persist_directory=persist_directory)
+ def vdb_pdf_loader(file_paths):
+     for file_path in file_paths:
+         loader = PyMuPDFLoader(file_path=file_path)
+         documents = loader.load()
+
+         # index each PDF as it is loaded so every file ends up in the collection
+         store = InMemoryStore()
+         rag_retriever = ParentDocumentRetriever(
+             vectorstore=pdf_collection,
+             docstore=store,
+             child_splitter=child_splitter,
+             parent_splitter=parent_splitter,
+         )
+         rag_retriever.add_documents(documents)
+
+
+ def generate_pdf_vector_db() -> None:
+
+     # Get the directory path of the current script
+     #script_dir = os.path.dirname(os.path.abspath(__file__))
+     #folder_path = os.path.join(script_dir, '/Data/pdf')
+     folder_path = "Data/pdf"
+     file_paths = get_file_paths_recursively(folder_path)
+     vdb_pdf_loader(file_paths)
+
+
+ def vectordb_load():
+     # call csv loader
+     generate_csv_vector_db()
+
+     # call PDF loader
+     generate_pdf_vector_db()
+
+ # build the vector DB when run as a script (guarded so importing this module doesn't trigger a rebuild)
+ if __name__ == "__main__":
+     vectordb_load()
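
Once vectordb_load() has run, the persisted collections can be reopened without re-ingesting; a sketch (the query string is illustrative):

    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    csv_db = Chroma(
        collection_name="mental_health_csv_collection",
        embedding_function=embedding_function,
        persist_directory="Data/chroma",
    )
    for doc in csv_db.similarity_search("how to cope with low mood", k=2):
        print(doc.page_content[:200])
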
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # create a new env
+ # conda create --name LLM_chatbot
+ # activate the env
+ # conda activate LLM_chatbot
+ # pip install -r requirements.txt
+ # if streamlit is still unrecognized, run "conda install -c conda-forge streamlit"
+ # to run the app: streamlit run app.py
+ langchain==0.1.11
+ torch==2.0.1
+ transformers==4.36.2
+ langchain-community==0.0.27
+ streamlit==1.32.2
+ ctransformers==0.2.27
+ pymupdf==1.23.26
+ sentence-transformers==2.5.1
+ chromadb==0.4.24
+ langchain_experimental
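
A quick sanity check that the pinned packages resolved in the new environment (a sketch, not part of the app):

    import chromadb, langchain, streamlit, torch, transformers

    # should print 0.1.11 2.0.1 4.36.2 1.32.2 0.4.24 per the pins above
    print(langchain.__version__, torch.__version__, transformers.__version__,
          streamlit.__version__, chromadb.__version__)
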
test_vectordb.ipynb ADDED
@@ -0,0 +1,218 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from langchain_community.document_loaders import PyMuPDFLoader\n",
+     "from langchain_community.document_loaders import TextLoader\n",
+     "from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
+     "from langchain.storage import InMemoryStore\n",
+     "from langchain_community.document_loaders import TextLoader\n",
+     "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
+     "from langchain.retrievers import ParentDocumentRetriever\n",
+     "from langchain_community.vectorstores import Chroma\n",
+     "from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
+     "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+     "import chromadb\n",
+     "from chromadb.utils import embedding_functions\n",
+     "import os\n",
+     "\n",
+     "# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/Users/kishoregajjala/anaconda3/envs/mhc_1/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n"
+      ]
+     }
+    ],
+    "source": [
+     "# create the open-source embedding function\n",
+     "huggingface_ef = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "persist_directory = \"Data/chroma\"\n",
+     "chroma_client = chromadb.PersistentClient(path=persist_directory)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever\n",
+     "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n",
+     "\n",
+     "# This text splitter is used to create the child documents.\n",
+     "# It should create documents smaller than the parent.\n",
+     "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_file_paths_recursively(folder_path):\n",
+     "    file_paths = []\n",
+     "    for root, directories, files in os.walk(folder_path):\n",
+     "        for file in files:\n",
+     "            file_paths.append(os.path.join(root, file))\n",
+     "    return file_paths\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def vdb_csv_loader(file_paths):\n",
+     "    for file_path in file_paths:\n",
+     "        loader = CSVLoader(file_path=file_path, encoding=\"latin-1\")\n",
+     "        # params: documents, embedding function, collection name, persist path\n",
+     "        db = Chroma.from_documents(documents=loader.load(), embedding=huggingface_ef, collection_name=\"mental_health_csv_collection\", persist_directory=persist_directory)\n",
+     "\n",
+     "###\n",
+     "def generate_csv_vector_db() -> None:\n",
+     "\n",
+     "    # Get the directory path of the current script\n",
+     "    #script_dir = os.path.dirname(os.path.abspath(__file__))\n",
+     "    folder_path = \"Data/csv\"\n",
+     "    file_paths = get_file_paths_recursively(folder_path)\n",
+     "\n",
+     "    # load all the files\n",
+     "    vdb_csv_loader(file_paths)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pdf_collection = Chroma(collection_name=\"mental_health_pdf_collection\", embedding_function=huggingface_ef, persist_directory=persist_directory)\n",
+     "def vdb_pdf_loader(file_paths):\n",
+     "    for file_path in file_paths:\n",
+     "        loader = PyMuPDFLoader(file_path=file_path)\n",
+     "        documents = loader.load()\n",
+     "\n",
+     "        store = InMemoryStore()\n",
+     "        rag_retriever = ParentDocumentRetriever(\n",
+     "            vectorstore=pdf_collection,\n",
+     "            docstore=store,\n",
+     "            child_splitter=child_splitter,\n",
+     "            parent_splitter=parent_splitter,\n",
+     "        )\n",
+     "        rag_retriever.add_documents(documents)\n",
+     "\n",
+     "\n",
+     "def generate_pdf_vector_db() -> None:\n",
+     "\n",
+     "    # Get the directory path of the current script\n",
+     "    #script_dir = os.path.dirname(os.path.abspath(__file__))\n",
+     "    folder_path = \"Data/pdf\"  #os.path.join(script_dir, '/Data/pdf')\n",
+     "    file_paths = get_file_paths_recursively(folder_path)\n",
+     "    vdb_pdf_loader(file_paths)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# call PDF loader\n",
+     "generate_pdf_vector_db()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# call csv loader\n",
+     "generate_csv_vector_db()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def vectordb_load():\n",
+     "    # call csv loader\n",
+     "    generate_csv_vector_db()\n",
+     "\n",
+     "    # call PDF loader\n",
+     "    generate_pdf_vector_db()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# call vector db load\n",
+     "vectordb_load()\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "mhc_1",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.8"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
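
After the notebook has run, the persisted store can be inspected directly with the chromadb client; a sketch assuming the default Data/chroma path:

    import chromadb

    client = chromadb.PersistentClient(path="Data/chroma")
    for collection in client.list_collections():
        # expect mental_health_csv_collection and mental_health_pdf_collection
        print(collection.name, collection.count())
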