kishoregajjala committed on
Commit
db7706f
1 Parent(s): 30dbaa8

Upload 8 files

Files changed (8)
  1. README.md +2 -13
  2. app.py +39 -0
  3. llm_generator.py +161 -0
  4. nlp_models.py +39 -0
  5. rag_pipeline.py +55 -0
  6. rag_pipeline_vectordb.py +92 -0
  7. requirements.txt +17 -0
  8. test_vectordb.ipynb +218 -0
README.md CHANGED
@@ -1,13 +1,2 @@
- ---
- title: Mental Health Chatbot
- emoji: 🐠
- colorFrom: purple
- colorTo: blue
- sdk: streamlit
- sdk_version: 1.32.2
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Mental_Health_Chatbot_Integrated
+ Mental_Health_Chatbot_Integrated
app.py ADDED
@@ -0,0 +1,39 @@
+ import streamlit as st
+ import llm_generator
+ from llm_generator import llm_generation
+
+ import time
+
+ # ST: https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
+
+ st.title('Mental Health Therapist')
+
+ def response_generator(response):
+     '''
+     Streams the response text with a typewriter effect.
+     '''
+     response_buffer = response.strip()
+     for word in response_buffer.split():
+         yield word + " "
+         time.sleep(0.1)
+
+ # Initialize chat history
+ if "messages" not in st.session_state:
+     st.session_state.messages = []
+
+ for message in st.session_state.messages:
+     with st.chat_message(message["role"]):
+         st.markdown(message["content"])
+
+ # Accept user input
+ if user_prompt := st.chat_input("Hello, how are you doing today?"):
+     st.session_state.messages.append({"role": "user", "content": user_prompt})
+     with st.chat_message("user"):
+         st.markdown(user_prompt)
+
+     with st.chat_message("assistant"):
+         response = llm_generation(user_prompt)
+         time.sleep(1)
+         st.write_stream(response_generator(response))
+
+     st.session_state.messages.append({"role": "assistant", "content": response})
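
A minimal standalone sketch of the typewriter pattern used above (standard library only; the sample text is illustrative). The app itself is launched with streamlit run app.py:

    import time

    def response_generator(response: str):
        # yield one word at a time so the caller can render it incrementally
        for word in response.strip().split():
            yield word + " "
            time.sleep(0.1)

    for chunk in response_generator("Hello, I'm here to listen."):
        print(chunk, end="", flush=True)
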
llm_generator.py ADDED
@@ -0,0 +1,161 @@
+ from langchain.prompts import PromptTemplate
+ from langchain_community.llms import HuggingFaceEndpoint
+ from langchain.vectorstores import Chroma
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ import os
+ from langchain.prompts.chat import (
+     ChatPromptTemplate,
+     HumanMessagePromptTemplate,
+     SystemMessagePromptTemplate,
+ )  # Docs: https://python.langchain.com/docs/modules/model_io/prompts/message_prompts
+
+ #import chromadb
+
+ # LLM Generator
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
+
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ChatMessageHistory, ConversationSummaryBufferMemory, ConversationBufferMemory
+
+ from langchain_experimental.chat_models import Llama2Chat
+ # Docs: https://python.langchain.com/docs/integrations/chat/llama2_chat
+
+
+ # Read the token from the environment instead of referencing an undefined name
+ HUGGINGFACEHUB_API_TOKEN = os.environ.get("HF_ACCESS_TOKEN")
+ #os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN
+
+ # TODO: implement another function to pass an array of PDFs / CSVs / Excels
+ from rag_pipeline import instantiate_rag
+ retriever = instantiate_rag()
+
+ #persist_directory="Data/chroma"
+ #chroma_client = chromadb.PersistentClient(persist_directory=persist_directory)
+ #embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+ #vectors = Chroma(persist_directory = persist_directory, embedding_function = embedding_function)
+ #retriever = vectors.as_retriever() #(k=6)
+
+
+ # Set the URL of your Inference Endpoint below
+ #your_endpoint_url = "https://fayjubiy2xqn36z0.us-east-1.aws.endpoints.huggingface.cloud"
+
+ # How to access the HuggingFaceEndpoint integration of the free Serverless Endpoints API
+ repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+ llm = HuggingFaceEndpoint(
+     #endpoint_url=f"{your_endpoint_url}",
+     repo_id=repo_id,
+     #max_length=128,
+     max_new_tokens=512,
+     token=HUGGINGFACEHUB_API_TOKEN,
+     temperature=0.1,
+     repetition_penalty=1.1,
+     #context_length: 4096, # set to max for chat summary; Llama-2 has a max context length of 4096
+     stream=True,
+     callbacks=[StreamingStdOutCallbackHandler()],
+     #top_k=10,
+     #top_p=0.95,
+ )
+
+
+ model = Llama2Chat(llm=llm)
+ memory = ConversationBufferMemory(
+     llm=llm,
+     memory_key="chat_history",
+     return_messages=True,
+     output_key='answer',
+     input_key='question')
+
+
+ # Prompt context reference: https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF , https://huggingface.co/TheBloke/Llama-2-13B-chat-GPTQ/discussions/5#64b81e9b15ebeb44419a2b9e
+ # Reference: https://github.com/langchain-ai/langchain/issues/5462
+
+ system_message_template = """You're a Mental Health Specialist. Support those with Depressive Disorder.
+ Listen compassionately, respond helpfully. For casual talk, be friendly. For facts, use context.
+ If unsure, say, 'Out of my knowledge.' Always stay direct.
+ If you cannot find the answer in the pieces of context, just say that you don't know; don't try to make up an answer.
+ ----------------
+ {context}"""
+
+ messages = [
+     SystemMessagePromptTemplate.from_template(system_message_template),
+     HumanMessagePromptTemplate.from_template("{question}")
+ ]
+ qa_prompt = ChatPromptTemplate.from_messages(messages)
+ qa_prompt.pretty_print()
+
+ condense_question = """Given the following conversation and a follow-up message,
+ rephrase the follow-up message into a standalone question or instruction that
+ represents the user's intent precisely. Add context if necessary to produce a complete and
+ unambiguous question, based only on the follow-up input and the chat history; don't make up messages.
+ Maintain the same intent as the follow-up input message.\n
+ Chat History:
+ {chat_history}\n
+ Follow Up Input: {question}
+ Standalone question:"""
+
+ condense_question_prompt = PromptTemplate.from_template(condense_question)
+ condense_question_prompt.pretty_print()
+
+ retrieval_chain = ConversationalRetrievalChain.from_llm(
+     llm=llm,
+     retriever=retriever,
+     memory=memory,
+     return_source_documents=False,
+     verbose=True,
+     #condense_question_prompt=condense_question_prompt,
+     #chain_type="stuff",
+     combine_docs_chain_kwargs={'prompt': qa_prompt},  # https://github.com/langchain-ai/langchain/issues/6879
+ )
+
+
+ human_inputs = ['Nothing logged yet']
+ ai_responses = ['Nothing logged yet']
+
+ history = ChatMessageHistory()
+
+ def llm_generation(question: str):
+     # the 'answer' key holds the latest response from the AI
+     llm_answer = retrieval_chain.invoke({'question': question, 'chat_history': history.messages})['answer']
+     history.add_user_message(question)
+     history.add_ai_message(llm_answer)
+     return llm_answer
+
+
+
+ # Decide whether to place this in app.py
+ # or make a new post_process.py and import that into the Streamlit app
+ def extract_dialogues(text):
+     '''
+     Returns two lists, one of human and one of AI dialogues.
+     '''
+     human_dialogues = []
+     ai_dialogues = []
+     lines = text.split('\n')
+
+     # Iterate through each line
+     for line in lines:
+         # Remove leading and trailing whitespace
+         line = line.strip()
+
+         # Check if the line starts with 'Human:' or 'AI:'
+         if line.startswith('Human:'):
+             # Extract the text after 'Human:'
+             human_dialogues.append(line[len('Human:'):].strip())
+         elif line.startswith('AI:'):
+             # Extract the text after 'AI:'
+             ai_dialogues.append(line[len('AI:'):].strip())
+     return human_dialogues, ai_dialogues
+
+ def update_list():
+     global human_inputs, ai_responses
+     # assign to the globals declared above (the original bound a new local name instead)
+     human_inputs, ai_responses = extract_dialogues(memory.buffer_as_str)
+     return 'responses updated'
+
+
+ def is_depressed():
+     '''
+     Returns whether, according to the human inputs, the person is depressed.
+     '''
+     # Implement classification
+     all_user_inputs = ''.join(human_inputs)
+     from nlp_models import sentiment_class, pattern_classification, corelation_analysis
+     prediction = sentiment_class(all_user_inputs)
+     return 'Not so depressed' if prediction[0][1] > 0.5 else 'is_depressed'
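
A usage sketch for llm_generation (assumes HF_ACCESS_TOKEN is set in the environment and the RAG source files are in place; the prompts are illustrative). The second call exercises the chain's memory: the follow-up is condensed with the chat history into a standalone question before retrieval:

    from llm_generator import llm_generation

    print(llm_generation("I have been sleeping badly for weeks."))
    # resolved against the previous turn via chat_history
    print(llm_generation("What can I do about it?"))
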
nlp_models.py ADDED
@@ -0,0 +1,39 @@
+ import torch
+ from transformers import DistilBertForSequenceClassification
+ import os
+ # # Get the directory path of the current script
+ # script_dir = os.path.dirname(os.path.abspath(__file__))
+ # model = DistilBertForSequenceClassification.from_pretrained("model.safetensors")
+
+ # Load model directly
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ tokenizer = AutoTokenizer.from_pretrained("lxs1/DistilBertForSequenceClassification_6h_768dim")
+ model = AutoModelForSequenceClassification.from_pretrained("lxs1/DistilBertForSequenceClassification_6h_768dim")
+
+
+ # from transformers import DistilBertTokenizerFast
+ # tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+ # Move the model to the GPU if available
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ model.to(device)
+
+ def sentiment_class(summarized_text):
+     '''
+     1 = non-depressed
+     0 = depressed
+     returns, for example: array([[0.00493283, 0.9950671]], dtype=float32)
+     '''
+     # move inputs to the selected device (not a hard-coded 'cuda') so CPU-only machines work
+     inputs = tokenizer(summarized_text, padding=True, truncation=True, return_tensors='pt').to(device)
+     outputs = model(**inputs)
+
+     predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+     predictions = predictions.cpu().detach().numpy()
+     return predictions
+
+ def pattern_classification():
+     raise NotImplementedError  # TODO: implement; the original returned an undefined `result`
+
+ def corelation_analysis():
+     raise NotImplementedError  # TODO: implement; the original returned an undefined `result`
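
A sketch of how the sentiment_class output is read (the input text is illustrative): the softmax row has index 0 = depressed and index 1 = non-depressed, matching the check in llm_generator.is_depressed:

    from nlp_models import sentiment_class

    probs = sentiment_class("I feel hopeless and tired all the time.")
    # probs has shape (1, 2), e.g. array([[0.0049, 0.9951]], dtype=float32)
    label = "non-depressed" if probs[0][1] > 0.5 else "depressed"
    print(probs, label)
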
rag_pipeline.py ADDED
@@ -0,0 +1,55 @@
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.embeddings.sentence_transformer import (
+     SentenceTransformerEmbeddings,
+ )
+ import os
+ from langchain.storage import InMemoryStore
+ from langchain_community.document_loaders import TextLoader
+
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.retrievers import ParentDocumentRetriever
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+
+ # Import CSV files to the VectorDB
+ # Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
+
+ # df_mental_health = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="98_row_Mental_Health_FAQs")
+ # df_counsellor_chats = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="Counsellor_Chats")
+ # df_human_therapist = pd.read_excel("/content/drive/MyDrive/Team 5/Depression_dataset_preprocessed (1).xlsx", sheet_name="99_rows_Human_&_Therapist")
+
+ # Get the directory path of the current script
+ script_dir = os.path.dirname(os.path.abspath(__file__))
+
+ loader = PyMuPDFLoader(os.path.join(script_dir, 'Data', 'pdf', 'Depression Help Guide.pdf'))
+ documents = loader.load()
+
+ # create the open-source embedding function
+ # Docs: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+ embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ # https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
+
+ parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
+
+ # This text splitter is used to create the child documents.
+ # It should create documents smaller than the parent.
+ child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+
+ # The vectorstore to use to index the child chunks
+ vectorstore = Chroma(
+     collection_name="split_parents", embedding_function=embedding_function)
+
+ # The storage layer for the parent documents
+ store = InMemoryStore()
+
+ def instantiate_rag():
+     rag_retriever = ParentDocumentRetriever(
+         vectorstore=vectorstore,
+         docstore=store,
+         child_splitter=child_splitter,
+         parent_splitter=parent_splitter,
+     )
+     rag_retriever.add_documents(documents)
+     return rag_retriever
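
A query sketch for the parent-document retriever (assumes Data/pdf/Depression Help Guide.pdf exists next to the script; the question is illustrative). Child chunks are matched in Chroma, but the larger parent chunks are returned:

    from rag_pipeline import instantiate_rag

    retriever = instantiate_rag()
    docs = retriever.get_relevant_documents("What are common symptoms of depression?")
    for doc in docs:
        print(doc.page_content[:200])
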
rag_pipeline_vectordb.py ADDED
@@ -0,0 +1,92 @@
+ from langchain_community.document_loaders import PyMuPDFLoader
+ from langchain_community.document_loaders import TextLoader
+ from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ from langchain.storage import InMemoryStore
+ from langchain_community.document_loaders import TextLoader
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from langchain.retrievers import ParentDocumentRetriever
+ from langchain_community.vectorstores import Chroma
+ from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders.csv_loader import CSVLoader
+ import chromadb
+ from chromadb.utils import embedding_functions
+ import os
+
+ # Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0
+
+
+ embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+
+ persist_directory = "Data/chroma"
+ chroma_client = chromadb.PersistentClient(path=persist_directory)
+
+
+ # https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever
+ parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
+
+ # This text splitter is used to create the child documents.
+ # It should create documents smaller than the parent.
+ child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
+
+ def get_file_paths_recursively(folder_path):
+     file_paths = []
+     for root, directories, files in os.walk(folder_path):
+         for file in files:
+             file_paths.append(os.path.join(root, file))
+     return file_paths
+
+ def vdb_csv_loader(file_paths):
+     for file_path in file_paths:
+         loader = CSVLoader(file_path=file_path, encoding="latin-1")
+         # params: documents, embedding function, collection name, persist path
+         db = Chroma.from_documents(documents=loader.load(), embedding=embedding_function, collection_name="mental_health_csv_collection", persist_directory=persist_directory)
+
+ ###
+ def generate_csv_vector_db() -> None:
+
+     # Get the directory path of the current script
+     #script_dir = os.path.dirname(os.path.abspath(__file__))
+     #folder_path = os.path.join(script_dir, 'Data/csv')
+     folder_path = "Data/csv"
+     file_paths = get_file_paths_recursively(folder_path)
+
+     # load all the files
+     vdb_csv_loader(file_paths)
+
+ ###
+ pdf_collection = Chroma(collection_name="mental_health_pdf_collection", embedding_function=embedding_function, persist_directory=persist_directory)
+ def vdb_pdf_loader(file_paths):
+     for file_path in file_paths:
+         loader = PyMuPDFLoader(file_path=file_path)
+         documents = loader.load()
+
+         # index each PDF as it is loaded so every file ends up in the collection
+         store = InMemoryStore()
+         rag_retriever = ParentDocumentRetriever(
+             vectorstore=pdf_collection,
+             docstore=store,
+             child_splitter=child_splitter,
+             parent_splitter=parent_splitter,
+         )
+         rag_retriever.add_documents(documents)
+
+
+ def generate_pdf_vector_db() -> None:
+
+     # Get the directory path of the current script
+     #script_dir = os.path.dirname(os.path.abspath(__file__))
+     #folder_path = os.path.join(script_dir, '/Data/pdf')
+     folder_path = "Data/pdf"
+     file_paths = get_file_paths_recursively(folder_path)
+     vdb_pdf_loader(file_paths)
+
+
+ def vectordb_load():
+     # call csv loader
+     generate_csv_vector_db()
+
+     # call PDF loader
+     generate_pdf_vector_db()
+
+ # build the vector DB when run as a script (guarded so importing this module doesn't trigger a rebuild)
+ if __name__ == "__main__":
+     vectordb_load()
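
Once vectordb_load() has run, the persisted collections can be reopened without re-ingesting; a sketch (the query string is illustrative):

    from langchain_community.vectorstores import Chroma
    from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    csv_db = Chroma(
        collection_name="mental_health_csv_collection",
        embedding_function=embedding_function,
        persist_directory="Data/chroma",
    )
    for doc in csv_db.similarity_search("how to cope with low mood", k=2):
        print(doc.page_content[:200])
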
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # create a new env
+ # conda create --name LLM_chatbot
+ # activate the env
+ # conda activate LLM_chatbot
+ # pip install -r requirements.txt
+ # if streamlit is still unrecognized, run "conda install -c conda-forge streamlit"
+ # to run the app: streamlit run app.py
+ langchain==0.1.11
+ torch==2.0.1
+ transformers==4.36.2
+ langchain-community==0.0.27
+ streamlit==1.32.2
+ ctransformers==0.2.27
+ pymupdf==1.23.26
+ sentence-transformers==2.5.1
+ chromadb==0.4.24
+ langchain_experimental
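
A quick sanity check that the pinned packages resolved in the new environment (a sketch, not part of the app):

    import chromadb, langchain, streamlit, torch, transformers

    # should print 0.1.11 2.0.1 4.36.2 1.32.2 0.4.24 per the pins above
    print(langchain.__version__, torch.__version__, transformers.__version__,
          streamlit.__version__, chromadb.__version__)
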
test_vectordb.ipynb ADDED
@@ -0,0 +1,218 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from langchain_community.document_loaders import PyMuPDFLoader\n",
+     "from langchain_community.document_loaders import TextLoader\n",
+     "from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
+     "from langchain.storage import InMemoryStore\n",
+     "from langchain_community.document_loaders import TextLoader\n",
+     "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
+     "from langchain.retrievers import ParentDocumentRetriever\n",
+     "from langchain_community.vectorstores import Chroma\n",
+     "from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter\n",
+     "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+     "import chromadb\n",
+     "from chromadb.utils import embedding_functions\n",
+     "import os\n",
+     "\n",
+     "# Reference : https://towardsdatascience.com/rag-how-to-talk-to-your-data-eaf5469b83b0\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "/Users/kishoregajjala/anaconda3/envs/mhc_1/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+       "  from .autonotebook import tqdm as notebook_tqdm\n"
+      ]
+     }
+    ],
+    "source": [
+     "# create the open-source embedding function\n",
+     "huggingface_ef = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "persist_directory = \"Data/chroma\"\n",
+     "chroma_client = chromadb.PersistentClient(path=persist_directory)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# https://python.langchain.com/docs/modules/data_connection/retrievers/parent_document_retriever\n",
+     "parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)\n",
+     "\n",
+     "# This text splitter is used to create the child documents.\n",
+     "# It should create documents smaller than the parent.\n",
+     "child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_file_paths_recursively(folder_path):\n",
+     "    file_paths = []\n",
+     "    for root, directories, files in os.walk(folder_path):\n",
+     "        for file in files:\n",
+     "            file_paths.append(os.path.join(root, file))\n",
+     "    return file_paths\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def vdb_csv_loader(file_paths):\n",
+     "    for file_path in file_paths:\n",
+     "        loader = CSVLoader(file_path=file_path, encoding=\"latin-1\")\n",
+     "        # params: documents, embedding function, collection name, persist path\n",
+     "        db = Chroma.from_documents(documents=loader.load(), embedding=huggingface_ef, collection_name=\"mental_health_csv_collection\", persist_directory=persist_directory)\n",
+     "\n",
+     "###\n",
+     "def generate_csv_vector_db() -> None:\n",
+     "\n",
+     "    # Get the directory path of the current script\n",
+     "    #script_dir = os.path.dirname(os.path.abspath(__file__))\n",
+     "    folder_path = \"Data/csv\"\n",
+     "    file_paths = get_file_paths_recursively(folder_path)\n",
+     "\n",
+     "    # load all the files\n",
+     "    vdb_csv_loader(file_paths)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "pdf_collection = Chroma(collection_name=\"mental_health_pdf_collection\", embedding_function=huggingface_ef, persist_directory=persist_directory)\n",
+     "def vdb_pdf_loader(file_paths):\n",
+     "    for file_path in file_paths:\n",
+     "        loader = PyMuPDFLoader(file_path=file_path)\n",
+     "        documents = loader.load()\n",
+     "\n",
+     "        store = InMemoryStore()\n",
+     "        rag_retriever = ParentDocumentRetriever(\n",
+     "            vectorstore=pdf_collection,\n",
+     "            docstore=store,\n",
+     "            child_splitter=child_splitter,\n",
+     "            parent_splitter=parent_splitter,\n",
+     "        )\n",
+     "        rag_retriever.add_documents(documents)\n",
+     "\n",
+     "\n",
+     "def generate_pdf_vector_db() -> None:\n",
+     "\n",
+     "    # Get the directory path of the current script\n",
+     "    #script_dir = os.path.dirname(os.path.abspath(__file__))\n",
+     "    folder_path = \"Data/pdf\"  #os.path.join(script_dir, '/Data/pdf')\n",
+     "    file_paths = get_file_paths_recursively(folder_path)\n",
+     "    vdb_pdf_loader(file_paths)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# call PDF loader\n",
+     "generate_pdf_vector_db()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# call csv loader\n",
+     "generate_csv_vector_db()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def vectordb_load():\n",
+     "    # call csv loader\n",
+     "    generate_csv_vector_db()\n",
+     "\n",
+     "    # call PDF loader\n",
+     "    generate_pdf_vector_db()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# call vector db load\n",
+     "vectordb_load()\n"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "mhc_1",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.8"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2
+ }
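
After the notebook has run, the persisted store can be inspected directly with the chromadb client; a sketch assuming the default Data/chroma path:

    import chromadb

    client = chromadb.PersistentClient(path="Data/chroma")
    for collection in client.list_collections():
        # expect mental_health_csv_collection and mental_health_pdf_collection
        print(collection.name, collection.count())
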