Syed Junaid Iqbal committed on
Commit 5103cb0 • 1 Parent(s): d04f7ff

Update app.py

Files changed (1)
  1. app.py  +154 -143
app.py CHANGED
@@ -2,47 +2,71 @@ import subprocess
 
 import streamlit as st
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain.vectorstores import FAISS
 from langchain.embeddings import FastEmbedEmbeddings # General embeddings from HuggingFace models.
 from langchain.memory import ConversationBufferMemory
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from htmlTemplates import css, bot_template, user_template
-from langchain.llms import LlamaCpp # For loading transformer models.
 from langchain.document_loaders import PyPDFLoader, TextLoader, CSVLoader
 from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain import hub
 import os
 import glob
 import shutil

-# os.environ['FAISS_NO_AVX2'] = '1'
-# os.environ["TOKENIZERS_PARALLELISM"] = "false"
-
-
-def load_document_text():
-    """
-    input : path to the document
-    output: list of loaded document
     """

-    documents = []

-    for dox in os.listdir(path= "./documents/"):
-        dir = os.path.join("./documents/", dox)

-        if dox.endswith(".pdf"):
-            documents.extend( PyPDFLoader(dir).load() )

-        elif dox.endswith(".txt"):
-            documents.extend( TextLoader(dir).load() )

-        elif dox.endswith(".csv"):
-            documents.extend( CSVLoader(dir).load() )

-    return documents


 def get_text_chunks(documents):
     """
     For the compute purpose we will split the document into multiple smaller chunks.
@@ -50,51 +74,58 @@ def get_text_chunks(documents):
     IMPORTANT : If the chunks too small we will miss the context and if its too large we will have longer compute time
     """
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size= 400,
-        chunk_overlap=50,
     )

-    st.session_state.text_chunks = text_splitter.split_documents(documents)


-def get_vectorstore():
     """
-    given the chunks, we will embed them into vector stores
     """

-    if len(glob.glob("./vectordb/*.faiss")) == 0:
-        st.session_state.vectorstore = FAISS.from_documents(documents= st.session_state.text_chunks,
-                                                            embedding= st.session_state.embeddings)
-        # save the file
-        st.session_state.vectorstore.save_local("./vectordb")
-    else:
-        st.session_state.vectorstore = FAISS.load_local("./vectordb/",
-                                                        st.session_state.embeddings)
-

-def get_conversation_chain():
     """
     This is a langchain model where we will be binding the runner to infer data from LLM
     """
     model_path = st.session_state.model
     callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

-    llm = LlamaCpp(model_path= model_path,
-                   n_ctx=4000,
-                   max_tokens= 4000,
-                   n_gpu_layers = 40,
-                   callback_manager = callback_manager,
-                   verbose=True)
-
-    memory = ConversationBufferMemory(
-        memory_key='chat_history', return_messages=False)

     prompt_template = """You are a personal HR Bot assistant for answering any questions about Companies policies
     You are given a question and a set of documents.
     If the user's question requires you to provide specific information from the documents, give your answer based only on the examples provided below. DON'T generate an answer that is NOT written in the provided examples.
     If you don't find the answer to the user's question with the examples provided to you below, answer that you didn't find the answer in the documentation and propose him to rephrase his query with more details.
-    Use bullet points if you have to make a list, only if necessary.

     QUESTION: {question}

@@ -107,11 +138,12 @@ def get_conversation_chain():
     rag_prompt_custom = PromptTemplate.from_template(prompt_template)

     prompt = hub.pull("rlm/rag-prompt-mistral")

     conversation_chain = RetrievalQA.from_chain_type(
         llm,
-        retriever= st.session_state.vectorstore.as_retriever(),
         chain_type_kwargs={"prompt": prompt},
     )
     conversation_chain.callback_manager = callback_manager
@@ -119,7 +151,7 @@ def get_conversation_chain():
     return conversation_chain

-
 def handle_userinput():

     clear = False
@@ -129,77 +161,64 @@ def handle_userinput():
         clear = True
         st.session_state.messages = []

     if "messages" not in st.session_state:
-        st.session_state.messages = [{"role": "assistant", "content": "How can I help you?"}]

-    for msg in st.session_state.messages:
-        st.chat_message(msg["role"]).write(msg["content"])

-    if prompt := st.chat_input():
-        st.session_state.messages.append({"role": "user", "content": prompt})
-        st.chat_message("user").write(prompt)
-        if clear:
-            st.session_state.conversation.clean()

-        msg = st.session_state.conversation.run(prompt)
-        print(msg)
-        st.session_state.messages.append({"role": "assistant", "content": msg})
-        st.chat_message("assistant").write(msg)

-
-# Function to apply rounded edges using CSS
-def add_rounded_edges(image_path="./randstad_featuredimage.png", radius=30):
-    st.markdown(
-        f'<style>.rounded-img{{border-radius: {radius}px; overflow: hidden;}}</style>',
-        unsafe_allow_html=True,)
-    st.image(image_path, use_column_width=True, output_format='auto')
-
-
-# Delete directory content
-def delete_file(directory_path):
-
-    # Check if the directory exists
-    if os.path.exists(directory_path) and len(os.listdir(directory_path)) > 0:
-        # Iterate over all files in the directory and remove them
-        for filename in os.listdir(directory_path):
-            file_path = os.path.join(directory_path, filename)
-            try:
-                if os.path.isfile(file_path) or os.path.islink(file_path):
-                    os.unlink(file_path)
-                elif os.path.isdir(file_path):
-                    shutil.rmtree(file_path)
-            except Exception as e:
-                print(f"Error deleting {file_path}: {e}")
-    else:
-        print(f"The directory {directory_path} does not exist.")


-def save_uploaded_file(uploaded_file):
-    save_directory = "./documents/"
-    file_path = os.path.join(save_directory, uploaded_file.name)
-    with open(file_path, "wb") as f:
-        f.write(uploaded_file.getvalue())
-    return file_path


-def load_dependencies():
-    # append documents to a list
-    doc_list = load_document_text()

-    # get the text chunks
-    get_text_chunks(doc_list)

-    # create vector store
-    get_vectorstore()

-    # create conversation chain
-    st.session_state.conversation = get_conversation_chain()


 def main():
-
-    st.set_page_config(page_title="Randstad Chad Bot",
                        page_icon=":books:")
     st.write(css, unsafe_allow_html=True)
@@ -213,71 +232,65 @@ def main():
     st.subheader("🚀 A HR powered by Generative AI")

     # default model
-    st.session_state.model = "./models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

-    # Embedding Model
-    st.session_state.embeddings = FastEmbedEmbeddings( model_name= "BAAI/bge-base-en-v1.5",
-                                                       cache_dir="./embedding_model/")

     with st.sidebar:

         # calling a
         add_rounded_edges()

         st.subheader("Select Your Embedding Model Model")
-        st.session_state.model = st.selectbox( 'Models', tuple( glob.glob('./models/*.gguf') ) )

         st.subheader("Your documents")
-
-        # Space to Upload a Document
         docs = st.file_uploader(
             "Upload File (pdf,text,csv...) and click 'Process'", accept_multiple_files=True)
-
-        # Define a process button
         if st.button("Process"):
-
-            # delete the old embeddings
-            delete_file(directory_path= './vectordb/')
-
-            # delete old documents
-            delete_file(directory_path="./documents/")
-
-            # then Embedd new documents
             with st.spinner("Processing"):
-
-
-                # iterate over updated files and save them to the local directory (i.e. "Documents") using a helper function
-                for file in docs:
-                    save_uploaded_file(file)
-

                 # using the helper function below lets load our dependencies
                 # Step 1 : Load the documents
-                # Step 2 : Break them into Chunks
-                # Step 3 : Create Embeddings and save them to Vector DB
-                # Step 4 : Get our conversation chain

-                load_dependencies()

-    # Load our model
-    if len(glob.glob("./vectordb/*.faiss")) == 0:
-        load_dependencies()
-        get_vectorstore()
-    else:
-        get_vectorstore()
-    st.session_state.conversation = get_conversation_chain()

-    handle_userinput()
-
-    # # load dependencies -> chaunks of documents -> Embeddings -> Inference
-    # load_dependencies()


 if __name__ == '__main__':
-
     command = 'CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir'

     # Run the command using subprocess
@@ -286,6 +299,4 @@ if __name__ == '__main__':
         print("Command executed successfully.")
     except subprocess.CalledProcessError as e:
         print(f"Error: {e}")
-
-    # Run the apps
     main()
 
 
 import streamlit as st
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import Chroma, FAISS
 from langchain.embeddings import FastEmbedEmbeddings # General embeddings from HuggingFace models.
 from langchain.memory import ConversationBufferMemory
 from langchain.callbacks.manager import CallbackManager
+from langchain.callbacks import StreamlitCallbackHandler
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from htmlTemplates import css, bot_template, user_template
+from langchain.llms import LlamaCpp, OpenAI, GooglePalm # For loading transformer models.
 from langchain.document_loaders import PyPDFLoader, TextLoader, CSVLoader
 from langchain.chains import RetrievalQA
 from langchain.prompts import PromptTemplate
 from langchain import hub
+import tempfile
 import os
 import glob
 import shutil
+import time

+# TEXT LOADERS
+def get_pdf_text(pdf_docs):
     """
+    Purpose: A hypothetical loader for PDF files in Python.
+    Usage: Used to extract text or other information from PDF documents.
+    Load Function: A load_pdf function might be used to read and extract data from a PDF file.

+    input : pdf document path
+    returns : extracted text
+    """
+    temp_dir = tempfile.TemporaryDirectory()
+    temp_filepath = os.path.join(temp_dir.name, pdf_docs.name)

+    with open(temp_filepath, "wb") as f:
+        f.write(pdf_docs.getvalue())

+    pdf_loader = PyPDFLoader(temp_filepath)
+    pdf_doc = pdf_loader.load()
+    return pdf_doc


+def get_text_file(text_docs):
+    """
+    """
+    temp_dir = tempfile.TemporaryDirectory()
+    temp_filepath = os.path.join(temp_dir.name, text_docs.name)
+
+    with open(temp_filepath, "wb") as f:
+        f.write(text_docs.getvalue())
+
+    text_loader = TextLoader(temp_filepath)
+    text_doc = text_loader.load()
+    return text_doc
+
+def get_csv_file(csv_docs):
+    temp_dir = tempfile.TemporaryDirectory()
+    temp_filepath = os.path.join(temp_dir.name, csv_docs.name)
+
+    with open(temp_filepath, "wb") as f:
+        f.write(csv_docs.getvalue())
+
+    csv_loader = CSVLoader(temp_filepath)
+    csv_doc = csv_loader.load()
+    return csv_doc


+# Break the documents into chunks
 def get_text_chunks(documents):
     """
     For the compute purpose we will split the document into multiple smaller chunks.

     IMPORTANT : If the chunks too small we will miss the context and if its too large we will have longer compute time
     """
     text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size= 1000,
+        chunk_overlap=200,
     )

+    text_chunks = text_splitter.split_documents(documents)
+
+    return text_chunks


+# Save chunks to vector store
+def get_vectorstore(text_chunks):
     """
+    Load our vectors into chroma DB, Googles Vector Store
     """
+    vectorstore = Chroma.from_documents(documents= text_chunks,
+                                        embedding= st.session_state.embeddings,
+                                        persist_directory= "./vectordb/")

+    return vectorstore
+

+# Bind the Vector DB, Large Language models and Embedding models all into one container
+def get_conversation_chain(vectorstore):
     """
     This is a langchain model where we will be binding the runner to infer data from LLM
     """
     model_path = st.session_state.model
     callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

+    if st.session_state.model == "Google_PaLm" :
+        llm = GooglePalm(google_api_key = "AIzaSyAIo7rw6iJPWpkpOXSJk6BnuOKNSaB5muM",
+                         max_output_tokens = 4000,
+                         callback_manager=callback_manager)

+    elif st.session_state.model == "Open_AIGPT-3.5-Turbo":
+        llm = OpenAI(api_key = "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",
+                     callback_manager = callback_manager,
+                     max_tokens= 4000 )
+
+    else:
+        llm = LlamaCpp(model_path= model_path,
+                       n_ctx= 4000,
+                       max_tokens= 4000,
+                       f16_kv = True,
+                       callback_manager = callback_manager,
+                       verbose=True)

     prompt_template = """You are a personal HR Bot assistant for answering any questions about Companies policies
     You are given a question and a set of documents.
     If the user's question requires you to provide specific information from the documents, give your answer based only on the examples provided below. DON'T generate an answer that is NOT written in the provided examples.
     If you don't find the answer to the user's question with the examples provided to you below, answer that you didn't find the answer in the documentation and propose him to rephrase his query with more details.
+    Use bullet points if you have to make a list, only if necessary. Use 'DOCUMENTS' as a reference point, to understand and give a concise output in 3 or 5 sentences.

     QUESTION: {question}

     rag_prompt_custom = PromptTemplate.from_template(prompt_template)

+    # prompt = hub.pull("rlm/rag-prompt")
     prompt = hub.pull("rlm/rag-prompt-mistral")

     conversation_chain = RetrievalQA.from_chain_type(
         llm,
+        retriever= vectorstore.as_retriever(),
         chain_type_kwargs={"prompt": prompt},
     )
     conversation_chain.callback_manager = callback_manager

     return conversation_chain

+# a Streamlit interface to handle and save our chats
 def handle_userinput():

     clear = False

         clear = True
         st.session_state.messages = []

+    # initialise our Streamlit chat interface
     if "messages" not in st.session_state:
+        st.session_state.messages = [{"role": "assistant", "content": "How can I help you?"}]

+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])

+    # Clear the conversation memory
+    if clear:
+        st.session_state.conversation.memory.clear()
+        clear = False

+    if prompt := st.chat_input():

+        with st.chat_message("user"):
+            st.markdown(prompt)
+
+        # add user question to chat history
+        st.session_state.messages.append( {"role": "user", "content": prompt})

+        with st.chat_message("assistant"):
+            # set up a call back handler
+            st_callback = StreamlitCallbackHandler(st.container())
+            message_holder = st.empty()
+            full_response = ""

+            # streamlit call back manager
+            st.session_state.conversation.callback_manager = st_callback
+            msg = st.session_state.conversation.run(prompt)
+            #st.markdown(msg)
+            for chunk in msg.split():
+                full_response += chunk + " "
+                time.sleep(0.09)

+                # add a blinking cursor to simulate typing
+                message_holder.markdown(full_response + "✏️ ")

+            # Display the response
+            message_holder.info(full_response)
+
+            # add response to session state
+            st.session_state.messages.append({"role": "assistant", "content": full_response})


+# Function to apply rounded edges using CSS
+def add_rounded_edges(image_path="./randstad_featuredimage.png", radius=30):
+    st.markdown(
+        f'<style>.rounded-img{{border-radius: {radius}px; overflow: hidden;}}</style>',
+        unsafe_allow_html=True,)
+    st.image(image_path, use_column_width=True, output_format='auto')

 def main():
+
+    st.set_page_config(page_title="RANDSTAD",
                        page_icon=":books:")
     st.write(css, unsafe_allow_html=True)

     st.subheader("🚀 A HR powered by Generative AI")

     # default model
+    st.session_state.model = "Google_PaLm"
+    # user_question = st.text_input("Ask a question about your documents:")

+    st.session_state.embeddings = FastEmbedEmbeddings( model_name= "BAAI/bge-base-en-v1.5", cache_dir="./embedding_model/")

+    if len(glob.glob("./vectordb/*.sqlite3")) > 0 :
+
+        vectorstore = Chroma(persist_directory="./vectordb/", embedding_function=st.session_state.embeddings)
+        st.session_state.conversation = get_conversation_chain(vectorstore)
+        handle_userinput()

+    # side bar information
     with st.sidebar:

         # calling a
         add_rounded_edges()

         st.subheader("Select Your Embedding Model Model")
+        LLM = list( glob.glob('./models/*.gguf') )
+        LLM.extend(["Open_AIGPT-3.5-Turbo", "Google_PaLm"])
+        st.session_state.model = st.selectbox( 'Models', LLM )

         st.subheader("Your documents")
         docs = st.file_uploader(
             "Upload File (pdf,text,csv...) and click 'Process'", accept_multiple_files=True)
+
         if st.button("Process"):
             with st.spinner("Processing"):
+                # get pdf text
+                doc_list = []

                 # using the helper function below lets load our dependencies
+
                 # Step 1 : Load the documents
+                for file in docs:
+                    print('file - type : ', file.type)
+                    if file.type == 'text/plain':
+                        # file is .txt
+                        doc_list.extend(get_text_file(file))
+                    elif file.type in ['application/octet-stream', 'application/pdf']:
+                        # file is .pdf
+                        doc_list.extend(get_pdf_text(file))
+                    elif file.type == 'text/csv':
+                        # file is .csv
+                        doc_list.extend(get_csv_file(file))

+                # Step 2 : Break them into Chunks
+                text_chunks = get_text_chunks(doc_list)

+                # Step 3 : Create Embeddings and save them to Vector DB
+                vectorstore = get_vectorstore(text_chunks)

+                # Step 4 : Get our conversation chain
+                st.session_state.conversation = get_conversation_chain(vectorstore)


 if __name__ == '__main__':
     command = 'CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir'

     # Run the command using subprocess

         print("Command executed successfully.")
     except subprocess.CalledProcessError as e:
         print(f"Error: {e}")

     main()
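
For reference, a minimal sketch of the pipeline this commit moves to (per-file loaders → chunking → a persisted Chroma store → a RetrievalQA chain), outside of the Streamlit UI. It reuses the same legacy langchain imports as app.py; the document path `./documents/policies.pdf` is a placeholder, and the GGUF model path is taken from the old default in this diff. This is an illustrative sketch, not code from the commit.

```python
# Hypothetical end-to-end run of the flow introduced in this commit.
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import FastEmbedEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA
from langchain import hub

# Step 1: load a document (app.py does this per uploaded file via a tempfile copy).
docs = PyPDFLoader("./documents/policies.pdf").load()  # placeholder path

# Step 2: split into overlapping chunks, mirroring get_text_chunks().
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.split_documents(docs)

# Step 3: embed and persist to Chroma, mirroring get_vectorstore().
embeddings = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5",
                                 cache_dir="./embedding_model/")
vectorstore = Chroma.from_documents(documents=chunks,
                                    embedding=embeddings,
                                    persist_directory="./vectordb/")

# Step 4: build the RetrievalQA chain, mirroring the local-model branch of
# get_conversation_chain() (model path reuses the old default from this diff).
llm = LlamaCpp(model_path="./models/mistral-7b-instruct-v0.2.Q5_K_M.gguf",
               n_ctx=4000, max_tokens=4000, verbose=True)
prompt = hub.pull("rlm/rag-prompt-mistral")
chain = RetrievalQA.from_chain_type(llm,
                                    retriever=vectorstore.as_retriever(),
                                    chain_type_kwargs={"prompt": prompt})

print(chain.run("How many paid leave days do employees get?"))
```

On a later run the persisted store can be reopened with `Chroma(persist_directory="./vectordb/", embedding_function=embeddings)`, which is the same `./vectordb/*.sqlite3` check that the new `main()` performs before rebuilding embeddings.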