Krishnachaitanya2004 committed
Commit 99cdfe6 • Parent(s): 0a1dd02

Publish Document Chatbot to Hugging Face

Files changed:
- document_chatbot.py +122 -0
- requirements.txt +5 -0
document_chatbot.py
ADDED
@@ -0,0 +1,122 @@
# !pip install langchain
# !pip install sentence-transformers
# !pip install accelerate
# !pip install chromadb
# !pip install "unstructured[all-docs]"

from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
import streamlit as st
import os


def main_process(uploaded_file):
    # st.file_uploader returns an UploadedFile object, not a dict
    file_name = uploaded_file.name

    # Create a temporary directory
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)

    # Save the uploaded file to the temporary directory
    temp_path = os.path.join(temp_dir, file_name)
    with open(temp_path, "wb") as temp_file:
        temp_file.write(uploaded_file.getbuffer())

    # Process the uploaded file
    loader = UnstructuredFileLoader(temp_path)
    documents = loader.load()
    # Debug: dump the extracted text to the console
    for document in documents:
        print(document.page_content)

    # We can't feed the whole PDF to the model at once, so we split it into chunks.
    # CharacterTextSplitter produces chunks of 1000 characters that overlap by
    # 400 characters (you can change these values to suit your needs).
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=400)
    texts = text_splitter.split_documents(documents)

    # SentenceTransformerEmbeddings embeds the text chunks; the embeddings are
    # used to measure similarity between the query and the chunks.
    # We use the multi-qa-mpnet-base-dot-v1 model, and persist_directory tells
    # Chroma where to save the embeddings on disk.
    embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
    persist_directory = "chroma"  # relative path; the original Colab path /content/chroma/ does not exist outside Colab

    # Chroma stores the embeddings; from_documents embeds the chunks and
    # writes the index under persist_directory.
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)

    # To save and reload the vector store later (if needed):
    # db.persist()
    # db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

    # Initialize the tokenizer and base model for text generation
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float32,
    )

    pipe = pipeline(
        "text2text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
    )

    # Wrap the local pipeline so LangChain can use it as an LLM
    local_llm = HuggingFacePipeline(pipeline=pipe)
    # Create a RetrievalQA chain that stuffs the top-2 most similar chunks into the prompt
    qa_chain = RetrievalQA.from_chain_type(
        llm=local_llm,
        chain_type="stuff",
        retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
        return_source_documents=True,
    )
    return qa_chain


st.title("Document Chatbot")
st.write("Upload a PDF file to get started")

uploaded_file = st.file_uploader("Choose a file", type=["pdf"])

if uploaded_file is not None:
    qa_chain = main_process(uploaded_file)
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Display chat messages from history on app rerun
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Accept user input
    if prompt := st.chat_input("What is up?"):
        # Add user message to chat history
        st.session_state.messages.append({"role": "user", "content": prompt})
        # Display user message in chat message container
        with st.chat_message("user"):
            st.markdown(prompt)
        # The chain returns a dict; the answer text is under the "result" key
        with st.chat_message("assistant"):
            response = qa_chain(prompt)["result"]
            st.markdown(response)
        st.session_state.messages.append({"role": "assistant", "content": response})
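Since the vector store is persisted to disk, it can be reloaded in a later session without re-embedding the document. Below is a minimal sketch (not part of this commit), assuming main_process() above has already written the index to the "chroma" directory; the query string is illustrative only.

# Sketch: reload the persisted Chroma index and run a similarity search
# directly, without the Streamlit UI. Assumes the index exists in "chroma".
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
db = Chroma(persist_directory="chroma", embedding_function=embeddings)

# Fetch the two most similar chunks for a query, mirroring the chain's retriever (k=2)
docs = db.similarity_search("What is this document about?", k=2)
for doc in docs:
    print(doc.page_content[:200])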
requirements.txt
ADDED
@@ -0,0 +1,5 @@
unstructured==0.11.8
langchain==0.0.336
sentence-transformers==2.2.2
accelerate==0.25.0
chromadb==0.4.22
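To try the app locally (standard Streamlit usage, not spelled out in the commit itself), install the pinned dependencies plus Streamlit, then launch the script:

pip install -r requirements.txt streamlit
streamlit run document_chatbot.py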