fakezeta committed on
Commit
6feb027
1 Parent(s): 0983982

first release

Files changed (5):
  1. app.py +113 -0
  2. ingest_data.py +42 -0
  3. query_data.py +43 -0
  4. requirements.txt +9 -0
  5. style.css +23 -0
app.py ADDED
@@ -0,0 +1,113 @@
+ import streamlit as st
+ from streamlit_chat import message
+ from ingest_data import embed_doc
+ from query_data import get_chain
+ import os
+ import time
+
+ # Never commit real API keys; read from the environment if one is set.
+ # (The local LlamaCpp/TensorFlow Hub pipeline below does not call OpenAI.)
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "")
+
+ st.set_page_config(page_title="LangChain Local PDF Chat", page_icon=":robot:")
+
+ footer = """<style>
+ .footer {
+     position: fixed;
+     left: 0;
+     bottom: 0;
+     width: 100%;
+     background-color: white;
+     color: black;
+     text-align: right;
+ }
+ </style>
+ <div class="footer">
+ <p>Adapted with ❤ and \U0001F916 by Fakezeta from the original Mobilefirst</p>
+ </div>
+ """
+ st.markdown(footer, unsafe_allow_html=True)
+
+ def process_file(uploaded_file):
+     # Persist the upload to disk so PyPDFLoader can read it, then clean up.
+     with open(uploaded_file.name, "wb") as f:
+         f.write(uploaded_file.getbuffer())
+     st.write("File uploaded successfully")
+     with st.spinner("Document is being vectorized...."):
+         vectorstore = embed_doc(uploaded_file.name)
+     os.remove(uploaded_file.name)
+     return vectorstore
+
+ def get_text():
+     # Helper for the chat input box (not wired into the flow below).
+     input_text = st.text_input("You: ", value="", key="input",
+                                disabled=st.session_state.get("disabled", False))
+     return input_text
+
+ def query(query):
+     start = time.time()
+     with st.spinner("Doing magic...."):
+         # Pass only the last exchange as chat history to stay within n_ctx.
+         if len(st.session_state.past) > 0 and len(st.session_state.generated) > 0:
+             chat_history = [("HUMAN: " + st.session_state.past[-1],
+                              "ASSISTANT: " + st.session_state.generated[-1])]
+         else:
+             chat_history = []
+         print("chat_history:", chat_history)
+         # ConversationalRetrievalChain accepts only its declared input keys
+         # (question, chat_history); extra kwargs would raise an error.
+         output = st.session_state.chain.run(question=query,
+                                             chat_history=chat_history)
+     end = time.time()
+     print("Query time: " + str(round(end - start, 1)))
+     return output
+
+ with open("style.css") as f:
+     st.markdown("<style>{}</style>".format(f.read()), unsafe_allow_html=True)
+
+ st.header("Local Chat with PDF")
+
+ if "uploaded_file_name" not in st.session_state:
+     st.session_state.uploaded_file_name = ""
+ if "past" not in st.session_state:
+     st.session_state.past = []
+ if "generated" not in st.session_state:
+     st.session_state["generated"] = []
+ if "vectorstore" not in st.session_state:
+     st.session_state.vectorstore = None
+ if "chain" not in st.session_state:
+     st.session_state.chain = None
+
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf"])
+
+ if uploaded_file:
+     if uploaded_file.name != st.session_state.uploaded_file_name:
+         # A new document invalidates the cached vectorstore, chain and history.
+         st.session_state.vectorstore = None
+         st.session_state.chain = None
+         st.session_state["generated"] = []
+         st.session_state.past = []
+         st.session_state.uploaded_file_name = uploaded_file.name
+         st.session_state.all_messages = []
+     print(st.session_state.uploaded_file_name)
+     if not st.session_state.vectorstore:
+         st.session_state.vectorstore = process_file(uploaded_file)
+
+ if st.session_state.vectorstore and not st.session_state.chain:
+     with st.spinner("Loading Large Language Model...."):
+         st.session_state.chain = get_chain(st.session_state.vectorstore)
+
+ searching = False
+ user_input = st.text_input("You: ", value="", key="input", disabled=searching)
+ send_button = st.button(label="Query")
+ if send_button and st.session_state.chain:  # ignore clicks before a document is loaded
+     searching = True
+     output = query(user_input)
+     searching = False
+     st.session_state.past.append(user_input)
+     st.session_state.generated.append(output)
+ if st.session_state["generated"]:
+     # Render the conversation newest-first.
+     for i in range(len(st.session_state["generated"]) - 1, -1, -1):
+         message(st.session_state["generated"][i], key=str(i))
+         message(st.session_state.past[i], is_user=True, key=str(i) + "_user")
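
For quick experiments outside the Streamlit UI, the two modules above can be driven from a plain Python script. A minimal sketch, assuming a local `sample.pdf` (hypothetical name); the `st.*` calls inside `embed_doc` only log warnings when no Streamlit session is running:

```python
# Minimal console driver for the same pipeline (sketch, not part of this commit).
from ingest_data import embed_doc
from query_data import get_chain

vectorstore = embed_doc("sample.pdf")   # hypothetical local PDF
chain = get_chain(vectorstore)          # downloads the model on first run

chat_history = []
while True:
    question = input("You: ")
    if not question:
        break
    answer = chain.run(question=question, chat_history=chat_history)
    chat_history = [(question, answer)]  # keep only the last turn, as app.py does
    print("Bot:", answer)
```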
ingest_data.py ADDED
@@ -0,0 +1,42 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import TensorflowHubEmbeddings
+ import time
+ import streamlit as st
+
+ def embed_doc(filename):
+     # Load the PDF and split it into ~1000-character chunks.
+     loader = PyPDFLoader(filename)
+     start = time.time()
+     raw_documents = loader.load()
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=0,
+         length_function=len,
+     )
+     documents = text_splitter.split_documents(raw_documents)
+     end = time.time()
+     st.text("Load and split text: " + str(round(end - start, 1)))
+
+     # Load data into the vectorstore. Embedding backends tried earlier are
+     # kept below for reference:
+     # embeddings = LlamaCppEmbeddings(model_path="ggml-model.bin")
+     # embeddings = HuggingFaceEmbeddings(model_name="diptanuc/all-mpnet-base-v2", model_kwargs={'device': 'cpu'})
+     # embeddings = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder/4")
+     # embeddings = HuggingFaceEmbeddings(model_name="obrizum/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
+     start = time.time()
+     embeddings = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3")
+     end = time.time()
+     st.text("Embedding time: " + str(round(end - start, 1)))
+
+     start = time.time()
+     vectorstore = Chroma.from_documents(documents, embeddings)
+     end = time.time()
+     st.text("Vectorizing time: " + str(round(end - start, 1)))
+     return vectorstore
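
The returned object is a standard LangChain `Chroma` store, so retrieval quality can be checked directly, without the LLM. A small sketch, assuming a hypothetical `manual.pdf`:

```python
from ingest_data import embed_doc

vectorstore = embed_doc("manual.pdf")  # hypothetical file name
# similarity_search embeds the query with the same Universal Sentence Encoder
# model and returns the k closest chunks.
for doc in vectorstore.similarity_search("What is the warranty period?", k=2):
    print(doc.metadata.get("page"), doc.page_content[:120])
```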
query_data.py ADDED
@@ -0,0 +1,43 @@
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.llms import LlamaCpp
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferMemory
+ from huggingface_hub import hf_hub_download
+
+ import psutil
+ import os
+
+ # _template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
+ # You can assume the question is about the uploaded document.
+ #
+ # Chat History:
+ # {chat_history}
+ # Follow Up Input: {question}
+ # Standalone question:"""
+ # CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+
+ # template = """You are an AI assistant for answering questions about the uploaded document.
+ # You are given the following extracted parts of a long document and a question. Provide a conversational answer.
+ # If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+ # If the question is not about the uploaded document, politely inform them that you are tuned to only answer questions about the uploaded document.
+ # =========
+ # {context}
+ # =========
+ # Question: {question}
+
+ # Answer in Markdown:"""
+ # QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
+
+ def get_chain(vectorstore):
+     # Fetch the quantized Vicuna 7B weights on first run.
+     if not os.path.exists("ggml-vic7b-q5_1.bin"):
+         hf_hub_download(repo_id="eachadea/ggml-vicuna-7b-1.1",
+                         filename="ggml-vic7b-q5_1.bin", local_dir=".")
+     # n_threads must be an integer, hence the floor division of physical cores.
+     llm = LlamaCpp(model_path="ggml-vic7b-q5_1.bin", n_ctx=2048,
+                    n_threads=psutil.cpu_count(logical=False) // 2)
+     qa_chain = ConversationalRetrievalChain.from_llm(
+         llm,
+         vectorstore.as_retriever(),
+         # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+     )
+     return qa_chain
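
If the commented-out condense prompt is re-enabled, it plugs into the chain through the `condense_question_prompt` argument. The sketch below shows one way to wire it; `get_chain_with_condense` is a hypothetical helper, not part of this commit:

```python
import psutil
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import LlamaCpp
from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
You can assume the question is about the uploaded document.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def get_chain_with_condense(vectorstore):
    # Hypothetical variant of get_chain(): follow-up questions are rewritten
    # into standalone ones before retrieval, which helps multi-turn recall.
    llm = LlamaCpp(model_path="ggml-vic7b-q5_1.bin", n_ctx=2048,
                   n_threads=psutil.cpu_count(logical=False) // 2)
    return ConversationalRetrievalChain.from_llm(
        llm,
        vectorstore.as_retriever(),
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
```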
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain
+ typing-extensions>=4.5.0
+ llama-cpp-python
+ streamlit_chat
+ pypdf
+ chromadb
+ tensorflow_text
+ psutil
+ huggingface-hub
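
With these dependencies in place (`pip install -r requirements.txt`), the app starts the usual Streamlit way with `streamlit run app.py`; note that `streamlit` itself is not listed here and arrives as a dependency of `streamlit_chat`.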
style.css ADDED
@@ -0,0 +1,23 @@
+ .main {
+     background-color: black; /* You can change the color to your preference */
+     color: white;
+ }
+
+ /* Change the background color of the sidebar */
+ .sidebar .block-container {
+     background-color: black; /* You can change the color to your preference */
+ }
+
+ .footer {
+     position: fixed;
+     left: 0;
+     bottom: 0;
+     width: 100%;
+     background-color: black;
+     color: white;
+     text-align: right;
+ }
+
+ h1, h2, h3, h4, h5, h6, p, label, .stMarkdown, .sidebar .block-container {
+     color: white;
+ }