fakezeta committed on
Commit
6feb027
1 Parent(s): 0983982

first release

Files changed (5):
  1. app.py +113 -0
  2. ingest_data.py +42 -0
  3. query_data.py +43 -0
  4. requirements.txt +9 -0
  5. style.css +23 -0
app.py ADDED
@@ -0,0 +1,113 @@
+ import streamlit as st
+ from streamlit_chat import message
+ from ingest_data import embed_doc
+ from query_data import get_chain
+ import os
+ import time
+
+ # Never commit real API keys; read from the environment if one is set.
+ # (The local LlamaCpp/TensorFlow Hub pipeline below does not call OpenAI.)
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "")
+
+ st.set_page_config(page_title="LangChain Local PDF Chat", page_icon=":robot:")
+
+ footer = """<style>
+ .footer {
+     position: fixed;
+     left: 0;
+     bottom: 0;
+     width: 100%;
+     background-color: white;
+     color: black;
+     text-align: right;
+ }
+ </style>
+ <div class="footer">
+ <p>Adapted with ❤ and \U0001F916 by Fakezeta from the original Mobilefirst</p>
+ </div>
+ """
+ st.markdown(footer, unsafe_allow_html=True)
+
+ def process_file(uploaded_file):
+     # Persist the upload to disk so PyPDFLoader can read it, then clean up.
+     with open(uploaded_file.name, "wb") as f:
+         f.write(uploaded_file.getbuffer())
+     st.write("File uploaded successfully")
+     with st.spinner("Document is being vectorized...."):
+         vectorstore = embed_doc(uploaded_file.name)
+     os.remove(uploaded_file.name)
+     return vectorstore
+
+ def get_text():
+     # Helper for the chat input box (not wired into the flow below).
+     input_text = st.text_input("You: ", value="", key="input",
+                                disabled=st.session_state.get("disabled", False))
+     return input_text
+
+ def query(query):
+     start = time.time()
+     with st.spinner("Doing magic...."):
+         # Pass only the last exchange as chat history to stay within n_ctx.
+         if len(st.session_state.past) > 0 and len(st.session_state.generated) > 0:
+             chat_history = [("HUMAN: " + st.session_state.past[-1],
+                              "ASSISTANT: " + st.session_state.generated[-1])]
+         else:
+             chat_history = []
+         print("chat_history:", chat_history)
+         # ConversationalRetrievalChain accepts only its declared input keys
+         # (question, chat_history); extra kwargs would raise an error.
+         output = st.session_state.chain.run(question=query,
+                                             chat_history=chat_history)
+     end = time.time()
+     print("Query time: " + str(round(end - start, 1)))
+     return output
+
+ with open("style.css") as f:
+     st.markdown("<style>{}</style>".format(f.read()), unsafe_allow_html=True)
+
+ st.header("Local Chat with PDF")
+
+ if "uploaded_file_name" not in st.session_state:
+     st.session_state.uploaded_file_name = ""
+ if "past" not in st.session_state:
+     st.session_state.past = []
+ if "generated" not in st.session_state:
+     st.session_state["generated"] = []
+ if "vectorstore" not in st.session_state:
+     st.session_state.vectorstore = None
+ if "chain" not in st.session_state:
+     st.session_state.chain = None
+
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf"])
+
+ if uploaded_file:
+     if uploaded_file.name != st.session_state.uploaded_file_name:
+         # A new document invalidates the cached vectorstore, chain and history.
+         st.session_state.vectorstore = None
+         st.session_state.chain = None
+         st.session_state["generated"] = []
+         st.session_state.past = []
+         st.session_state.uploaded_file_name = uploaded_file.name
+         st.session_state.all_messages = []
+     print(st.session_state.uploaded_file_name)
+     if not st.session_state.vectorstore:
+         st.session_state.vectorstore = process_file(uploaded_file)
+
+ if st.session_state.vectorstore and not st.session_state.chain:
+     with st.spinner("Loading Large Language Model...."):
+         st.session_state.chain = get_chain(st.session_state.vectorstore)
+
+ searching = False
+ user_input = st.text_input("You: ", value="", key="input", disabled=searching)
+ send_button = st.button(label="Query")
+ if send_button and st.session_state.chain:  # ignore clicks before a document is loaded
+     searching = True
+     output = query(user_input)
+     searching = False
+     st.session_state.past.append(user_input)
+     st.session_state.generated.append(output)
+ if st.session_state["generated"]:
+     # Render the conversation newest-first.
+     for i in range(len(st.session_state["generated"]) - 1, -1, -1):
+         message(st.session_state["generated"][i], key=str(i))
+         message(st.session_state.past[i], is_user=True, key=str(i) + "_user")
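
For quick experiments outside the Streamlit UI, the two modules above can be driven from a plain Python script. A minimal sketch, assuming a local `sample.pdf` (hypothetical name); the `st.*` calls inside `embed_doc` only log warnings when no Streamlit session is running:

```python
# Minimal console driver for the same pipeline (sketch, not part of this commit).
from ingest_data import embed_doc
from query_data import get_chain

vectorstore = embed_doc("sample.pdf")   # hypothetical local PDF
chain = get_chain(vectorstore)          # downloads the model on first run

chat_history = []
while True:
    question = input("You: ")
    if not question:
        break
    answer = chain.run(question=question, chat_history=chat_history)
    chat_history = [(question, answer)]  # keep only the last turn, as app.py does
    print("Bot:", answer)
```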
ingest_data.py ADDED
@@ -0,0 +1,42 @@
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.document_loaders import PyPDFLoader
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import TensorflowHubEmbeddings
+ import time
+ import streamlit as st
+
+ def embed_doc(filename):
+     # Load the PDF and split it into ~1000-character chunks.
+     loader = PyPDFLoader(filename)
+     start = time.time()
+     raw_documents = loader.load()
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=0,
+         length_function=len,
+     )
+     documents = text_splitter.split_documents(raw_documents)
+     end = time.time()
+     st.text("Load and split text: " + str(round(end - start, 1)))
+
+     # Load data into the vectorstore. Embedding backends tried earlier are
+     # kept below for reference:
+     # embeddings = LlamaCppEmbeddings(model_path="ggml-model.bin")
+     # embeddings = HuggingFaceEmbeddings(model_name="diptanuc/all-mpnet-base-v2", model_kwargs={'device': 'cpu'})
+     # embeddings = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder/4")
+     # embeddings = HuggingFaceEmbeddings(model_name="obrizum/all-MiniLM-L6-v2", model_kwargs={'device': 'cpu'})
+     start = time.time()
+     embeddings = TensorflowHubEmbeddings(model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3")
+     end = time.time()
+     st.text("Embedding time: " + str(round(end - start, 1)))
+
+     start = time.time()
+     vectorstore = Chroma.from_documents(documents, embeddings)
+     end = time.time()
+     st.text("Vectorizing time: " + str(round(end - start, 1)))
+     return vectorstore
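
The returned object is a standard LangChain `Chroma` store, so retrieval quality can be checked directly, without the LLM. A small sketch, assuming a hypothetical `manual.pdf`:

```python
from ingest_data import embed_doc

vectorstore = embed_doc("manual.pdf")  # hypothetical file name
# similarity_search embeds the query with the same Universal Sentence Encoder
# model and returns the k closest chunks.
for doc in vectorstore.similarity_search("What is the warranty period?", k=2):
    print(doc.metadata.get("page"), doc.page_content[:120])
```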
query_data.py ADDED
@@ -0,0 +1,43 @@
+ from langchain.prompts.prompt import PromptTemplate
+ from langchain.llms import LlamaCpp
+ from langchain.chains import ConversationalRetrievalChain
+ from langchain.memory import ConversationBufferMemory
+ from huggingface_hub import hf_hub_download
+
+ import psutil
+ import os
+
+ # _template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
+ # You can assume the question is about the uploaded document.
+ #
+ # Chat History:
+ # {chat_history}
+ # Follow Up Input: {question}
+ # Standalone question:"""
+ # CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)
+
+ # template = """You are an AI assistant for answering questions about the uploaded document.
+ # You are given the following extracted parts of a long document and a question. Provide a conversational answer.
+ # If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
+ # If the question is not about the uploaded document, politely inform them that you are tuned to only answer questions about the uploaded document.
+ # =========
+ # {context}
+ # =========
+ # Question: {question}
+
+ # Answer in Markdown:"""
+ # QA_PROMPT = PromptTemplate(template=template, input_variables=["question", "context"])
+
+ def get_chain(vectorstore):
+     # Fetch the quantized Vicuna 7B weights on first run.
+     if not os.path.exists("ggml-vic7b-q5_1.bin"):
+         hf_hub_download(repo_id="eachadea/ggml-vicuna-7b-1.1",
+                         filename="ggml-vic7b-q5_1.bin", local_dir=".")
+     # n_threads must be an integer, hence the floor division of physical cores.
+     llm = LlamaCpp(model_path="ggml-vic7b-q5_1.bin", n_ctx=2048,
+                    n_threads=psutil.cpu_count(logical=False) // 2)
+     qa_chain = ConversationalRetrievalChain.from_llm(
+         llm,
+         vectorstore.as_retriever(),
+         # condense_question_prompt=CONDENSE_QUESTION_PROMPT,
+     )
+     return qa_chain
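
If the commented-out condense prompt is re-enabled, it plugs into the chain through the `condense_question_prompt` argument. The sketch below shows one way to wire it; `get_chain_with_condense` is a hypothetical helper, not part of this commit:

```python
import psutil
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import LlamaCpp
from langchain.prompts.prompt import PromptTemplate

_template = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
You can assume the question is about the uploaded document.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def get_chain_with_condense(vectorstore):
    # Hypothetical variant of get_chain(): follow-up questions are rewritten
    # into standalone ones before retrieval, which helps multi-turn recall.
    llm = LlamaCpp(model_path="ggml-vic7b-q5_1.bin", n_ctx=2048,
                   n_threads=psutil.cpu_count(logical=False) // 2)
    return ConversationalRetrievalChain.from_llm(
        llm,
        vectorstore.as_retriever(),
        condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    )
```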
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ langchain
+ typing-extensions>=4.5.0
+ llama-cpp-python
+ streamlit_chat
+ pypdf
+ chromadb
+ tensorflow_text
+ psutil
+ huggingface-hub
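
With these dependencies in place (`pip install -r requirements.txt`), the app starts the usual Streamlit way with `streamlit run app.py`; note that `streamlit` itself is not listed here and arrives as a dependency of `streamlit_chat`.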
style.css ADDED
@@ -0,0 +1,23 @@
+ .main {
+     background-color: black; /* You can change the color to your preference */
+     color: white;
+ }
+
+ /* Change the background color of the sidebar */
+ .sidebar .block-container {
+     background-color: black; /* You can change the color to your preference */
+ }
+
+ .footer {
+     position: fixed;
+     left: 0;
+     bottom: 0;
+     width: 100%;
+     background-color: black;
+     color: white;
+     text-align: right;
+ }
+
+ h1, h2, h3, h4, h5, h6, p, label, .stMarkdown, .sidebar .block-container {
+     color: white;
+ }