KrishnaKumar23 committed on
Commit
84ddfaa
1 Parent(s): e4941eb

initial commit

Browse files
Files changed (4) hide show
  1. app.py +99 -0
  2. llm_model.py +92 -0
  3. requirements.txt +13 -0
  4. sidebar.py +60 -0
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_lottie import st_lottie
3
+ import fitz # PyMuPDF
4
+ import requests
5
+ import os, shutil
6
+ import sidebar
7
+ import llm_model
8
+
9
@st.cache_data(experimental_allow_widgets=True)
def index_document(uploaded_file):
    """Save an uploaded file under ``assets/uploaded_files`` and index it.

    Args:
        uploaded_file: The value returned by ``st.file_uploader``: ``None``
            when nothing has been uploaded, otherwise an object exposing
            ``name`` and ``getvalue()``.

    Returns:
        The path of the saved copy, or ``None`` when no file was supplied.
    """
    if uploaded_file is None:
        return None

    assets_folder = "assets/uploaded_files"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(assets_folder, exist_ok=True)

    # The name comes from the client; keep only its last path component so
    # the file is always written inside assets_folder.
    safe_name = os.path.basename(uploaded_file.name)
    file_name = os.path.join(assets_folder, safe_name)
    with open(file_name, "wb") as f:
        f.write(uploaded_file.getvalue())

    st.success(f"File '{file_name}' uploaded !")

    with st.spinner("Indexing document... This is a free CPU version and may take a while⏳"):
        # instructor_embeddings is the module-level name assigned from
        # load_model() by the script body before this function is called.
        llm_model.create_vector_db(file_name, instructor_embeddings)

    return file_name
32
+
33
+
34
def load_lottieurl(url: str, timeout: float = 10.0):
    """Download a Lottie animation and return its decoded JSON.

    Args:
        url: Address of the Lottie JSON document.
        timeout: Seconds to wait for the server before giving up, so a slow
            or unreachable host cannot block the page indefinitely.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        response status is not 200, or the body is not valid JSON.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are treated like a non-200 reply.
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # A 200 response with a non-JSON body is also unusable.
        return None
39
+
40
+
41
def is_query_valid(query: str) -> bool:
    """Tell whether *query* contains a question worth sending to the chain.

    An empty, ``None`` or whitespace-only query is rejected and an error is
    shown through ``st.error``.

    Args:
        query: Raw text taken from the question box.

    Returns:
        ``True`` when the query has non-whitespace content, else ``False``.
    """
    # A string of only spaces/newlines is truthy, so strip before testing.
    if not query or not query.strip():
        st.error("Please enter a question!")
        return False
    return True
46
+
47
+
48
# Model parameters are fetched through a Streamlit resource cache.
@st.cache_resource()
def load_model():
    """Return the result of ``llm_model.load_model_params()``.

    The call is wrapped in ``st.cache_resource`` so the returned objects are
    cached by Streamlit rather than rebuilt on every invocation.
    """
    model_params = llm_model.load_model_params()
    return model_params
52
+
53
st.set_page_config(page_title="Document QA Bot")
lottie_book = load_lottieurl("https://assets4.lottiefiles.com/temp/lf20_aKAfIn.json")
# load_lottieurl returns None when the download fails; the animation is
# decorative, so skip it instead of handing None to st_lottie.
if lottie_book is not None:
    st_lottie(lottie_book, speed=1, height=200, key="initial")
# Place the title below the Lottie animation
st.title("PDF Q&A Bot 🤖")

# Left Sidebar
sidebar.sidebar()

# load model parameters
llm, instructor_embeddings = load_model()
# Upload file through Streamlit
uploaded_file = st.file_uploader("Upload a file", type=["pdf", "doc", "docx", "txt"])

filename = index_document(uploaded_file)

# Nothing uploaded yet: stop here so the question form is not rendered.
if not filename:
    st.stop()


with st.form(key="qa_form"):
    query = st.text_area("Ask a question about the document")
    submit = st.form_submit_button("Submit")

if submit:
    if not is_query_valid(query):
        st.stop()

    # Output Columns
    answer_col, sources_col = st.columns(2)

    qa_chain = llm_model.document_parser(instructor_embeddings, llm)
    result = qa_chain(query)
    answer = result["result"]

    with answer_col:
        st.markdown("#### Answer")
        st.markdown(answer)

    with sources_col:
        st.markdown("#### Sources")
        # The prompt tells the model to reply "I don't know." when the
        # context has no answer; sources are hidden in that case.
        if "i don't know" not in answer.lower():
            for source in result["source_documents"]:
                st.markdown(source.page_content)
                st.markdown(source.metadata["source"])
                st.markdown("--------------------------")
llm_model.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.vectorstores import FAISS
2
+ from langchain.llms import GooglePalm
3
+ from langchain.document_loaders import PyPDFLoader
4
+ from langchain.document_loaders import TextLoader
5
+ from langchain.document_loaders import Docx2txtLoader
6
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
7
+ from langchain.prompts import PromptTemplate
8
+ from langchain.chains import RetrievalQA
9
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
10
+ import os
11
+ from dotenv import load_dotenv
12
+
13
+ vector_index_path = "assets/vectordb/faiss_index"  # FAISS index folder: saved by create_vector_db, loaded by get_qa_chain
14
+
15
+
16
def load_env_variables():
    """Populate the process environment from a ``.env`` file via ``load_dotenv``."""
    load_dotenv()
18
+
19
+
20
def create_vector_db(filename, instructor_embeddings):
    """Build a FAISS index from a document and save it to ``vector_index_path``.

    The loader is chosen from the file name's suffix, compared
    case-insensitively so that e.g. ``REPORT.PDF`` is accepted.

    Args:
        filename: Path of the document to index (pdf, doc/docx or txt).
        instructor_embeddings: Embedding object passed to
            ``FAISS.from_documents``.

    Raises:
        ValueError: If the suffix matches none of the supported types.
    """
    lowered = filename.lower()
    if lowered.endswith(".pdf"):
        loader = PyPDFLoader(file_path=filename)
    elif lowered.endswith((".doc", ".docx")):
        loader = Docx2txtLoader(filename)
    elif lowered.endswith("txt"):
        loader = TextLoader(filename)
    else:
        # Without this branch `loader` would be unbound further down.
        raise ValueError(f"Unsupported file type: {filename!r}")

    # Split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=10)
    splits = text_splitter.split_documents(loader.load())

    # Create a FAISS instance for vector database from the split chunks
    vectordb = FAISS.from_documents(documents=splits,
                                    embedding=instructor_embeddings)

    # Save vector database locally
    vectordb.save_local(vector_index_path)
41
+
42
+
43
def get_qa_chain(instructor_embeddings, llm):
    """Assemble a ``RetrievalQA`` chain over the saved FAISS index.

    Args:
        instructor_embeddings: Embedding object handed to ``FAISS.load_local``.
        llm: Language model passed to ``RetrievalQA.from_chain_type``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type``, configured to
        return source documents alongside the answer.
    """
    # Re-open the index stored at vector_index_path and expose it as a
    # similarity-search retriever.
    store = FAISS.load_local(vector_index_path, instructor_embeddings)
    doc_retriever = store.as_retriever(search_type="similarity")

    prompt_template = """
    You are a question answer agent and you must strictly follow below prompt template.
    Given the following context and a question, generate an answer based on this context only.
    In the answer try to provide as much text as possible from "response" section in the source document context without making much changes.
    Keep answers brief and well-structured. Do not give one word answers.
    If the answer is not found in the context, kindly state "I don't know." Don't try to make up an answer.

    CONTEXT: {context}

    QUESTION: {question}"""

    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_template,
    )

    chain_options = {
        "llm": llm,
        "chain_type": "stuff",  # or map-reduce
        "retriever": doc_retriever,
        "input_key": "query",
        "return_source_documents": True,  # return source document from the vector db
        "chain_type_kwargs": {"prompt": qa_prompt},
        "verbose": True,
    }
    return RetrievalQA.from_chain_type(**chain_options)
75
+
76
+
77
def load_model_params():
    """Create the LLM and the embedding model used by the app.

    Environment variables are loaded first; ``GOOGLE_API_KEY`` is then read
    from ``os.environ`` (a missing key raises ``KeyError``).

    Returns:
        A ``(llm, instructor_embeddings)`` tuple.
    """
    load_env_variables()

    # Google Palm LLM model
    api_key = os.environ["GOOGLE_API_KEY"]
    palm_llm = GooglePalm(google_api_key=api_key, temperature=0.1)

    # Instructor embeddings from the Hugging Face model
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")

    return palm_llm, embeddings
86
+
87
+
88
def document_parser(instructor_embeddings, llm):
    """Return the QA chain from ``get_qa_chain`` for the given embeddings and LLM."""
    return get_qa_chain(instructor_embeddings=instructor_embeddings, llm=llm)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.284
2
+ python-dotenv==1.0.0
3
+ tiktoken==0.4.0
4
+ faiss-cpu==1.7.4
5
+ protobuf~=3.19.0
6
+ pypdf
7
+ google-generativeai
8
+ InstructorEmbedding
9
+ sentence-transformers
10
+ streamlit
11
+ frontend
12
+ tools
13
+ docx2txt
sidebar.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from streamlit_lottie import st_lottie
3
+
4
+
5
def faq():
    """Render the FAQ section as Markdown via ``st.markdown``.

    The list of accepted formats is kept in line with the file uploader and
    the sidebar's usage steps (pdf, word, txt); csv is not accepted.
    """
    # NOTE(review): the "Is my data safe?" answer says documents are not
    # stored, but app.py writes each upload to assets/uploaded_files and no
    # cleanup is visible there. Confirm deletion happens or reword the answer.
    st.markdown(
        """
# FAQ
## How does Document Q&A Bot work?
When you upload a document (in Pdf, word or txt format), it will be divided into smaller chunks
and stored in a special type of database called a vector index
that allows for semantic search and retrieval.

When you ask a question, our Q&A bot will first look through the document chunks and find the
most relevant ones using the vector index. Then, it will use open-source LLM model named Google Palm
and will provide the final answer.

## Is my data safe?
Yes, your data is safe. Our bot does not store your documents or
questions. All uploaded data is deleted after you close the browser tab.

## Why does it take so long to index my document?
Since, this is a sample QA bot project that uses open-source model
and doesn't have much resource capabilities like GPU, it may take time
to index your document based on the size of the document.

## Are the answers 100% accurate?
No, the answers are not 100% accurate.
But for most use cases, our QA bot is very accurate and can answer
most questions. Always check with the sources to make sure that the answers
are correct.
"""
    )
34
+
35
+
36
def sidebar():
    """Draw the sidebar: provider heading, usage steps, about text and FAQ."""
    usage_steps = (
        "## How to use QA bot\n"
        "1. Upload a pdf, docx, or a txt file📄\n"
        "2. Ask questions about the document💬\n"
    )
    about_blurb = (
        "🤖 QA bot allows you to ask questions about your "
        "documents and get accurate answers with citations. "
    )
    author_credit = "Created by [Krishna Kumar](https://www.linkedin.com/in/krishna-kumar-yadav-726831105/)"

    with st.sidebar:
        st.markdown("## Google Palm")
        st.success('API key already provided!', icon='✅')

        st.markdown(usage_steps)

        st.markdown("---")
        st.markdown("# About")
        st.markdown(about_blurb)

        st.markdown(author_credit)
        st.markdown("---")

        faq()