Vageesh1 committed on
Commit
2507f18
1 Parent(s): 8e3047c

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +105 -0
  2. helper.py +83 -0
  3. requirements.txt +11 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import streamlit as st
3
+ from streamlit_chat import message
4
+
5
+ import torch
6
+ import torch.nn
7
+
8
+ import transformers
9
+ from transformers import (
10
+ AutoModelForCausalLM,
11
+ AutoTokenizer,
12
+ BitsAndBytesConfig,
13
+ HfArgumentParser,
14
+ TrainingArguments,
15
+ pipeline,
16
+ logging,
17
+ )
18
+
19
+
20
+ import pandas as pd
21
+ import numpy as np
22
+ import os
23
+ import io
24
+
25
+ from langchain.document_loaders import TextLoader
26
+ from langchain import PromptTemplate
27
+ from langchain.text_splitter import CharacterTextSplitter
28
+ from langchain.document_loaders import PyPDFLoader
29
+ from langchain.embeddings import HuggingFaceEmbeddings
30
+ from langchain.vectorstores import FAISS
31
+ from langchain.chains.question_answering import load_qa_chain
32
+ from langchain.chains import RetrievalQA
33
+ from langchain import HuggingFacePipeline
34
+
35
+ from helper import conversational_chat,pdf_loader,splitDoc,makeEmbeddings,create_flan_t5_base,conversational_chat
36
+
37
+
38
+
39
+
40
+
41
def ui():
    '''Render the page: upload a PDF, index it, and open the chat on it.

    Nothing is loaded or indexed until a file has actually been uploaded;
    st.file_uploader returns None until then, so reading from it eagerly
    would raise AttributeError on the first render.

    Raises:
        KeyError: if the HUGGINGFACE_HUB_TOKEN environment variable is unset.
    '''
    st.title('PDF Question Answer Bot')
    # Fail fast when the token is not configured. The value itself is not
    # used anywhere else in this function.
    _ = os.environ["HUGGINGFACE_HUB_TOKEN"]

    uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf"])

    # Create an empty container to hold the PDF loader section
    pdf_loader_container = st.empty()

    # Nothing to index yet: stop here instead of dereferencing None.
    if uploaded_file is None:
        return

    print("The file has been uploaded successfully")

    # Saving the uploaded pdf file. getvalue() returns the whole buffer
    # regardless of the current read position of the uploaded file object.
    save_path = "./uploaded_file.pdf"
    with open(save_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    # Loading the pdf file and chunking it: makeEmbeddings documents its
    # input as chunked docs, so split before embedding.
    pdf_doc = pdf_loader(save_path)
    chunked_docs = splitDoc(pdf_doc)
    if not chunked_docs:
        # An image-only/empty PDF yields nothing to embed; building a vector
        # store from an empty list would fail further down.
        st.warning("No extractable text was found in the uploaded PDF.")
        return

    # The model is only built once there is a document to query.
    llm = create_flan_t5_base(load_in_8bit=False)
    hf_llm = HuggingFacePipeline(pipeline=llm)

    vector_database = makeEmbeddings(chunked_docs)
    # Making the retriever of the vector database
    retriever = vector_database.as_retriever(search_kwargs={"k": 4})
    qa_chain = RetrievalQA.from_chain_type(
        llm=hf_llm, chain_type="stuff", retriever=retriever
    )

    # Hide the PDF loader interface when the file is uploaded
    pdf_loader_container.empty()
    # Show the chat interface
    show_chat_interface(qa_chain)
70
+
71
def show_chat_interface(qa_chain):
    '''Draw the chat form and the running conversation for the given chain.

    Params:
        qa_chain: The chain handed to conversational_chat for each query
    '''
    state = st.session_state

    # Seed the per-session lists on first use only.
    initial_values = {
        'history': [],
        'generated': ["Hello ! Ask me anything about the Uploaded PDF " + " 🤗"],
        'past': ["Hey ! 👋"],
    }
    for state_key, initial in initial_values.items():
        if state_key not in state:
            state[state_key] = initial

    # Created first so the conversation is rendered above the input form.
    response_container = st.container()
    # Container for the user's text input
    container = st.container()

    with container, st.form(key='my_form', clear_on_submit=True):
        user_input = st.text_input(
            "Query:",
            placeholder="Talk about your PDF data here (:",
            key='input',
        )
        submit_button = st.form_submit_button(label='Send')

    if submit_button and user_input:
        output = conversational_chat(qa_chain, user_input)
        state['past'].append(user_input)
        state['generated'].append(output)

    if state['generated']:
        with response_container:
            for turn, _ in enumerate(state['generated']):
                message(
                    state["past"][turn],
                    is_user=True,
                    key=f"{turn}_user",
                    avatar_style="big-smile",
                )
                message(
                    state["generated"][turn],
                    key=str(turn),
                    avatar_style="thumbs",
                )
102
+
103
+
104
# Build the page only when this file is run as the entry script.
if __name__ == "__main__":
    ui()
helper.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import streamlit as st
3
+ from streamlit_chat import message
4
+
5
+ import torch
6
+ import torch.nn
7
+
8
+ import transformers
9
+ from transformers import (
10
+ AutoModelForCausalLM,
11
+ AutoTokenizer,
12
+ BitsAndBytesConfig,
13
+ HfArgumentParser,
14
+ TrainingArguments,
15
+ pipeline,
16
+ logging,
17
+ )
18
+
19
+
20
+ import pandas as pd
21
+ import numpy as np
22
+ import os
23
+ import io
24
+
25
+ from langchain.document_loaders import TextLoader
26
+ from langchain import PromptTemplate
27
+ from langchain.text_splitter import CharacterTextSplitter
28
+ from langchain.document_loaders import PyPDFLoader
29
+ from langchain.embeddings import HuggingFaceEmbeddings
30
+ from langchain.vectorstores import FAISS
31
+ from langchain.chains.question_answering import load_qa_chain
32
+ from langchain.chains import RetrievalQA
33
+ from langchain import HuggingFacePipeline
34
+
35
+
36
def pdf_loader(file_path):
    '''This is a function for loading the PDFs
    Params:
        file_path: The path of the PDF file
    Returns:
        Whatever PyPDFLoader(file_path).load() returns for that file
    '''
    loader = PyPDFLoader(file_path)
    return loader.load()
45
+
46
def splitDoc(loaded_docs):
    '''Cut the loaded document into chunks.
    Params:
        loaded_docs: The loaded document from the pdf_loader function
    Returns:
        The result of CharacterTextSplitter.split_documents on loaded_docs,
        using chunk_size=1000 and chunk_overlap=10
    '''
    return CharacterTextSplitter(
        chunk_overlap=10,
        chunk_size=1000,
    ).split_documents(loaded_docs)
53
+
54
def makeEmbeddings(chunked_docs):
    '''Embed the chunked document and store it in a FAISS vector store.
    Params:
        chunked_docs: The chunked docs
    Returns:
        The FAISS store built from chunked_docs with a default
        HuggingFaceEmbeddings() embedder
    '''
    return FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings())
61
+
62
+
63
def create_flan_t5_base(load_in_8bit=False):
    '''Build a text2text-generation pipeline for google/flan-t5-base.
    Params:
        load_in_8bit: Value placed under the "load_in_8bit" key of model_kwargs
    Returns:
        The transformers pipeline object, ready to be wrapped for LangChain
    '''
    checkpoint = "google/flan-t5-base"
    # NOTE(review): presumably transformers forwards model_kwargs to model
    # loading - confirm that "max_length"/"temperature" take effect there.
    loading_options = {
        "load_in_8bit": load_in_8bit,
        "max_length": 512,
        "temperature": 0.,
    }
    return pipeline(
        task="text2text-generation",
        model=checkpoint,
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        max_new_tokens=100,
        model_kwargs=loading_options,
    )
75
+
76
def conversational_chat(chain, query):
    '''Run one query through the chain and record the turn in the session.

    The input/output keys are chosen from what the chain itself declares:
    a chain whose input_keys contain "query" (e.g. RetrievalQA) is called
    with {"query": ...} and answers under "result"; any other chain is
    called with {"question", "chat_history"} and answers under "answer".

    Params:
        chain: The LangChain chain to call
        query: The user's question
    Returns:
        The answer text produced by the chain
    '''
    # Tolerate being called before the chat UI has seeded the history list.
    if 'history' not in st.session_state:
        st.session_state['history'] = []
    history = st.session_state['history']

    declared_inputs = set(getattr(chain, "input_keys", ()))
    if "query" in declared_inputs:
        # Plain retrieval QA: no chat-history input, answer under "result".
        answer = chain({"query": query})["result"]
    else:
        answer = chain({"question": query, "chat_history": history})["answer"]

    history.append((query, answer))
    return answer
82
+
83
+
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ huggingface_hub
3
+ langchain
4
+ streamlit
5
+ openai
6
+ tiktoken
7
+ faiss-cpu
8
+ streamlit_chat
9
+ transformers
10
+ sentence_transformers
11
+ pypdf