dosanity committed on
Commit
3ff62e6
1 Parent(s): b25b3d2

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +131 -0
  2. pdf_reader.py +121 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pdf_reader import *
3
+
4
# PDF Chat Bot -- Streamlit front end.
#
# Streamlit re-executes this script from top to bottom on every user
# interaction, so anything that has to survive a rerun (the summary, the chat
# chain, the running history) is kept in ``st.session_state``.

import html  # stdlib; used to escape text before it is rendered as raw HTML

# Default values for every session variable the script relies on.
_SESSION_DEFAULTS = {
    'API_Key': '',
    'Pinecone_API_Key': '',
    'summary': '',
    'history': {},
    'chat': '',
    'counter': 1,
}
for _name, _default in _SESSION_DEFAULTS.items():
    if _name not in st.session_state:
        st.session_state[_name] = _default

_RULE_HTML = '<hr style="margin: -10px 0; border-top: 1px solid black;">'

_TABLE_HEAD = """
<style>
table, tr {border:hidden;}
table, td {border:hidden;}
</style>
<table><tr><th><strong></strong></th><th></th></tr>
"""


def _section_header(title):
    """Render a spaced, bold section title followed by a thin rule."""
    st.markdown('<br>', unsafe_allow_html=True)
    st.markdown(f"#### **{title}**")
    st.markdown(_RULE_HTML, unsafe_allow_html=True)


def _render_table(rows):
    """Render ``(label, text)`` pairs as a borderless two-column HTML table.

    Both columns are HTML-escaped: the text comes from the user and from the
    model, and the table is emitted with ``unsafe_allow_html=True``, so
    unescaped markup would be injected straight into the page.
    """
    parts = [_TABLE_HEAD]
    for label, text in rows:
        parts.append(
            "<tr><td style='width: 90px;'>"
            f"<strong>{html.escape(str(label))}:</strong></td>"
            f"<td>{html.escape(str(text))}</td></tr>"
        )
    parts.append("</table>")
    st.markdown("".join(parts), unsafe_allow_html=True)


def _strip_turn_prefix(history_key):
    """Drop the leading ``"[n] "`` turn counter from a history key.

    Splitting on the closing bracket works for any number of digits, unlike
    slicing off a fixed four characters.
    """
    return history_key.split("] ", 1)[-1]


st.title('PDF Chat Bot')

# ******** Sidebar: API key, file upload and the UPLOAD button ********

st.session_state['API_Key'] = st.sidebar.text_input("What's your OPENAI API key?", type="password")
uploaded_file = st.sidebar.file_uploader("Choose a PDF file", type=["pdf"])
load_button = st.sidebar.button("UPLOAD", key="load_button")

api_key = st.session_state['API_Key']
has_key = api_key != ''
has_file = uploaded_file is not None

# When UPLOAD is clicked: store the PDF on disk, summarise it and build the
# conversational retrieval chain. A fresh upload starts a fresh history.
if load_button:
    if has_key and has_file:
        file = save_pdf(uploaded_file)
        st.session_state.summary = load_db_sum(file, api_key)
        st.session_state.chat = load_db(file, api_key)
        st.session_state.history = {}
    elif not has_key:
        st.sidebar.error("Please enter your OpenAI API key.")
    else:
        st.sidebar.error("Please attach a PDF file.")

# ******** Main page: summary, question box, conversation and history ********

if has_key and has_file:
    _section_header("Summary")
    st.write(st.session_state.summary)

    # Holds only the exchange produced by the current run (empty otherwise).
    conversation = {}

    user_input = st.text_input('Ask about the PDF', key="prompt")
    submit = st.button("SUBMIT")

    if submit:
        # An empty summary means UPLOAD has not been processed yet, so there
        # is no chat chain to query.
        if st.session_state.summary == '':
            st.error("Please upload the PDF file.")
        else:
            result = st.session_state.chat({"question": user_input})
            answer_text = str(result['answer'])
            question_text = str(result['question'])
            user = "User"
            chatbot = "Chat Bot"
            conversation.update({user: question_text, chatbot: answer_text})

            # History keys carry the turn counter so that every turn gets a
            # unique key in the dict; the prefix is stripped for display.
            turn = st.session_state.counter
            st.session_state.history.update({
                f"[{turn}] {user}": question_text,
                f"[{turn}] {chatbot}": answer_text,
            })
            st.session_state.counter += 1

    _section_header("Conversation")
    _render_table(conversation.items())

    _section_header("Chat History")
    _render_table(
        (_strip_turn_prefix(key), value)
        for key, value in st.session_state.history.items()
    )

elif not has_key:
    st.error("Please enter your OpenAI API key.")
else:
    # No file attached: clear any stale summary from a previous upload.
    st.session_state.summary = ''
    st.error("Please upload the PDF file.")
129
+
130
+
131
+
pdf_reader.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Imports
2
+ from langchain.embeddings.openai import OpenAIEmbeddings
3
+ from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
4
+ from langchain.vectorstores import DocArrayInMemorySearch
5
+ from langchain.chains.summarize import load_summarize_chain
6
+ from langchain.chains import RetrievalQA, ConversationalRetrievalChain
7
+ from langchain.memory import ConversationBufferMemory
8
+ from langchain.chat_models import ChatOpenAI
9
+ from langchain.document_loaders import TextLoader
10
+ from langchain.document_loaders import PyPDFLoader
11
+ from langchain.prompts import PromptTemplate
12
+ from langchain.llms import OpenAI
13
+ import tiktoken
14
+ import os
15
+ import sys
16
+ sys.path.append('../..')
17
+
18
import datetime

# Pick the chat model by date: the pinned "-0301" snapshot is used only
# before 2 September 2023; from that day on the generic model name is used.
current_date = datetime.date.today()
_SNAPSHOT_CUTOFF = datetime.date(2023, 9, 2)
llm_name = "gpt-3.5-turbo-0301" if current_date < _SNAPSHOT_CUTOFF else "gpt-3.5-turbo"
24
+
25
def load_db(file, api_key):
    """Build a conversational question-answering chain over one PDF.

    The PDF is loaded, split into overlapping chunks, embedded into an
    in-memory vector store and wrapped in a ``ConversationalRetrievalChain``
    that keeps its own chat-history buffer.

    Args:
        file: Path of the PDF to index.
        api_key: OpenAI API key; it is exported as ``OPENAI_API_KEY`` so the
            embedding and chat clients created here can pick it up.

    Returns:
        The chain object; call it with ``{"question": ...}``.
    """
    os.environ['OPENAI_API_KEY'] = api_key

    # Load the PDF and cut it into 1000-character chunks with 150 of overlap.
    pages = PyPDFLoader(file).load()
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = splitter.split_documents(pages)

    # Embed the chunks into an in-memory vector store.
    vector_store = DocArrayInMemorySearch.from_documents(chunks, OpenAIEmbeddings())

    # Prompt used by the "stuff" step that answers from the retrieved chunks.
    prompt_template_doc = """

    Use the following pieces of context to answer the question at the end. {context}
    You can also look into chat history. {chat_history}
    If you still can't find the answer, please respond: "Please ask a question related to the document."

    Question: {question}
    Answer:
    """
    answer_prompt = PromptTemplate(
        template=prompt_template_doc,
        input_variables=["context", "question", "chat_history"],
    )

    # Conversation buffer; the keys match the prompt variable and chain output.
    chat_memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        return_messages=True,
    )

    # Similarity search returning the four closest chunks per question.
    top_k_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 4},
    )

    chat_model = ChatOpenAI(model_name=llm_name, temperature=0)
    return ConversationalRetrievalChain.from_llm(
        llm=chat_model,
        chain_type="stuff",
        retriever=top_k_retriever,
        combine_docs_chain_kwargs={"prompt": answer_prompt},
        memory=chat_memory,
    )
71
+
72
def load_db_sum(file, api_key):
    """Return a text summary of the PDF at ``file``.

    The document is summarised with a single "stuff" call when its estimated
    token count is below the model limit, and with a "map_reduce" chain
    otherwise.

    Unlike the earlier version, no embeddings, vector store, retriever or
    memory are built here: none of them were used by the summary, and
    building the vector store meant paying for and waiting on embedding
    requests whose result was thrown away.

    Args:
        file: Path of the PDF to summarise.
        api_key: OpenAI API key; exported as ``OPENAI_API_KEY`` and also
            passed explicitly to the completion model.

    Returns:
        The summary produced by the chain's ``run`` call.
    """
    os.environ['OPENAI_API_KEY'] = api_key

    documents = PyPDFLoader(file).load()

    # Estimate the size of the document in tokens. The estimate is taken over
    # the string form of the loaded documents, using the tokenizer tiktoken
    # associates with ``llm_name``.
    # NOTE(review): the summary itself runs on "text-davinci-003", so this is
    # an approximation rather than an exact count for that model -- confirm
    # whether the threshold should be tightened.
    encoding = tiktoken.encoding_for_model(llm_name)
    num_tokens = len(encoding.encode(str(documents)))
    model_max_tokens = 4097

    # Small documents fit in one prompt; larger ones are summarised piecewise
    # and the partial summaries are then combined.
    chain_type = "stuff" if num_tokens < model_max_tokens else "map_reduce"

    summariser = OpenAI(temperature=0, model="text-davinci-003", openai_api_key=api_key)
    chain = load_summarize_chain(llm=summariser, chain_type=chain_type)
    return chain.run(documents)
116
+
117
def save_pdf(pdf_file):
    """Write an uploaded PDF to ``uploaded.pdf`` in the working directory.

    Args:
        pdf_file: Object exposing ``getvalue()`` that returns the PDF bytes.

    Returns:
        The path the bytes were written to, always ``"uploaded.pdf"``.
    """
    destination = "uploaded.pdf"
    with open(destination, "wb") as handle:
        handle.write(pdf_file.getvalue())
    return destination
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ tiktoken
3
+ os
4
+ sys
5
+ streamlit