Paul-Joshi commited on
Commit
a00136a
·
verified ·
1 Parent(s): 6b34715

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from css_template import css, bot_template, user_template
11
+ from langchain.llms import HuggingFaceHub
12
+ import os
13
+ # os.environ['FAISS_NO_AVX2'] = '1'
14
+
15
def method_get_pdf_text(pdf_docs):
    """Extract and concatenate the text of every page in the uploaded PDFs.

    Args:
        pdf_docs: iterable of file-like objects (e.g. Streamlit UploadedFile)
            that PyPDF2's ``PdfReader`` can open.

    Returns:
        str: all extracted page text concatenated; empty string when there
        are no documents or no extractable text.
    """
    pages = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() returns None for pages with no extractable text
            # (e.g. scanned images); guard so concatenation never sees None.
            pages.append(page.extract_text() or "")
    # join() avoids quadratic repeated string concatenation.
    return "".join(pages)
22
+
23
+
24
def method_get_text_chunks(text):
    """Split raw document text into overlapping chunks for embedding.

    Args:
        text: the full extracted document text.

    Returns:
        list[str]: chunks of at most 1000 characters, overlapping by 200,
        split on blank lines.
    """
    splitter = CharacterTextSplitter(
        separator="\n\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    return splitter.split_text(text)
34
+
35
+
36
def method_get_vectorstore(text_chunks):
    """Embed the text chunks and index them in an in-memory FAISS store.

    Args:
        text_chunks: list of text chunks to embed.

    Returns:
        A FAISS vector store built over the chunk embeddings.
    """
    # Free local alternative to OpenAIEmbeddings (the paid option).
    instruct_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    return FAISS.from_texts(texts=text_chunks, embedding=instruct_embeddings)
41
+
42
+
43
def method_get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain over *vectorstore*.

    Uses a hosted flan-t5-xxl model via HuggingFaceHub (free alternative
    to ChatOpenAI) and a buffer memory keyed to 'chat_history'.

    Args:
        vectorstore: a vector store exposing ``as_retriever()``.

    Returns:
        A ConversationalRetrievalChain wired to the LLM, retriever and memory.
    """
    hub_llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    chat_memory = ConversationBufferMemory(
        memory_key='chat_history',
        return_messages=True,
    )
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=vectorstore.as_retriever(),
        memory=chat_memory,
    )
54
+
55
+
56
def method_handle_userinput(user_question):
    """Send *user_question* through the conversation chain and render history.

    Reads ``st.session_state.conversation`` (set after documents have been
    processed) and stores the updated history in
    ``st.session_state.chat_history``.

    Args:
        user_question: the question typed by the user.
    """
    # Guard: the chain only exists after documents were submitted; without
    # this, asking a question first raises TypeError (None is not callable).
    if st.session_state.conversation is None:
        st.warning("Please upload and submit your documents first.")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # The history alternates user/bot messages, starting with the user.
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content),
                 unsafe_allow_html=True)
67
+
68
+
69
def main():
    """Entry point: render the Streamlit UI for chatting with multiple PDFs."""
    # Pick up API keys (e.g. HUGGINGFACEHUB_API_TOKEN / OPENAI_API_KEY).
    load_dotenv()
    st.set_page_config(page_title="Converse with multiple PDFs", page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Initialise session state so Streamlit reruns don't lose the chain/history.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Converse with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        method_handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Documents Upload")
        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Submit'", accept_multiple_files=True)
        if st.button("Submit"):
            # Don't build an empty index when nothing was uploaded.
            if not pdf_docs:
                st.warning("Please upload at least one PDF before submitting.")
            else:
                with st.spinner("Processing"):
                    # get pdf text
                    raw_text = method_get_pdf_text(pdf_docs)
                    # get the text chunks
                    text_chunks = method_get_text_chunks(raw_text)
                    # create vector store (stray debug dump of the chunks removed)
                    vectorstore = method_get_vectorstore(text_chunks)
                    # create conversation chain, kept in session state across reruns
                    st.session_state.conversation = method_get_conversation_chain(vectorstore)
98
+
99
+
100
+
101
# Script entry point: run the Streamlit app only when executed directly.
if __name__ == '__main__':
    main()