Amir committed on
Commit
e52f0c4
1 Parent(s): ab48947
Files changed (2) hide show
  1. app.py +149 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from dotenv import load_dotenv
3
+ from PyPDF2 import PdfReader
4
+ from langchain.text_splitter import CharacterTextSplitter
5
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
6
+ from langchain.vectorstores import FAISS
7
+ from langchain.chat_models import ChatOpenAI
8
+ from langchain.memory import ConversationBufferMemory
9
+ from langchain.chains import ConversationalRetrievalChain
10
+ from langchain.llms import HuggingFaceHub
11
+ import os
12
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
13
+
14
# Stylesheet for the chat bubbles; main() injects it into the page with
# st.write(css, unsafe_allow_html=True). The <style> element is explicitly
# closed so that markup written after it is not parsed as CSS.
css = '''
<style>
.chat-message {
padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
background-color: #2b313e
}
.chat-message.bot {
background-color: #475063
}
.chat-message .avatar {
width: 20%;
}
.chat-message .avatar img {
max-width: 78px;
max-height: 78px;
border-radius: 50%;
object-fit: cover;
}
.chat-message .message {
width: 80%;
padding: 0 1.5rem;
color: #fff;
}
</style>
'''
40
+
41
# HTML skeleton for one assistant message. The {{MSG}} placeholder is filled
# in with str.replace() by handle_userinput() before rendering.
bot_template = (
    "\n"
    '<div class="chat-message bot">\n'
    '<div class="avatar">\n'
    '<img src="https://i.ibb.co/cN0nmSj/Screenshot-2023-05-28-at-02-37-21.png" '
    'style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">\n'
    "</div>\n"
    '<div class="message">{{MSG}}</div>\n'
    "</div>\n"
)
49
+
50
# HTML skeleton for one user message. The {{MSG}} placeholder is filled in
# with str.replace() by handle_userinput() before rendering.
user_template = (
    "\n"
    '<div class="chat-message user">\n'
    '<div class="avatar">\n'
    '<img src="https://i.ibb.co/rdZC7LZ/Photo-logo-1.png">\n'
    "</div>\n"
    '<div class="message">{{MSG}}</div>\n'
    "</div>\n"
)
58
+
59
# SECURITY: a Hugging Face API token used to be hard-coded on this line.
# Secrets must never be committed to source control: revoke the leaked token
# and provide a fresh one as HUGGINGFACEHUB_API_TOKEN through the process
# environment or an untracked .env file.
load_dotenv()  # reads .env if present; variables that are already set win
60
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (main()
            passes the result of ``st.file_uploader``). ``None`` or an empty
            iterable yields an empty string.

    Returns:
        One string holding all page texts in document order, then page
        order. No separator is inserted between pages.
    """
    return "".join(
        # Treat a missing extraction result (None) like an empty page so a
        # page without a text layer cannot break the concatenation.
        page.extract_text() or ""
        for pdf in pdf_docs or ()
        for page in PdfReader(pdf).pages
    )
67
+
68
+
69
def get_text_chunks(text):
    """Split ``text`` into chunks with a newline-based ``CharacterTextSplitter``.

    The splitter is configured with ``chunk_size=1000`` and
    ``chunk_overlap=200``, with lengths measured by ``len``.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
78
+
79
+
80
def get_vectorstore(text_chunks):
    """Embed ``text_chunks`` and index them in a FAISS vector store.

    Args:
        text_chunks: The texts to embed and index.

    Returns:
        The store created by ``FAISS.from_texts``.
    """
    # NOTE(review): an e5 checkpoint is loaded through the Instructor
    # embedding wrapper here - confirm this pairing is intended.
    embedder = HuggingFaceInstructEmbeddings(
        model_name="intfloat/multilingual-e5-base"
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)
84
+
85
+
86
def get_conversation_chain(vectorstore, *, model_name="sberbank-ai/mGPT",
                           max_new_tokens=256):
    """Build a conversational retrieval chain backed by a local causal LM.

    LangChain chains need one of LangChain's LLM wrappers, not a bare
    ``transformers`` model. The model and its tokenizer are therefore put
    into a text-generation pipeline and exposed through
    ``HuggingFacePipeline``.

    Args:
        vectorstore: Store whose ``as_retriever()`` supplies the documents
            used to answer questions.
        model_name: Checkpoint loaded with ``GPT2LMHeadModel`` and
            ``GPT2Tokenizer``.
        max_new_tokens: Upper bound on generated tokens per answer.

    Returns:
        A ``ConversationalRetrievalChain`` that keeps the dialogue in a
        ``ConversationBufferMemory`` under the ``chat_history`` key.
    """
    # Imported locally because these names are not in the module's import
    # block; both packages are already dependencies of this file.
    from langchain.llms import HuggingFacePipeline
    from transformers import pipeline

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )
    llm = HuggingFacePipeline(pipeline=generator)

    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
97
+
98
+
99
def handle_userinput(user_question):
    """Send a question to the conversation chain and render the chat history.

    Args:
        user_question: The text the user typed into the question box.

    Side effects:
        Stores the chain's ``chat_history`` in ``st.session_state`` and
        writes one HTML bubble per message to the page. If no chain has been
        built yet, shows a warning and does nothing else.
    """
    import html  # stdlib; imported locally as it is not in the module imports

    conversation = st.session_state.conversation
    if conversation is None:
        # main() initialises the chain to None until "Process" has run;
        # calling None would raise a TypeError.
        st.warning("Please upload and process your documents first.")
        return

    response = conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']

    # The history alternates user / bot messages, starting with the user.
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        # Message text comes from the user and from the model, and is rendered
        # with unsafe_allow_html=True, so it must be escaped before being
        # spliced into the markup.
        st.write(
            template.replace("{{MSG}}", html.escape(message.content)),
            unsafe_allow_html=True,
        )
110
+
111
+
112
def main():
    """Run the Streamlit page: question box, PDF upload sidebar, processing.

    Loads environment variables, injects the chat stylesheet, answers the
    current question (if any) and, when "Process" is clicked, turns the
    uploaded PDFs into a vector store and a conversation chain stored in
    ``st.session_state.conversation``.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with multiple PDFs",
                       page_icon=":books:")
    st.write(css, unsafe_allow_html=True)

    # Streamlit re-executes the script on every interaction, so session
    # state is only seeded when the keys are missing.
    for key in ("conversation", "chat_history"):
        if key not in st.session_state:
            st.session_state[key] = None

    st.header("Chat with multiple PDFs :books:")
    user_question = st.text_input("Ask a question about your documents:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        st.subheader("Your documents")
        pdf_docs = st.file_uploader(
            "Upload your PDFs here and click on 'Process'",
            accept_multiple_files=True)
        if st.button("Process"):
            if not pdf_docs:
                # Nothing uploaded: there is nothing to index.
                st.warning("Please upload at least one PDF before processing.")
                return

            with st.spinner("Processing"):
                # get pdf text
                raw_text = get_pdf_text(pdf_docs)

                # get the text chunks
                text_chunks = get_text_chunks(raw_text)
                if not text_chunks:
                    # e.g. scanned PDFs without a text layer; an index
                    # cannot be built from zero chunks.
                    st.error(
                        "No extractable text was found in the uploaded files.")
                    return

                # create vector store
                vectorstore = get_vectorstore(text_chunks)

                # create conversation chain
                st.session_state.conversation = get_conversation_chain(
                    vectorstore)
146
+
147
+
148
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain==0.0.184
2
+ PyPDF2==3.0.1
3
+ python-dotenv==1.0.0
4
+ streamlit==1.18.1
5
+ openai==0.27.6
6
+ faiss-cpu==1.7.4
7
+ altair==4
8
+ tiktoken==0.4.0
9
+ # required by langchain's HuggingFaceHub LLM integration
10
+ huggingface-hub==0.14.1
11
+
12
+ # required by langchain's HuggingFaceInstructEmbeddings
13
+ InstructorEmbedding==1.0.1
14
+ sentence-transformers==2.2.2