mikepastor11 committed on
Commit
01fc03c
1 Parent(s): 724c4a9

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +288 -0
  3. htmlTemplates.py +44 -0
  4. robot_icon.ico +0 -0
  5. robot_icon.png +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ robot_icon.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ##########################################################################
2
+ # app.py - Pennwick PDF Chat
3
+ #
4
+ # HuggingFace Spaces application to analyze uploaded PDF files
5
+ # with open-source models ( hkunlp/instructor-xl )
6
+ #
7
+ # Mike Pastor February 17, 2024
8
+
9
+
10
+ import streamlit as st
11
+ from streamlit.components.v1 import html
12
+
13
+ from dotenv import load_dotenv
14
+
15
+ from PyPDF2 import PdfReader
16
+
17
+ from PIL import Image
18
+
19
+ # Local file
20
+ from htmlTemplates import css, bot_template, user_template
21
+
22
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings
23
+ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
24
+
25
+ # from langchain.vectorstores import FAISS
26
+ from langchain_community.vectorstores import FAISS
27
+ from langchain.text_splitter import CharacterTextSplitter
28
+ from langchain.memory import ConversationBufferMemory
29
+ from langchain.chains import ConversationalRetrievalChain
30
+
31
+ # from langchain.llms import HuggingFaceHub
32
+ from langchain_community.llms import HuggingFaceHub
33
+
34
##################################################################################
# Admin flags

# Maximum number of chat-history messages rendered in the scrolling results pane.
DISPLAY_DIALOG_LINES = 6

# Module-level flag meant to record that files have been analyzed.
# NOTE(review): only referenced from commented-out code below and assigned
# without a `global` statement in main() — effectively unused; verify intent.
SESSION_STARTED = False
40
+
41
##################################################################################
def extract_pdf_text(pdf_docs):
    """Concatenate the text of every page of every uploaded PDF.

    Args:
        pdf_docs: iterable of file-like objects readable by PyPDF2.PdfReader
            (as returned by Streamlit's file_uploader).

    Returns:
        A single string with all extracted page text appended in order.
    """
    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # extract_text() may yield None/empty for image-only pages;
            # guard so the concatenation never raises TypeError.
            text += page.extract_text() or ""
    return text
49
+
50
+
51
##################################################################################
# Chunk size and overlap must not exceed the models capacity!
#
def extract_bitesize_pieces(text):
    """Split *text* on newlines into chunks of at most 800 characters,
    overlapping by 200, and return them as a list of strings."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,  # 1000
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
63
+
64
+
65
##################################################################################
def prepare_embedding_vectors(text_chunks):
    """Embed each text chunk with the instructor-xl model and load the
    vectors into an in-memory FAISS store.

    Args:
        text_chunks: list of strings produced by the text splitter.

    Returns:
        A FAISS vector store ready to be wrapped as a retriever.
    """
    st.write('Here in vector store....', unsafe_allow_html=True)

    # Requires: pip install InstructorEmbedding sentence-transformers==2.2.2
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")

    st.write('Here in vector store - got embeddings ', unsafe_allow_html=True)
    print('have Embeddings: ')

    # FAISS, Chroma and other vector databases
    #
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    st.write('FAISS succeeds: ')

    return vectorstore
91
+
92
+
93
##################################################################################
def prepare_conversation(vectorstore):
    """Build a retrieval-augmented conversation chain over *vectorstore*.

    Uses the hosted flan-t5-xxl model with a buffer memory so follow-up
    questions carry the chat history as context.
    """
    # Alternatives previously tried: ChatOpenAI, temperature 0.5,
    # google/bigbird-roberta-base, facebook/bart-large.
    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature": 0.7, "max_length": 512})

    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
108
+
109
+
110
##################################################################################
def process_user_question(user_question):
    """Run *user_question* through the conversation chain and render the
    trailing chat history in a scrolling HTML pane.

    Returns silently for a missing/blank question; shows an error if no
    documents have been analyzed yet (no conversation chain in session).
    """
    print('process_user_question called: \n')

    # Nothing to do without a real question (covers both None and "").
    if not user_question:
        print('question is missing or blank')
        return

    print('question is: ', user_question)

    # The chain only exists after the user has uploaded and analyzed files.
    if st.session_state.conversation is None:
        st.error("Please upload and analyze your PDF files first!")
        return

    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    results_size = len(response['chat_history'])

    print('results_size is: ', results_size)

    results_string = ""
    for i, message in enumerate(st.session_state.chat_history):
        # Scrolling does not display the last printed line,
        # so only render the last DISPLAY_DIALOG_LINES messages.
        print('results_size on msg: ', results_size, i, (results_size - DISPLAY_DIALOG_LINES))
        if results_size > DISPLAY_DIALOG_LINES and i < (results_size - DISPLAY_DIALOG_LINES):
            continue

        if i % 2 == 0:
            # Even entries are the user's questions.
            results_string += ("<p>" + message.content + "</p>")
        else:
            # Odd entries are the bot's answers.
            results_string += ("<p>" + "-- " + message.content + "</p>")

    html(results_string, height=300, scrolling=True)
188
+
189
+
190
###################################################################################
def main():
    """Streamlit entry point: configure the page, accept a question, and
    provide the sidebar upload/analyze workflow that builds the chain."""
    # Without this, the assignment below would only create an unused local
    # and the module-level flag would never flip.
    global SESSION_STARTED

    print('Pennwick Starting up...\n')
    # Load the environment variables - if any
    load_dotenv()

    st.set_page_config(page_title="Pennwick File Analyzer", page_icon="./robot_icon.ico")
    print('prepared page...\n')

    st.write(css, unsafe_allow_html=True)

    # Initialize per-session state on first run.
    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.image("robot_icon.png", width=96)
    st.header("Pennwick File Analyzer")

    user_question = st.text_input("Ask the Open Source - Flan-T5 Model a question about your uploaded documents:")
    # text_input returns "" (never None) until the user submits something,
    # so only dispatch on a non-empty question.
    if user_question:
        print('calling process question', user_question)
        process_user_question(user_question)

    with st.sidebar:

        st.subheader("Which documents would you like to analyze?")
        st.subheader("(no data is saved beyond the session)")

        pdf_docs = st.file_uploader(
            "Upload your PDF documents here and click on 'Analyze'", accept_multiple_files=True)

        # Upon button press
        if st.button("Analyze these files"):
            with st.spinner("Processing..."):
                #################################################################
                # Track the overall time for file processing into Vectors
                #
                from datetime import datetime
                global_now = datetime.now()
                global_current_time = global_now.strftime("%H:%M:%S")
                st.write("Vectorizing Files - Current Time =", global_current_time)

                # PDF bytes -> raw text -> chunks -> vectors -> conversation chain
                raw_text = extract_pdf_text(pdf_docs)
                text_chunks = extract_bitesize_pieces(raw_text)
                vectorstore = prepare_embedding_vectors(text_chunks)
                st.session_state.conversation = prepare_conversation(vectorstore)

                SESSION_STARTED = True

                # Mission Complete!
                global_later = datetime.now()
                st.write("Files Vectorized - Total EXECUTION Time =",
                         (global_later - global_now), global_later)


if __name__ == '__main__':
    main()
287
+
288
+
htmlTemplates.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Shared HTML/CSS snippets for rendering the chat transcript in Streamlit.
# The {{MSG}} placeholder in the templates is replaced with message text
# by the caller before the snippet is written with unsafe_allow_html.

# Chat-bubble styling: dark background for user messages, lighter for bot
# messages, with a round avatar image beside each one.
# Fix: the original string opened <style> but never closed it.
css = '''
<style>
.chat-message {
    padding: 1.5rem; border-radius: 0.5rem; margin-bottom: 1rem; display: flex
}
.chat-message.user {
    background-color: #2b313e
}
.chat-message.bot {
    background-color: #475063
}
.chat-message .avatar {
  width: 20%;
}
.chat-message .avatar img {
  max-width: 78px;
  max-height: 78px;
  border-radius: 50%;
  object-fit: cover;
}
.chat-message .message {
  width: 80%;
  padding: 0 1.5rem;
  color: #fff;
}
</style>
'''

# Bot-message bubble; avatar is a hot-linked placeholder image.
bot_template = '''
<div class="chat-message bot">
    <div class="avatar">
        <img src="https://free-images.com/sm/9cb8/sunset_sundown_da_nang.jpg" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;">
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''

# User-message bubble; avatar is a hot-linked placeholder image.
user_template = '''
<div class="chat-message user">
    <div class="avatar">
        <img src="https://free-images.com/sm/176d/squirrel_tail_bushy_tail.jpg" style="max-height: 78px; max-width: 78px; border-radius: 50%; object-fit: cover;" >
    </div>
    <div class="message">{{MSG}}</div>
</div>
'''
robot_icon.ico ADDED
robot_icon.png ADDED

Git LFS Details

  • SHA256: 1654b1bf0223c8a88585dcc0d1266692c4aaab3706e91c29f430560b36c21c90
  • Pointer size: 132 Bytes
  • Size of remote file: 1.61 MB