mikepastor11 commited on
Commit
176fa1e
1 Parent(s): ecc91c5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -204
app.py CHANGED
@@ -4,7 +4,7 @@
4
  # HuggingFace Spaces application to provide honeybee expertise
5
  # with open-source models
6
  #
7
- # Mike Pastor February 22, 2024
8
 
9
 
10
  import streamlit as st
@@ -13,20 +13,11 @@ from streamlit.components.v1 import html
13
  from PyPDF2 import PdfReader
14
  from PIL import Image
15
 
 
 
16
  # Local file
17
  from htmlTemplates import css, bot_template, user_template
18
 
19
- # from langchain.embeddings import HuggingFaceInstructEmbeddings
20
- from langchain_community.embeddings import HuggingFaceInstructEmbeddings
21
-
22
- # from langchain.vectorstores import FAISS
23
- from langchain_community.vectorstores import FAISS
24
- from langchain.text_splitter import CharacterTextSplitter
25
- from langchain.memory import ConversationBufferMemory
26
- from langchain.chains import ConversationalRetrievalChain
27
-
28
- # from langchain.llms import HuggingFaceHub
29
- from langchain_community.llms import HuggingFaceHub
30
 
31
  ##################################################################################
32
  # Admin flags
@@ -35,77 +26,20 @@ DISPLAY_DIALOG_LINES = 6
35
  SESSION_STARTED = False
36
 
37
  # MODEL_NAME="deepset/roberta-base-squad2"
38
- #MODEL_NAME="BEE-spoke-data/TinyLlama-3T-1.1bee"
39
 
40
- MODEL_NAME='HuggingFaceH4/zephyr-7b-beta'
41
 
42
##################################################################################
def extract_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every uploaded PDF."""
    collected = []
    for document in pdf_docs:
        reader = PdfReader(document)
        for page in reader.pages:
            collected.append(page.extract_text())
    return "".join(collected)
50
-
51
-
52
##################################################################################
# Chunk size and overlap must not exceed the model's capacity!
#
def extract_bitesize_pieces(text):
    """Split raw text into overlapping, newline-separated chunks for the model."""
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=800,  # 1000
        chunk_overlap=200,
        length_function=len,
    )
    return splitter.split_text(text)
64
 
 
65
 
66
##################################################################################
def prepare_embedding_vectors(text_chunks):
    """Embed the text chunks and index them in an in-memory FAISS vector store."""
    st.write('Here in vector store....', unsafe_allow_html=True)

    # pip install InstructorEmbedding
    # pip install sentence-transformers==2.2.2
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    st.write('Here in vector store - got embeddings ', unsafe_allow_html=True)
    print('have Embeddings: ')

    # FAISS, Chroma and other vector databases
    #
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    st.write('FAISS succeeds: ')

    return vectorstore
92
 
93
 
94
##################################################################################
def prepare_conversation(vectorstore):
    """Build a conversational retrieval chain (LLM + memory) over the store."""
    # google/bigbird-roberta-base facebook/bart-large
    llm = HuggingFaceHub(repo_id="google/flan-t5-xxl",
                         model_kwargs={"temperature": 0.7, "max_length": 512})

    memory = ConversationBufferMemory(memory_key='chat_history',
                                      return_messages=True)

    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory,
    )
109
 
110
 
111
  ##################################################################################
@@ -133,58 +67,6 @@ def process_user_question(user_question):
133
  print('question is: ', user_question)
134
  print('\nsession is: ', st)
135
 
136
- # try:
137
- # response = st.session_state.conversation({'question': user_question})
138
- # # response = st.session_state.conversation({'summarization': user_question})
139
- # st.session_state.chat_history = response['chat_history']
140
- # Exception:
141
- # st.write( 'Please upload and analyze your PDF files first!')
142
- # return
143
-
144
- # st.empty()
145
-
146
- # try:
147
- # st.session_state.conversation({'question': "Summarize the document"})
148
- # # if "key" not in st.session_state:
149
- # # st.write('Good')
150
- # except:
151
- # st.error("Please upload and analyze your PDF files first!")
152
- # return
153
-
154
- # if st.session_state.conversation == None:
155
- # st.error("Please upload and analyze your PDF files first!")
156
- # return
157
-
158
- #
159
- # response = st.session_state.conversation({'question': user_question})
160
- # st.session_state.chat_history = response['chat_history']
161
- # results_size = len(response['chat_history'])
162
- #
163
- # results_string = ""
164
- #
165
- # print('results_size is: ', results_size)
166
- #
167
- # for i, message in enumerate(st.session_state.chat_history):
168
- #
169
- # # Scrolling does not display the last printed line,
170
- # # so only print the last 6 lines
171
- # #
172
- # print('results_size on msg: ', results_size, i, (results_size - DISPLAY_DIALOG_LINES))
173
- # if results_size > DISPLAY_DIALOG_LINES:
174
- # if i < (results_size - DISPLAY_DIALOG_LINES):
175
- # continue
176
- #
177
- # if i % 2 == 0:
178
- # # st.write(user_template.replace(
179
- # # "{{MSG}}", message.content), unsafe_allow_html=True)
180
- #
181
- # results_string += ("<p>" + message.content + "</p>")
182
- #
183
- # else:
184
- # # st.write(bot_template.replace(
185
- # # "{{MSG}}", message.content), unsafe_allow_html=True)
186
- #
187
- # results_string += ("<p>" + "-- " + message.content + "</p>")
188
 
189
 
190
  #################################################################
@@ -194,106 +76,66 @@ def process_user_question(user_question):
194
  global_now = datetime.now()
195
  global_current_time = global_now.strftime("%H:%M:%S")
196
  print("# app.py Starting up... - Current Time =", global_current_time)
197
- st.write(('Question: ' + user_question + ' | ' + str( global_current_time )), unsafe_allow_html=True)
198
 
199
- from transformers import pipeline
200
 
201
- pipe = pipeline("conversational", "HuggingFaceH4/zephyr-7b-beta")
202
 
203
- # "Question: which technology is most recent? A)Cell phones B)Television C)Airplane Answer: "
204
- messages = [
205
- {
206
- "role": "system",
207
- "content": "You are a friendly chatbot who always responds in the style of a personal assistant",
208
- },
209
- {"role": "user",
210
- "content": user_question },
211
- ]
212
 
213
- st.write('Sending message to Model-> ', messages )
214
- results_string = pipe(messages)
 
 
215
 
216
- # st.write('results type= ', type( results_string) )
217
- st.write('results= ', results_string)
218
- # st.write('results ans= ', results_string[0])
219
 
220
- # Mission Complete!
221
- ##################################################################################
222
- global_later = datetime.now()
223
- st.write("# Total EXECUTION Time =", (global_later - global_now), global_later)
224
 
225
- #
226
- # # Choose a question answering pipeline (e.g., 'question-answering')
227
- # nlp = pipeline("question-answering")
228
- #
229
- # # Specify the model name or identifier (e.g., 'deepset/roberta-base-squad2')
230
- # model_name = MODEL_NAME
231
- #
232
- # # Prepare the question and context (optional)
233
- # # question = "What is the capital of France?"
234
- # # context = "France is a country located in Western Europe. It is bordered by the Atlantic Ocean to the west, the Mediterranean Sea to the south, and Belgium, Luxembourg, Germany, Switzerland, Italy, and Spain to the east and north."
235
- #
236
- # context = "You are an expert Apiarist and answer all questions regarding Honeybees."
237
- # # context = " "
238
- #
239
- # # Ask the question
240
- # answer = nlp(question= ('Question: '+user_question), context=context, model=model_name)
241
- #
242
- # # Print the answer
243
- # print(f"Answer: {answer['answer']}")
244
- # print(f"Score: {answer['score']}")
245
- #
246
- # st.write( ('Answer= '+answer['answer']), unsafe_allow_html=True)
247
- #
248
- # results_string = answer['answer'] + ' - Probability= ' + str( answer['score'] )
249
 
250
- # html(results_string, height=300, scrolling=True)
 
251
 
 
252
 
253
- ###################################################################################
254
- def main():
255
- print('Pennwick Starting up...\n')
256
- # Load the environment variables - if any
257
- # load_dotenv()
258
 
 
259
  ##################################################################################
260
- # st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon=":books:")
261
- # im = Image.open("robot_icon.ico")
262
- # st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon=im )
263
- # st.set_page_config(page_title="Pennwick PDF Analyzer")
264
 
265
- # import base64
266
- # from PIL import Image
267
 
268
- # # Open your image
269
- # image = Image.open("robot_icon.ico")
270
 
271
- # # Convert image to base64 string
272
- # with open("robot_icon.ico", "rb") as f:
273
- # encoded_string = base64.b64encode(f.read()).decode()
274
 
275
- # # Set page config with base64 string
276
- # st.set_page_config(page_title="Pennwick File Analyzer 2", page_icon=f"data:image/ico;base64,{encoded_string}")
 
 
 
 
277
 
278
- st.set_page_config(page_title="Pennwick Honeybee Robot", page_icon="./HoneybeeLogo.ico")
279
 
280
- print('prepared page...\n')
 
281
 
282
- ###################
 
283
 
284
  st.write(css, unsafe_allow_html=True)
285
 
286
- if "conversation" not in st.session_state:
287
- st.session_state.conversation = None
288
- if "chat_history" not in st.session_state:
289
- st.session_state.chat_history = None
290
 
291
- # st.header("Pennwick File Analyzer :shark:")
292
- # st.header("Pennwick File Analyzer 2")
293
 
294
- # st.image("robot_icon.png", width=96)
295
- st.image("./HoneybeeLogo.png", width=96)
296
- st.header(f"Pennwick Honeybee Robot")
297
 
298
  user_question = None
299
  user_question = st.text_input("Ask the Open Source - "+MODEL_NAME+" - Model any question about Honeybees...")
@@ -301,6 +143,13 @@ def main():
301
  print('calling process question', user_question)
302
  process_user_question(user_question)
303
 
 
 
 
 
 
 
 
304
  # st.write( user_template, unsafe_allow_html=True)
305
  # st.write(user_template.replace( "{{MSG}}", "Hello robot!"), unsafe_allow_html=True)
306
  # st.write(bot_template.replace( "{{MSG}}", "Hello human!"), unsafe_allow_html=True)
 
4
  # HuggingFace Spaces application to provide honeybee expertise
5
  # with open-source models
6
  #
7
+ # Mike Pastor February 23, 2024
8
 
9
 
10
  import streamlit as st
 
13
  from PyPDF2 import PdfReader
14
  from PIL import Image
15
 
16
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
17
+
18
  # Local file
19
  from htmlTemplates import css, bot_template, user_template
20
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  ##################################################################################
23
  # Admin flags
 
26
  SESSION_STARTED = False
27
 
28
# MODEL_NAME="deepset/roberta-base-squad2"
# MODEL_NAME="BEE-spoke-data/TinyLlama-3T-1.1bee"
# MODEL_NAME='HuggingFaceH4/zephyr-7b-beta'

##############################################################
# Our model and tokenizer
#
MODEL_NAME = "facebook/blenderbot-400M-distill"


@st.cache_resource
def _load_model_and_tokenizer(model_name):
    """Load the seq2seq model and its tokenizer exactly once per process.

    Streamlit re-executes the whole script on every user interaction, so a
    bare module-level `from_pretrained` would re-instantiate the (large)
    model on each rerun; `st.cache_resource` memoizes it across reruns.
    """
    return (AutoModelForSeq2SeqLM.from_pretrained(model_name),
            AutoTokenizer.from_pretrained(model_name))


model, tokenizer = _load_model_and_tokenizer(MODEL_NAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  ##################################################################################
 
67
  print('question is: ', user_question)
68
  print('\nsession is: ', st)
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  #################################################################
 
76
  global_now = datetime.now()
77
  global_current_time = global_now.strftime("%H:%M:%S")
78
  print("# app.py Starting up... - Current Time =", global_current_time)
 
79
 
80
+ st.write(('Question: ' + user_question ), unsafe_allow_html=True)
81
 
 
82
 
83
+ # input_text = input('Say something--> ')
 
 
 
 
 
 
 
 
84
 
85
+ print( 'history--> ', st.session_state.history_string)
86
+ ################################################################
87
+ # Tokenize the user prompt and conversation history
88
+ inputs = tokenizer.encode_plus( st.session_state.history_string, user_question, return_tensors="pt" )
89
 
90
+ # st.write('Len of inputs= ', len( inputs))
91
+ # Generate a response
92
+ outputs = model.generate( **inputs )
93
 
94
+ # decode the response
95
+ response = tokenizer.decode( outputs[0], skip_special_tokens=True).strip()
 
 
96
 
97
+ # append history
98
+ st.session_state.conversation_history.append(user_question)
99
+ st.session_state.conversation_history.append(response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
+ # st.session_state.history_string = "\n".join(st.session_state.conversation_history)
102
+ st.session_state.history_string = "<br>".join( st.session_state.conversation_history )
103
 
104
+ st.write( 'Response: ', response)
105
 
 
 
 
 
 
106
 
107
+ # Mission Complete!
108
  ##################################################################################
109
+ global_later = datetime.now()
110
+ st.write("Total query execute Time =", (global_later - global_now), global_later)
 
 
111
 
 
 
112
 
 
 
113
 
114
#################################################################################
def main():
    """Entry point: build the Streamlit page and wire up the honeybee Q&A loop."""
    print('Pennwick Starting up...\n')

    ##################################################################
    # Initial conversation tracking - st.session_state survives reruns,
    # so only seed the keys on the first pass of the script.
    if "conversation_history" not in st.session_state:
        st.session_state.conversation_history = []
    if "history_string" not in st.session_state:
        st.session_state.history_string = "\n".join(st.session_state.conversation_history)

    # Load the environment variables - if any
    # load_dotenv()

    st.set_page_config(page_title="Pennwick Honeybee Robot",
                       page_icon="./HoneybeeLogo.ico")

    st.write(css, unsafe_allow_html=True)

    st.image("./HoneybeeLogo.png", width=96)
    st.header("Pennwick Honeybee Robot - BETA VERSION")

    print('Prepared page...\n')

    user_question = st.text_input(
        "Ask the Open Source - " + MODEL_NAME + " - Model any question about Honeybees...")

    # Only run the model when the user actually typed something; an empty
    # or None question would otherwise be sent straight to the tokenizer.
    if user_question:
        print('calling process question', user_question)
        process_user_question(user_question)

    # Show only the tail of the dialog so the scroll box stays short.
    # NOTE(review): this keeps the last 100 *characters*, which can slice a
    # "<br>" separator in half - consider keeping the last N list entries.
    if len(st.session_state.history_string) > 100:
        html_history_string = st.session_state.history_string[-100:]
    else:
        html_history_string = st.session_state.history_string

    html(html_history_string, height=150, scrolling=True)

    # st.write( user_template, unsafe_allow_html=True)
    # st.write(user_template.replace( "{{MSG}}", "Hello robot!"), unsafe_allow_html=True)
    # st.write(bot_template.replace( "{{MSG}}", "Hello human!"), unsafe_allow_html=True)