Spaces:
Sleeping
Sleeping
mikepastor11
committed on
Commit
•
46158ec
1
Parent(s):
5b7b180
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,7 @@
|
|
4 |
# HuggingFace Spaces application to analyze uploaded PDF files
|
5 |
# with open-source models ( hkunlp/instructor-xl )
|
6 |
#
|
7 |
-
# Mike Pastor February
|
8 |
|
9 |
|
10 |
import streamlit as st
|
@@ -25,16 +25,14 @@ from langchain_community.embeddings import HuggingFaceInstructEmbeddings
|
|
25 |
|
26 |
# from langchain.vectorstores import FAISS
|
27 |
from langchain_community.vectorstores import FAISS
|
28 |
-
|
29 |
from langchain.text_splitter import CharacterTextSplitter
|
30 |
-
|
31 |
from langchain.memory import ConversationBufferMemory
|
32 |
from langchain.chains import ConversationalRetrievalChain
|
33 |
|
34 |
-
|
35 |
# from langchain.llms import HuggingFaceHub
|
36 |
from langchain_community.llms import HuggingFaceHub
|
37 |
|
|
|
38 |
def extract_pdf_text(pdf_docs):
|
39 |
text = ""
|
40 |
for pdf in pdf_docs:
|
@@ -43,6 +41,7 @@ def extract_pdf_text(pdf_docs):
|
|
43 |
text += page.extract_text()
|
44 |
return text
|
45 |
|
|
|
46 |
# Chunk size and overlap must not exceed the model's capacity!
|
47 |
#
|
48 |
def extract_bitesize_pieces(text):
|
@@ -55,7 +54,7 @@ def extract_bitesize_pieces(text):
|
|
55 |
chunks = text_splitter.split_text(text)
|
56 |
return chunks
|
57 |
|
58 |
-
|
59 |
def prepare_embedding_vectors(text_chunks):
|
60 |
|
61 |
st.write('Here in vector store....', unsafe_allow_html=True)
|
@@ -82,7 +81,8 @@ def prepare_embedding_vectors(text_chunks):
|
|
82 |
st.write('FAISS succeeds: ')
|
83 |
|
84 |
return vectorstore
|
85 |
-
|
|
|
86 |
def prepare_conversation(vectorstore):
|
87 |
# llm = ChatOpenAI()
|
88 |
# llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
|
@@ -98,6 +98,7 @@ def prepare_conversation(vectorstore):
|
|
98 |
)
|
99 |
return conversation_chain
|
100 |
|
|
|
101 |
def process_user_question(user_question):
|
102 |
|
103 |
print('process_user_question called: \n')
|
@@ -169,19 +170,22 @@ def main():
|
|
169 |
# st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon=im )
|
170 |
# st.set_page_config(page_title="Pennwick PDF Analyzer")
|
171 |
|
172 |
-
import base64
|
173 |
-
from PIL import Image
|
174 |
|
175 |
-
# Open your image
|
176 |
-
image = Image.open("robot_icon.ico")
|
177 |
|
178 |
-
# Convert image to base64 string
|
179 |
-
with open("robot_icon.ico", "rb") as f:
|
180 |
-
encoded_string = base64.b64encode(f.read()).decode()
|
181 |
|
182 |
-
# Set page config with base64 string
|
183 |
-
st.set_page_config(page_title="Pennwick File Analyzer 2", page_icon=f"data:image/ico;base64,{encoded_string}")
|
|
|
184 |
|
|
|
|
|
185 |
print( 'prepared page...\n')
|
186 |
|
187 |
|
@@ -194,8 +198,11 @@ def main():
|
|
194 |
if "chat_history" not in st.session_state:
|
195 |
st.session_state.chat_history = None
|
196 |
|
197 |
-
# st.header("Pennwick File Analyzer :shark:")
|
198 |
-
st.header("Pennwick File Analyzer 2")
|
|
|
|
|
|
|
199 |
|
200 |
user_question = None
|
201 |
user_question = st.text_input("Ask the Open Source - Flan-t5 Model a question about your uploaded documents:")
|
|
|
4 |
# HuggingFace Spaces application to analyze uploaded PDF files
|
5 |
# with open-source models ( hkunlp/instructor-xl )
|
6 |
#
|
7 |
+
# Mike Pastor February 17, 2024
|
8 |
|
9 |
|
10 |
import streamlit as st
|
|
|
25 |
|
26 |
# from langchain.vectorstores import FAISS
|
27 |
from langchain_community.vectorstores import FAISS
|
|
|
28 |
from langchain.text_splitter import CharacterTextSplitter
|
|
|
29 |
from langchain.memory import ConversationBufferMemory
|
30 |
from langchain.chains import ConversationalRetrievalChain
|
31 |
|
|
|
32 |
# from langchain.llms import HuggingFaceHub
|
33 |
from langchain_community.llms import HuggingFaceHub
|
34 |
|
35 |
+
##################################################################################
|
36 |
def extract_pdf_text(pdf_docs):
|
37 |
text = ""
|
38 |
for pdf in pdf_docs:
|
|
|
41 |
text += page.extract_text()
|
42 |
return text
|
43 |
|
44 |
+
##################################################################################
|
45 |
# Chunk size and overlap must not exceed the model's capacity!
|
46 |
#
|
47 |
def extract_bitesize_pieces(text):
|
|
|
54 |
chunks = text_splitter.split_text(text)
|
55 |
return chunks
|
56 |
|
57 |
+
##################################################################################
|
58 |
def prepare_embedding_vectors(text_chunks):
|
59 |
|
60 |
st.write('Here in vector store....', unsafe_allow_html=True)
|
|
|
81 |
st.write('FAISS succeeds: ')
|
82 |
|
83 |
return vectorstore
|
84 |
+
|
85 |
+
##################################################################################
|
86 |
def prepare_conversation(vectorstore):
|
87 |
# llm = ChatOpenAI()
|
88 |
# llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
|
|
|
98 |
)
|
99 |
return conversation_chain
|
100 |
|
101 |
+
##################################################################################
|
102 |
def process_user_question(user_question):
|
103 |
|
104 |
print('process_user_question called: \n')
|
|
|
170 |
# st.set_page_config(page_title="Pennwick PDF Analyzer", page_icon=im )
|
171 |
# st.set_page_config(page_title="Pennwick PDF Analyzer")
|
172 |
|
173 |
+
# import base64
|
174 |
+
# from PIL import Image
|
175 |
|
176 |
+
# # Open your image
|
177 |
+
# image = Image.open("robot_icon.ico")
|
178 |
|
179 |
+
# # Convert image to base64 string
|
180 |
+
# with open("robot_icon.ico", "rb") as f:
|
181 |
+
# encoded_string = base64.b64encode(f.read()).decode()
|
182 |
|
183 |
+
# # Set page config with base64 string
|
184 |
+
# st.set_page_config(page_title="Pennwick File Analyzer 2", page_icon=f"data:image/ico;base64,{encoded_string}")
|
185 |
+
|
186 |
|
187 |
+
st.set_page_config(page_title="Pennwick File Analyzer", page_icon="./robot_icon.ico")
|
188 |
+
|
189 |
print( 'prepared page...\n')
|
190 |
|
191 |
|
|
|
198 |
if "chat_history" not in st.session_state:
|
199 |
st.session_state.chat_history = None
|
200 |
|
201 |
+
# st.header("Pennwick File Analyzer :shark:")
|
202 |
+
# st.header("Pennwick File Analyzer 2")
|
203 |
+
|
204 |
+
st.image("robot_icon.png", width=96 )
|
205 |
+
st.header(f"Pennwick File Analyzer")
|
206 |
|
207 |
user_question = None
|
208 |
user_question = st.text_input("Ask the Open Source - Flan-t5 Model a question about your uploaded documents:")
|