Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,7 @@ import math
|
|
10 |
import requests
|
11 |
import time
|
12 |
import re
|
|
|
13 |
|
14 |
from datetime import datetime
|
15 |
from openai import ChatCompletion
|
@@ -202,11 +203,10 @@ def extract_mime_type(file):
|
|
202 |
else:
|
203 |
raise TypeError("Input should be a string or a streamlit.UploadedFile object")
|
204 |
|
|
|
205 |
|
206 |
-
import textract
|
207 |
-
import os
|
208 |
def extract_file_extension(file):
|
209 |
-
#
|
210 |
file_name = file.name
|
211 |
pattern = r".*?\.(.*?)$"
|
212 |
match = re.search(pattern, file_name)
|
@@ -215,28 +215,36 @@ def extract_file_extension(file):
|
|
215 |
else:
|
216 |
raise ValueError(f"Unable to extract file extension from {file_name}")
|
217 |
|
218 |
-
def pdf2txt(
|
219 |
text = ""
|
220 |
-
for file in
|
221 |
file_extension = extract_file_extension(file)
|
222 |
# print the file extension
|
223 |
-
|
224 |
-
|
225 |
-
#
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
|
237 |
return text
|
238 |
|
239 |
-
|
240 |
def pdf2txt_old(pdf_docs):
|
241 |
st.write(pdf_docs)
|
242 |
for file in pdf_docs:
|
@@ -399,7 +407,7 @@ if user_question:
|
|
399 |
|
400 |
with st.sidebar:
|
401 |
st.subheader("Your documents")
|
402 |
-
docs = st.file_uploader("
|
403 |
with st.spinner("Processing"):
|
404 |
raw = pdf2txt(docs)
|
405 |
if len(raw) > 0:
|
@@ -407,6 +415,6 @@ with st.sidebar:
|
|
407 |
text_chunks = txt2chunks(raw)
|
408 |
vectorstore = vector_store(text_chunks)
|
409 |
st.session_state.conversation = get_chain(vectorstore)
|
410 |
-
st.markdown('# AI Search Index of Length:' + length + ' Created.')
|
411 |
filename = generate_filename(raw, 'txt')
|
412 |
create_file(filename, raw, '')
|
|
|
10 |
import requests
|
11 |
import time
|
12 |
import re
|
13 |
+
import textract
|
14 |
|
15 |
from datetime import datetime
|
16 |
from openai import ChatCompletion
|
|
|
203 |
else:
|
204 |
raise TypeError("Input should be a string or a streamlit.UploadedFile object")
|
205 |
|
206 |
+
from io import BytesIO
|
207 |
|
|
|
|
|
208 |
def extract_file_extension(file):
|
209 |
+
# get the file name directly from the UploadedFile object
|
210 |
file_name = file.name
|
211 |
pattern = r".*?\.(.*?)$"
|
212 |
match = re.search(pattern, file_name)
|
|
|
215 |
else:
|
216 |
raise ValueError(f"Unable to extract file extension from {file_name}")
|
217 |
|
218 |
+
def pdf2txt(docs):
    """Extract and concatenate the text of every uploaded document.

    Each upload is written to a private temporary directory, parsed
    according to its file extension, and its text appended to the result.
    Files whose extension is not handled contribute nothing. Any error
    raised while reading a single file is reported through ``st.write``
    and processing continues with the next file.

    Args:
        docs: Iterable of uploaded-file objects exposing ``.name`` and
            ``.getvalue()`` (as used below). ``None`` or an empty
            iterable yields an empty string.

    Returns:
        The concatenated text of all files that could be processed.
    """
    # Imported locally so the function does not rely on a module-level
    # ``import os`` being present elsewhere in the file.
    import os
    import tempfile

    textract_extensions = ('txt', 'html', 'htm', 'py', 'xml', 'json', 'docx')

    text = ""
    for file in docs or []:
        file_extension = extract_file_extension(file)
        # print the file extension
        st.write(f"File type extension: {file_extension}")
        extension = file_extension.lower()

        # Stage the upload inside a throw-away directory rather than the
        # working directory: a file uploaded under the name of an existing
        # file (e.g. "app.py") would otherwise overwrite it and then be
        # deleted. The directory and its content are removed on exit from
        # the ``with`` block, including when an error occurs.
        with tempfile.TemporaryDirectory() as temp_dir:
            # basename() drops any directory components from the upload
            # name; the original name (and so its extension) is kept for
            # the path handed to textract.
            temp_file_name = os.path.join(temp_dir, os.path.basename(file.name))
            try:
                with open(temp_file_name, "wb") as f:
                    f.write(file.getvalue())

                # read the file according to its extension
                if extension in textract_extensions:
                    text += textract.process(temp_file_name).decode("utf-8")
                elif extension == 'pdf':
                    # NOTE(review): PdfFileReader / getNumPages / getPage /
                    # extractText look like the legacy PyPDF2 API - confirm
                    # the installed version still provides them.
                    with open(temp_file_name, "rb") as f:
                        pdf = PdfFileReader(f)
                        for page in range(pdf.getNumPages()):
                            text += pdf.getPage(page).extractText()
            except Exception as e:
                # Best effort: report the failure and keep going.
                st.write(f"Error processing file {file.name}: {e}")

    return text
|
246 |
|
247 |
+
|
248 |
def pdf2txt_old(pdf_docs):
|
249 |
st.write(pdf_docs)
|
250 |
for file in pdf_docs:
|
|
|
407 |
|
408 |
with st.sidebar:
|
409 |
st.subheader("Your documents")
|
410 |
+
docs = st.file_uploader("import documents", accept_multiple_files=True)
|
411 |
with st.spinner("Processing"):
|
412 |
raw = pdf2txt(docs)
|
413 |
if len(raw) > 0:
|
|
|
415 |
text_chunks = txt2chunks(raw)
|
416 |
vectorstore = vector_store(text_chunks)
|
417 |
st.session_state.conversation = get_chain(vectorstore)
|
418 |
+
st.markdown('# AI Search Index of Length:' + length + ' Created.') # add timing
|
419 |
filename = generate_filename(raw, 'txt')
|
420 |
create_file(filename, raw, '')
|