Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -203,8 +203,7 @@ def extract_mime_type(file):
|
|
203 |
else:
|
204 |
raise TypeError("Input should be a string or a streamlit.UploadedFile object")
|
205 |
|
206 |
-
from
|
207 |
-
import os
|
208 |
import re
|
209 |
|
210 |
def extract_file_extension(file):
|
@@ -224,34 +223,19 @@ def pdf2txt(docs):
|
|
224 |
# print the file extension
|
225 |
st.write(f"File type extension: {file_extension}")
|
226 |
|
227 |
-
# save the uploaded file temporarily
|
228 |
-
temp_file_name = file.name
|
229 |
-
with open(temp_file_name, "wb") as f:
|
230 |
-
f.write(file.getvalue())
|
231 |
-
|
232 |
# read the file according to its extension
|
233 |
try:
|
234 |
-
if file_extension.lower()
|
235 |
-
|
236 |
-
text += f.read()
|
237 |
-
elif file_extension.lower() in ['txt', 'html', 'htm', 'xml', 'json']:
|
238 |
-
with open(temp_file_name, 'r') as f:
|
239 |
-
text += f.read()
|
240 |
elif file_extension.lower() == 'pdf':
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
text += pdf.getPage(page).extractText()
|
245 |
except Exception as e:
|
246 |
st.write(f"Error processing file {file.name}: {e}")
|
247 |
|
248 |
-
# remove the temporary file
|
249 |
-
os.remove(temp_file_name)
|
250 |
-
|
251 |
return text
|
252 |
|
253 |
-
|
254 |
-
|
255 |
def pdf2txt_old(pdf_docs):
|
256 |
st.write(pdf_docs)
|
257 |
for file in pdf_docs:
|
|
|
203 |
else:
|
204 |
raise TypeError("Input should be a string or a streamlit.UploadedFile object")
|
205 |
|
206 |
+
from io import BytesIO
|
|
|
207 |
import re
|
208 |
|
209 |
def extract_file_extension(file):
|
|
|
223 |
# print the file extension
|
224 |
st.write(f"File type extension: {file_extension}")
|
225 |
|
|
|
|
|
|
|
|
|
|
|
226 |
# read the file according to its extension
|
227 |
try:
|
228 |
+
if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
|
229 |
+
text += file.getvalue().decode('utf-8')
|
|
|
|
|
|
|
|
|
230 |
elif file_extension.lower() == 'pdf':
|
231 |
+
pdf = PdfFileReader(BytesIO(file.getvalue()))
|
232 |
+
for page in range(pdf.getNumPages()):
|
233 |
+
text += pdf.getPage(page).extractText()
|
|
|
234 |
except Exception as e:
|
235 |
st.write(f"Error processing file {file.name}: {e}")
|
236 |
|
|
|
|
|
|
|
237 |
return text
|
238 |
|
|
|
|
|
239 |
def pdf2txt_old(pdf_docs):
|
240 |
st.write(pdf_docs)
|
241 |
for file in pdf_docs:
|