awacke1 committed on
Commit
d11a287
1 Parent(s): 03d5e6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -22
app.py CHANGED
@@ -203,8 +203,7 @@ def extract_mime_type(file):
203
  else:
204
  raise TypeError("Input should be a string or a streamlit.UploadedFile object")
205
 
206
- from PyPDF2 import PdfFileReader
207
- import os
208
  import re
209
 
210
  def extract_file_extension(file):
@@ -224,34 +223,19 @@ def pdf2txt(docs):
224
  # print the file extension
225
  st.write(f"File type extension: {file_extension}")
226
 
227
- # save the uploaded file temporarily
228
- temp_file_name = file.name
229
- with open(temp_file_name, "wb") as f:
230
- f.write(file.getvalue())
231
-
232
  # read the file according to its extension
233
  try:
234
- if file_extension.lower() == 'py':
235
- with open(temp_file_name, 'r') as f:
236
- text += f.read()
237
- elif file_extension.lower() in ['txt', 'html', 'htm', 'xml', 'json']:
238
- with open(temp_file_name, 'r') as f:
239
- text += f.read()
240
  elif file_extension.lower() == 'pdf':
241
- with open(temp_file_name, "rb") as f:
242
- pdf = PdfFileReader(f)
243
- for page in range(pdf.getNumPages()):
244
- text += pdf.getPage(page).extractText()
245
  except Exception as e:
246
  st.write(f"Error processing file {file.name}: {e}")
247
 
248
- # remove the temporary file
249
- os.remove(temp_file_name)
250
-
251
  return text
252
 
253
-
254
-
255
  def pdf2txt_old(pdf_docs):
256
  st.write(pdf_docs)
257
  for file in pdf_docs:
 
203
  else:
204
  raise TypeError("Input should be a string or a streamlit.UploadedFile object")
205
 
206
+ from io import BytesIO
 
207
  import re
208
 
209
  def extract_file_extension(file):
 
223
  # print the file extension
224
  st.write(f"File type extension: {file_extension}")
225
 
 
 
 
 
 
226
  # read the file according to its extension
227
  try:
228
+ if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
229
+ text += file.getvalue().decode('utf-8')
 
 
 
 
230
  elif file_extension.lower() == 'pdf':
231
+ pdf = PdfFileReader(BytesIO(file.getvalue()))
232
+ for page in range(pdf.getNumPages()):
233
+ text += pdf.getPage(page).extractText()
 
234
  except Exception as e:
235
  st.write(f"Error processing file {file.name}: {e}")
236
 
 
 
 
237
  return text
238
 
 
 
239
  def pdf2txt_old(pdf_docs):
240
  st.write(pdf_docs)
241
  for file in pdf_docs: