awacke1 committed on
Commit
e6741ed
1 Parent(s): 531e73c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -1
app.py CHANGED
@@ -201,8 +201,47 @@ def extract_mime_type(file):
201
  return file.type
202
  else:
203
  raise TypeError("Input should be a string or a streamlit.UploadedFile object")
204
-
 
 
 
 
 
 
 
 
 
 
 
205
  def pdf2txt(pdf_docs):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  st.write(pdf_docs)
207
  for file in pdf_docs:
208
  mime_type = extract_mime_type(file)
 
201
  return file.type
202
  else:
203
  raise TypeError("Input should be a string or a streamlit.UploadedFile object")
204
+
205
+
206
+ import textract
207
+ import os
208
+
209
def extract_mime_type(file):
    """Return the MIME type reported by an uploaded file object.

    The upload is recognised by its ``type`` attribute rather than by an
    ``isinstance`` check against ``streamlit.UploadedFile``: the rest of this
    code refers to Streamlit through the ``st`` alias, and ``UploadedFile`` is
    not a top-level attribute of the ``streamlit`` package, so that check
    would raise instead of answering.

    Args:
        file: An object exposing a string ``type`` attribute (presumably a
            Streamlit ``UploadedFile`` -- confirm against the caller).

    Returns:
        The value of ``file.type``.

    Raises:
        TypeError: If ``file`` has no string ``type`` attribute (for example
            when a plain path string is passed).
    """
    mime_type = getattr(file, "type", None)
    if not isinstance(mime_type, str):
        raise TypeError("Input should be a streamlit.UploadedFile object")
    return mime_type
215
+
216
  def pdf2txt(pdf_docs):
217
+ st.write(pdf_docs)
218
+ file_types = {'application/pdf': '.pdf', 'text/plain': '.txt',
219
+ 'text/html': '.html', 'application/json': '.json',
220
+ 'application/py': '.py', 'text/xml': '.xml', 'text/htm': '.htm'}
221
+
222
+ text = ""
223
+ for file in pdf_docs:
224
+ mime_type = extract_mime_type(file)
225
+ file_extension = file_types.get(mime_type, '')
226
+ st.write(f"File type extension: {file_extension}")
227
+
228
+ # You might want to save the file and use textract to extract text from it.
229
+ # Assuming file is a streamlit.UploadedFile object
230
+ with open(file.name, "wb") as f:
231
+ f.write(file.getvalue())
232
+
233
+ if file_extension in ['.txt', '.html', '.htm', '.py', '.xml', '.json']:
234
+ text += textract.process(file.name).decode("utf-8")
235
+ elif file_extension == '.pdf':
236
+ pdf_reader = PdfReader(file.name)
237
+ for page in pdf_reader.pages:
238
+ text += page.extract_text()
239
+
240
+ # Delete the file after processing
241
+ os.remove(file.name)
242
+ return text
243
+
244
+ def pdf2txt_old(pdf_docs):
245
  st.write(pdf_docs)
246
  for file in pdf_docs:
247
  mime_type = extract_mime_type(file)