awacke1 committed on
Commit
89a7198
1 Parent(s): ecc7e6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -30
app.py CHANGED
@@ -205,40 +205,34 @@ def extract_mime_type(file):
205
 
206
  import textract
207
  import os
208
-
209
- def extract_mime_type(file):
210
- # If it's not a string, assume it's a streamlit.UploadedFile object
211
- if isinstance(file, streamlit.UploadedFile):
212
- return file.type
 
213
  else:
214
- raise TypeError("Input should be a streamlit.UploadedFile object")
215
 
216
  def pdf2txt(pdf_docs):
217
- st.write(pdf_docs)
218
- file_types = {'application/pdf': '.pdf', 'text/plain': '.txt',
219
- 'text/html': '.html', 'application/json': '.json',
220
- 'application/py': '.py', 'text/xml': '.xml', 'text/htm': '.htm'}
221
-
222
  text = ""
223
- for file in pdf_docs:
224
- mime_type = extract_mime_type(file)
225
- file_extension = file_types.get(mime_type, '')
226
- st.write(f"File type extension: {file_extension}")
227
-
228
- # You might want to save the file and use textract to extract text from it.
229
- # Assuming file is a streamlit.UploadedFile object
230
- with open(file.name, "wb") as f:
231
- f.write(file.getvalue())
232
-
233
- if file_extension in ['.txt', '.html', '.htm', '.py', '.xml', '.json']:
234
- text += textract.process(file.name).decode("utf-8")
235
- elif file_extension == '.pdf':
236
- pdf_reader = PdfReader(file.name)
237
- for page in pdf_reader.pages:
238
- text += page.extract_text()
239
-
240
- # Delete the file after processing
241
- os.remove(file.name)
242
  return text
243
 
244
  def pdf2txt_old(pdf_docs):
 
205
 
206
  import textract
207
  import os
208
+ def extract_file_extension(file_str):
209
+ # Using regex pattern matching to find the file extension
210
+ pattern = r"name='.*?\.(.*?)'"
211
+ match = re.search(pattern, file_str)
212
+ if match:
213
+ return match.group(1)
214
  else:
215
+ raise ValueError(f"Unable to extract file extension from {file_str}")
216
 
217
  def pdf2txt(pdf_docs):
 
 
 
 
 
218
  text = ""
219
+ for file_str in pdf_docs:
220
+ file_extension = extract_file_extension(file_str)
221
+ # Print the file extension
222
+ print(f"File type extension: {file_extension}")
223
+
224
+ # Simulate file reading
225
+ # You need to replace the following lines with actual file reading
226
+ # based on the file_extension
227
+ if file_extension in ['txt', 'html', 'htm', 'py', 'xml', 'json']:
228
+ # text += textract.process(file_str).decode("utf-8")
229
+ text += f"\nExtracted text from {file_extension} file..."
230
+ elif file_extension == 'pdf':
231
+ # pdf_reader = PdfReader(file_str)
232
+ # for page in pdf_reader.pages:
233
+ # text += page.extract_text()
234
+ text += f"\nExtracted text from PDF file..."
235
+
 
 
236
  return text
237
 
238
  def pdf2txt_old(pdf_docs):