Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -201,8 +201,47 @@ def extract_mime_type(file):
|
|
201 |
return file.type
|
202 |
else:
|
203 |
raise TypeError("Input should be a string or a streamlit.UploadedFile object")
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
def pdf2txt(pdf_docs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
206 |
st.write(pdf_docs)
|
207 |
for file in pdf_docs:
|
208 |
mime_type = extract_mime_type(file)
|
|
|
201 |
return file.type
|
202 |
else:
|
203 |
raise TypeError("Input should be a string or a streamlit.UploadedFile object")
|
204 |
+
|
205 |
+
|
206 |
+
import textract
|
207 |
+
import os
|
208 |
+
|
209 |
+
def extract_mime_type(file):
    """Return the MIME type reported by an uploaded-file object.

    Args:
        file: An object exposing a ``type`` attribute holding its MIME type
            (presumably the objects produced by Streamlit's file uploader --
            confirm against callers).

    Returns:
        The value of ``file.type``.

    Raises:
        TypeError: If ``file`` is a plain string or has no ``type`` attribute.
    """
    # Duck-type on ``type`` instead of ``isinstance(file, streamlit.UploadedFile)``.
    # NOTE(review): this module appears to reach Streamlit through the ``st``
    # alias, and ``UploadedFile`` is not a documented top-level attribute of
    # the package, so the isinstance check itself could fail before the
    # intended TypeError was ever raised -- confirm.
    if isinstance(file, str) or not hasattr(file, "type"):
        raise TypeError("Input should be a streamlit.UploadedFile object")
    return file.type
|
215 |
+
|
216 |
def pdf2txt(pdf_docs):
    """Extract and concatenate the text of every uploaded document.

    Each upload's MIME type is mapped to a file extension. Plain-text style
    formats are handed to ``textract``; PDFs are read page by page with
    ``PdfReader``. Uploads with an unrecognised MIME type contribute no text.

    Args:
        pdf_docs: Iterable of uploaded-file objects exposing ``name``,
            ``type`` (via ``extract_mime_type``) and ``getvalue()``.

    Returns:
        A single string with the extracted text of all supported uploads,
        in input order.
    """
    import tempfile  # local import: only this function needs it

    st.write(pdf_docs)
    file_types = {
        'application/pdf': '.pdf',
        'text/plain': '.txt',
        'text/html': '.html',
        'application/json': '.json',
        'application/py': '.py',
        'text/xml': '.xml',
        'text/htm': '.htm',
    }
    textract_extensions = {'.txt', '.html', '.htm', '.py', '.xml', '.json'}

    pieces = []
    # Stage uploads in a private scratch directory rather than the working
    # directory: the upload's name comes from the client, so writing to it
    # directly could overwrite application files, and the directory is
    # removed even when extraction raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        for file in pdf_docs:
            mime_type = extract_mime_type(file)
            file_extension = file_types.get(mime_type, '')
            st.write(f"File type extension: {file_extension}")

            is_pdf = file_extension == '.pdf'
            if not is_pdf and file_extension not in textract_extensions:
                # Unsupported type: nothing to extract, so skip the disk copy.
                continue

            # Keep only the final path component; the original basename is
            # preserved so the on-disk file keeps its extension.
            safe_name = os.path.basename(file.name) or "upload"
            staged_path = os.path.join(scratch_dir, safe_name)
            with open(staged_path, "wb") as f:
                f.write(file.getvalue())

            if is_pdf:
                pdf_reader = PdfReader(staged_path)
                for page in pdf_reader.pages:
                    # extract_text() may yield None for pages without text.
                    pieces.append(page.extract_text() or "")
            else:
                pieces.append(textract.process(staged_path).decode("utf-8"))

    return "".join(pieces)
|
243 |
+
|
244 |
+
def pdf2txt_old(pdf_docs):
|
245 |
st.write(pdf_docs)
|
246 |
for file in pdf_docs:
|
247 |
mime_type = extract_mime_type(file)
|