Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -205,40 +205,34 @@ def extract_mime_type(file):
|
|
205 |
|
206 |
import textract
|
207 |
import os
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
|
|
213 |
else:
|
214 |
-
raise
|
215 |
|
216 |
def pdf2txt(pdf_docs):
|
217 |
-
st.write(pdf_docs)
|
218 |
-
file_types = {'application/pdf': '.pdf', 'text/plain': '.txt',
|
219 |
-
'text/html': '.html', 'application/json': '.json',
|
220 |
-
'application/py': '.py', 'text/xml': '.xml', 'text/htm': '.htm'}
|
221 |
-
|
222 |
text = ""
|
223 |
-
for
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
#
|
229 |
-
#
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
# Delete the file after processing
|
241 |
-
os.remove(file.name)
|
242 |
return text
|
243 |
|
244 |
def pdf2txt_old(pdf_docs):
|
|
|
205 |
|
206 |
import textract
|
207 |
import os
|
208 |
+
def extract_file_extension(file_str):
    """Return the extension of the file named in a ``name='...'`` field.

    The input is searched for the first ``name='<filename>'`` (or
    ``name="<filename>"``) field, and the text after the *last* dot of
    that filename is returned, without the dot and with its case preserved.

    Args:
        file_str: Text containing a ``name='...'`` field. Non-string
            objects are converted with ``str()`` first.
            NOTE(review): presumably this is the string form of an upload
            object -- confirm against the caller.

    Returns:
        The extension, e.g. ``"pdf"`` for ``name='my.report.pdf'``. A
        filename ending in a dot yields an empty string.

    Raises:
        ValueError: If there is no ``name=`` field, or the filename in it
            contains no dot.
    """
    # Imported locally because the file's top-level import block is not
    # visible from this section.
    import re

    # re.search() rejects non-string input with TypeError; coerce instead so
    # objects whose string form carries the name field can be passed directly.
    haystack = file_str if isinstance(file_str, str) else str(file_str)

    # Capture only what lies between the matching quotes, so the search can
    # never spill over into a later field such as type='text/plain'.
    found = re.search(r"name=(['\"])(.*?)\1", haystack)
    if found:
        # Split on the LAST dot so multi-dot names give the real extension.
        _stem, dot, extension = found.group(2).rpartition(".")
        if dot:
            return extension
    raise ValueError(f"Unable to extract file extension from {file_str}")
|
216 |
|
217 |
def pdf2txt(pdf_docs):
    """Build the combined text for a collection of uploaded documents.

    This is currently a simulation: no file content is read. For every
    entry whose extension is recognised, a fixed placeholder line is
    appended to the result.

    Args:
        pdf_docs: Iterable of entries accepted by
            ``extract_file_extension``. ``None`` is treated as "no
            documents".

    Returns:
        The concatenated placeholder lines, or ``""`` when there is
        nothing to process. Entries with an unrecognised extension
        contribute nothing.

    Raises:
        ValueError: Propagated from ``extract_file_extension`` when an
            entry's extension cannot be determined.
    """
    # Iterating None would raise TypeError; treat it as an empty upload.
    if pdf_docs is None:
        return ""

    plain_text_extensions = {'txt', 'html', 'htm', 'py', 'xml', 'json'}
    pieces = []
    for file_str in pdf_docs:
        # Normalise case so that "REPORT.PDF" is handled like "report.pdf"
        # instead of being silently skipped.
        file_extension = extract_file_extension(file_str).lower()
        print(f"File type extension: {file_extension}")

        # TODO: replace the placeholders below with real reading based on
        # file_extension (the earlier draft pointed at textract.process()
        # for text formats and PdfReader page extraction for PDFs).
        if file_extension in plain_text_extensions:
            pieces.append(f"\nExtracted text from {file_extension} file...")
        elif file_extension == 'pdf':
            pieces.append("\nExtracted text from PDF file...")

    return "".join(pieces)
|
237 |
|
238 |
def pdf2txt_old(pdf_docs):
|