Spaces:
Running
Running
Update utils/file_utils.py
Browse files- utils/file_utils.py +16 -1
utils/file_utils.py
CHANGED
|
@@ -7,7 +7,7 @@ import logging
|
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
def extract_text_from_pdf(pdf_path):
|
| 10 |
-
"""Extracts text from PDF files
|
| 11 |
logger.info(f"π Extracting text from PDF: {pdf_path}")
|
| 12 |
try:
|
| 13 |
if not os.path.exists(pdf_path):
|
|
@@ -27,6 +27,21 @@ def extract_text_from_pdf(pdf_path):
|
|
| 27 |
logger.error(f"β PDF extraction failed: {e}")
|
| 28 |
raise Exception(f"Failed to extract text from PDF: {str(e)}")
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def extract_text_from_pptx(pptx_path):
|
| 31 |
"""Extracts text from PowerPoint (PPTX) files."""
|
| 32 |
logger.info(f"π Extracting text from PPTX: {pptx_path}")
|
|
|
|
| 7 |
logger = logging.getLogger(__name__)
|
| 8 |
|
| 9 |
def extract_text_from_pdf(pdf_path):
|
| 10 |
+
"""Extracts text from PDF files - enhanced for Hugging Face"""
|
| 11 |
logger.info(f"π Extracting text from PDF: {pdf_path}")
|
| 12 |
try:
|
| 13 |
if not os.path.exists(pdf_path):
|
|
|
|
| 27 |
logger.error(f"β PDF extraction failed: {e}")
|
| 28 |
raise Exception(f"Failed to extract text from PDF: {str(e)}")
|
| 29 |
|
| 30 |
+
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract text from in-memory PDF bytes without writing a temp file.

    Args:
        pdf_bytes: Raw PDF content; handed to ``fitz.open`` as the
            ``stream`` argument with ``filetype="pdf"``.

    Returns:
        The text of every page concatenated in page order, with leading
        and trailing whitespace stripped.

    Raises:
        Exception: Any error raised while opening or reading the document
            is logged and then re-raised unchanged.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Gather per-page text and join once instead of growing a
            # string with repeated ``+=``.
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document even when a page fails to extract;
            # previously close() was skipped on that error path.
            doc.close()
        return "".join(page_texts).strip()
    except Exception as e:
        logger.error(f"β PDF bytes extraction failed: {e}")
        raise
|
| 42 |
+
|
| 43 |
+
# Keep your existing PPTX and DOCX functions as they are...
|
| 44 |
+
|
| 45 |
def extract_text_from_pptx(pptx_path):
|
| 46 |
"""Extracts text from PowerPoint (PPTX) files."""
|
| 47 |
logger.info(f"π Extracting text from PPTX: {pptx_path}")
|