Spaces:

ibraheem007
/

tailored

Running

ibraheem007 committed 19 days ago

Commit

435aca4

verified ·

1 Parent(s): a0f26db

Update utils/file_utils.py

Files changed (1) hide show

utils/file_utils.py CHANGED Viewed

@@ -7,7 +7,7 @@ import logging
 logger = logging.getLogger(__name__)
 def extract_text_from_pdf(pdf_path):
-    """Extracts text from PDF files with enhanced error handling."""
     logger.info(f"📄 Extracting text from PDF: {pdf_path}")
     try:
         if not os.path.exists(pdf_path):
@@ -27,6 +27,21 @@ def extract_text_from_pdf(pdf_path):
         logger.error(f"❌ PDF extraction failed: {e}")
         raise Exception(f"Failed to extract text from PDF: {str(e)}")
 def extract_text_from_pptx(pptx_path):
     """Extracts text from PowerPoint (PPTX) files."""
     logger.info(f"📊 Extracting text from PPTX: {pptx_path}")

 logger = logging.getLogger(__name__)
 def extract_text_from_pdf(pdf_path):
+    """Extracts text from PDF files - enhanced for Hugging Face"""
     logger.info(f"📄 Extracting text from PDF: {pdf_path}")
     try:
         if not os.path.exists(pdf_path):
         logger.error(f"❌ PDF extraction failed: {e}")
         raise Exception(f"Failed to extract text from PDF: {str(e)}")
+def extract_text_from_pdf_bytes(pdf_bytes):
+    """Extract text from PDF bytes without temp files"""
+    try:
+        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
+        full_text = ""
+        for page in doc:
+            full_text += page.get_text()
+        doc.close()
+        return full_text.strip()
+    except Exception as e:
+        logger.error(f"❌ PDF bytes extraction failed: {e}")
+        raise
+# Keep your existing PPTX and DOCX functions as they are...
 def extract_text_from_pptx(pptx_path):
     """Extracts text from PowerPoint (PPTX) files."""
     logger.info(f"📊 Extracting text from PPTX: {pptx_path}")