Spaces:

ibraheem007
/

tailored

Running

App Files Files Community

ibraheem007 committed on 27 days ago

Commit

7ab96e5

verified ·

1 Parent(s): feb1a28

Update utils/file_utils.py

Browse files

Files changed (1) hide show

utils/file_utils.py +49 -23

utils/file_utils.py CHANGED Viewed

@@ -2,37 +2,63 @@ import os
 import fitz  # PyMuPDF
 from pptx import Presentation
 from docx import Document
 def extract_text_from_pdf(pdf_path):
-    """Extracts text from PDF files."""
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"File not found: {pdf_path}")
-    doc = fitz.open(pdf_path)
-    full_text = ""
-    for page in doc:
-        full_text += page.get_text()
-    doc.close()
-    return full_text.strip()
 def extract_text_from_pptx(pptx_path):
     """Extracts text from PowerPoint (PPTX) files."""
-    if not os.path.exists(pptx_path):
-        raise FileNotFoundError(f"File not found: {pptx_path}")
-    prs = Presentation(pptx_path)
-    full_text = ""
-    for slide in prs.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text"):
-                full_text += shape.text + "\n"
-    return full_text.strip()
 def extract_text_from_docx(docx_path):
     """Extracts text from Word (DOCX) files."""
-    if not os.path.exists(docx_path):
-        raise FileNotFoundError(f"File not found: {docx_path}")
-    doc = Document(docx_path)
-    full_text = "\n".join([para.text for para in doc.paragraphs])
-    return full_text.strip()

 import fitz  # PyMuPDF
 from pptx import Presentation
 from docx import Document
+import logging
+logger = logging.getLogger(__name__)
 def extract_text_from_pdf(pdf_path):
+    """Extracts text from PDF files with enhanced error handling."""
+    logger.info(f"📄 Extracting text from PDF: {pdf_path}")
+    try:
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"File not found: {pdf_path}")
+        doc = fitz.open(pdf_path)
+        full_text = ""
+        for page_num, page in enumerate(doc):
+            page_text = page.get_text()
+            full_text += page_text
+            logger.debug(f"📄 Page {page_num + 1}: {len(page_text)} characters")
+        doc.close()
+        logger.info(f"✅ PDF extraction complete: {len(full_text)} total characters")
+        return full_text.strip()
+    except Exception as e:
+        logger.error(f"❌ PDF extraction failed: {e}")
+        raise Exception(f"Failed to extract text from PDF: {str(e)}")
 def extract_text_from_pptx(pptx_path):
     """Extracts text from PowerPoint (PPTX) files."""
+    logger.info(f"📊 Extracting text from PPTX: {pptx_path}")
+    try:
+        if not os.path.exists(pptx_path):
+            raise FileNotFoundError(f"File not found: {pptx_path}")
+        prs = Presentation(pptx_path)
+        full_text = ""
+        for slide_num, slide in enumerate(prs.slides):
+            for shape in slide.shapes:
+                if hasattr(shape, "text") and shape.text.strip():
+                    full_text += shape.text + "\n"
+            logger.debug(f"📊 Slide {slide_num + 1} processed")
+        logger.info(f"✅ PPTX extraction complete: {len(full_text)} total characters")
+        return full_text.strip()
+    except Exception as e:
+        logger.error(f"❌ PPTX extraction failed: {e}")
+        raise Exception(f"Failed to extract text from PowerPoint: {str(e)}")
 def extract_text_from_docx(docx_path):
     """Extracts text from Word (DOCX) files."""
+    logger.info(f"📝 Extracting text from DOCX: {docx_path}")
+    try:
+        if not os.path.exists(docx_path):
+            raise FileNotFoundError(f"File not found: {docx_path}")
+        doc = Document(docx_path)
+        full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
+        logger.info(f"✅ DOCX extraction complete: {len(full_text)} total characters")
+        return full_text.strip()
+    except Exception as e:
+        logger.error(f"❌ DOCX extraction failed: {e}")
+        raise Exception(f"Failed to extract text from Word document: {str(e)}")