ibraheem007 committed on
Commit
7ab96e5
·
verified ·
1 Parent(s): feb1a28

Update utils/file_utils.py

Browse files
Files changed (1) hide show
  1. utils/file_utils.py +49 -23
utils/file_utils.py CHANGED
@@ -2,37 +2,63 @@ import os
2
  import fitz # PyMuPDF
3
  from pptx import Presentation
4
  from docx import Document
 
 
 
5
 
6
  def extract_text_from_pdf(pdf_path):
7
- """Extracts text from PDF files."""
8
- if not os.path.exists(pdf_path):
9
- raise FileNotFoundError(f"File not found: {pdf_path}")
 
 
10
 
11
- doc = fitz.open(pdf_path)
12
- full_text = ""
13
- for page in doc:
14
- full_text += page.get_text()
15
- doc.close()
16
- return full_text.strip()
 
 
 
 
 
 
 
17
 
18
  def extract_text_from_pptx(pptx_path):
19
  """Extracts text from PowerPoint (PPTX) files."""
20
- if not os.path.exists(pptx_path):
21
- raise FileNotFoundError(f"File not found: {pptx_path}")
 
 
22
 
23
- prs = Presentation(pptx_path)
24
- full_text = ""
25
- for slide in prs.slides:
26
- for shape in slide.shapes:
27
- if hasattr(shape, "text"):
28
- full_text += shape.text + "\n"
29
- return full_text.strip()
 
 
 
 
 
 
30
 
31
  def extract_text_from_docx(docx_path):
32
  """Extracts text from Word (DOCX) files."""
33
- if not os.path.exists(docx_path):
34
- raise FileNotFoundError(f"File not found: {docx_path}")
 
 
35
 
36
- doc = Document(docx_path)
37
- full_text = "\n".join([para.text for para in doc.paragraphs])
38
- return full_text.strip()
 
 
 
 
 
2
  import fitz # PyMuPDF
3
  from pptx import Presentation
4
  from docx import Document
5
+ import logging
6
+
7
+ logger = logging.getLogger(__name__)
8
 
9
  def extract_text_from_pdf(pdf_path):
10
+ """Extracts text from PDF files with enhanced error handling."""
11
+ logger.info(f"πŸ“„ Extracting text from PDF: {pdf_path}")
12
+ try:
13
+ if not os.path.exists(pdf_path):
14
+ raise FileNotFoundError(f"File not found: {pdf_path}")
15
 
16
+ doc = fitz.open(pdf_path)
17
+ full_text = ""
18
+ for page_num, page in enumerate(doc):
19
+ page_text = page.get_text()
20
+ full_text += page_text
21
+ logger.debug(f"πŸ“„ Page {page_num + 1}: {len(page_text)} characters")
22
+
23
+ doc.close()
24
+ logger.info(f"βœ… PDF extraction complete: {len(full_text)} total characters")
25
+ return full_text.strip()
26
+ except Exception as e:
27
+ logger.error(f"❌ PDF extraction failed: {e}")
28
+ raise Exception(f"Failed to extract text from PDF: {str(e)}")
29
 
30
  def extract_text_from_pptx(pptx_path):
31
  """Extracts text from PowerPoint (PPTX) files."""
32
+ logger.info(f"πŸ“Š Extracting text from PPTX: {pptx_path}")
33
+ try:
34
+ if not os.path.exists(pptx_path):
35
+ raise FileNotFoundError(f"File not found: {pptx_path}")
36
 
37
+ prs = Presentation(pptx_path)
38
+ full_text = ""
39
+ for slide_num, slide in enumerate(prs.slides):
40
+ for shape in slide.shapes:
41
+ if hasattr(shape, "text") and shape.text.strip():
42
+ full_text += shape.text + "\n"
43
+ logger.debug(f"πŸ“Š Slide {slide_num + 1} processed")
44
+
45
+ logger.info(f"βœ… PPTX extraction complete: {len(full_text)} total characters")
46
+ return full_text.strip()
47
+ except Exception as e:
48
+ logger.error(f"❌ PPTX extraction failed: {e}")
49
+ raise Exception(f"Failed to extract text from PowerPoint: {str(e)}")
50
 
51
  def extract_text_from_docx(docx_path):
52
  """Extracts text from Word (DOCX) files."""
53
+ logger.info(f"πŸ“ Extracting text from DOCX: {docx_path}")
54
+ try:
55
+ if not os.path.exists(docx_path):
56
+ raise FileNotFoundError(f"File not found: {docx_path}")
57
 
58
+ doc = Document(docx_path)
59
+ full_text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
60
+ logger.info(f"βœ… DOCX extraction complete: {len(full_text)} total characters")
61
+ return full_text.strip()
62
+ except Exception as e:
63
+ logger.error(f"❌ DOCX extraction failed: {e}")
64
+ raise Exception(f"Failed to extract text from Word document: {str(e)}")