ibraheem007 committed on
Commit
435aca4
·
verified ·
1 Parent(s): a0f26db

Update utils/file_utils.py

Browse files
Files changed (1) hide show
  1. utils/file_utils.py +16 -1
utils/file_utils.py CHANGED
@@ -7,7 +7,7 @@ import logging
7
  logger = logging.getLogger(__name__)
8
 
9
  def extract_text_from_pdf(pdf_path):
10
- """Extracts text from PDF files with enhanced error handling."""
11
  logger.info(f"πŸ“„ Extracting text from PDF: {pdf_path}")
12
  try:
13
  if not os.path.exists(pdf_path):
@@ -27,6 +27,21 @@ def extract_text_from_pdf(pdf_path):
27
  logger.error(f"❌ PDF extraction failed: {e}")
28
  raise Exception(f"Failed to extract text from PDF: {str(e)}")
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def extract_text_from_pptx(pptx_path):
31
  """Extracts text from PowerPoint (PPTX) files."""
32
  logger.info(f"πŸ“Š Extracting text from PPTX: {pptx_path}")
 
7
  logger = logging.getLogger(__name__)
8
 
9
  def extract_text_from_pdf(pdf_path):
10
+ """Extracts text from PDF files - enhanced for Hugging Face"""
11
  logger.info(f"πŸ“„ Extracting text from PDF: {pdf_path}")
12
  try:
13
  if not os.path.exists(pdf_path):
 
27
  logger.error(f"❌ PDF extraction failed: {e}")
28
  raise Exception(f"Failed to extract text from PDF: {str(e)}")
29
 
30
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract text from in-memory PDF bytes without writing a temp file.

    Args:
        pdf_bytes: Raw content of a PDF document; passed to ``fitz.open``
            as the ``stream`` argument with ``filetype="pdf"``.

    Returns:
        The text of every page concatenated in page order, with leading
        and trailing whitespace stripped.

    Raises:
        Exception: Any error raised while opening or reading the document
            is logged and then re-raised unchanged.
    """
    try:
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Join once instead of repeated `+=` on a growing string.
            full_text = "".join(page.get_text() for page in doc)
        finally:
            # Close even when a page fails to extract, so the document
            # handle is never leaked on the error path.
            doc.close()
        return full_text.strip()
    except Exception as e:
        logger.error(f"❌ PDF bytes extraction failed: {e}")
        raise
42
+
43
+ # The existing PPTX and DOCX extraction functions follow unchanged.
44
+
45
  def extract_text_from_pptx(pptx_path):
46
  """Extracts text from PowerPoint (PPTX) files."""
47
  logger.info(f"πŸ“Š Extracting text from PPTX: {pptx_path}")