# backend/utils.py
# Last change: "Update backend/utils.py" by Soltane777 (commit 70cb71f, verified).
import fitz # pymupdf
from docx import Document
import pptx
import os
from typing import Optional
def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract the plain text of a PDF file using PyMuPDF.

    PyMuPDF is used here because, per the original author's note, it is
    faster than tika.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped, or ``None`` when the document yields
        no text or cannot be read (the error is printed).
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse, so the underlying file handle is never leaked.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None
    # Whitespace-only output is treated like an empty document: no text.
    return text.strip() or None
def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extract the text of a Word (DOCX) file.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The non-blank paragraphs joined with newlines, or ``None`` when the
        document has no non-blank paragraph or cannot be read (the error is
        printed).
    """
    try:
        doc = Document(file_path)
        # Paragraphs that are empty or whitespace-only are skipped.
        text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
    except Exception as e:
        print(f"Error reading DOCX: {e}")
        return None
    # Report "no text" as None, like the other extractors in this module,
    # instead of handing callers an empty string.
    return text or None
def extract_text_from_pptx(file_path: str) -> Optional[str]:
    """Extract the text of a PowerPoint (PPTX) file.

    Args:
        file_path: Path of the PPTX file to read.

    Returns:
        The text of every shape exposing a ``text`` attribute, in slide and
        shape order, joined with newlines; ``None`` when no such shape
        exists or the file cannot be read (the error is printed).
    """
    try:
        deck = pptx.Presentation(file_path)
        # Only shapes that expose a ``text`` attribute contribute.
        chunks = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
        if not chunks:
            return None
        return "\n".join(chunks)
    except Exception as e:
        print(f"Error reading PPTX: {e}")
        return None
def extract_text_from_document(file_path: str) -> Optional[str]:
    """Extract text from a document, choosing the reader by file extension.

    Supported extensions (matched case-insensitively): ``.pdf``, ``.docx``,
    ``.pptx`` and ``.txt``. Plain-text files are decoded as UTF-8.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or ``None`` when the path does not exist, the
        extension is not supported, or reading fails (a message is printed
        in each of these cases).
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    # Lower-case once so every suffix test is case-insensitive.
    lowered = file_path.lower()

    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.pptx'):
        return extract_text_from_pptx(file_path)

    if not lowered.endswith('.txt'):
        print(f"Unsupported file format: {file_path}")
        return None

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as e:
        print(f"Error reading TXT: {e}")
        return None