import os
from typing import Optional

# The format-specific parsers are optional: a missing package must not prevent
# the module from importing (plain-text extraction needs none of them). Each
# extractor reports the missing dependency itself and returns None.
try:
    import fitz  # pymupdf
except ImportError:
    fitz = None

try:
    from docx import Document
except ImportError:
    Document = None

try:
    import pptx
except ImportError:
    pptx = None


def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract the text of a PDF file using pymupdf.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages with surrounding whitespace
        stripped, or None when the document yields no text, pymupdf is not
        installed, or the file cannot be read (the error is printed).
    """
    if fitz is None:
        print("Error reading PDF: pymupdf is not installed")
        return None
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        return None
    # Whitespace-only output counts as "no text", same as an empty document.
    return text.strip() or None


def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extract the text of a Word (DOCX) file.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The non-blank paragraphs joined with newlines, or None when there are
        none, python-docx is not installed, or the file cannot be read (the
        error is printed).
    """
    if Document is None:
        print("Error reading DOCX: python-docx is not installed")
        return None
    try:
        doc = Document(file_path)
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    except Exception as e:
        print(f"Error reading DOCX: {e}")
        return None
    # Every kept paragraph is non-blank, so the join is empty only when the
    # list is; report that as None like the other extractors do.
    return "\n".join(paragraphs) or None


def extract_text_from_pptx(file_path: str) -> Optional[str]:
    """Extract the text of a PowerPoint (PPTX) file.

    Args:
        file_path: Path of the PPTX file to read.

    Returns:
        The text of every shape that exposes a ``text`` attribute, in slide
        order and joined with newlines, or None when no such shape exists,
        python-pptx is not installed, or the file cannot be read (the error
        is printed).
    """
    if pptx is None:
        print("Error reading PPTX: python-pptx is not installed")
        return None
    try:
        presentation = pptx.Presentation(file_path)
        texts = []
        for slide in presentation.slides:
            # Shapes without a ``text`` attribute are skipped.
            texts.extend(
                shape.text for shape in slide.shapes if hasattr(shape, "text")
            )
    except Exception as e:
        print(f"Error reading PPTX: {e}")
        return None
    return "\n".join(texts) if texts else None


def extract_text_from_document(file_path: str) -> Optional[str]:
    """Extract text from any supported document (PDF/DOCX/PPTX/TXT).

    The format is chosen from the file name's extension, case-insensitively.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or None when the file does not exist, the format
        is unsupported, or reading fails (a message is printed in each case).
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered.endswith('.pptx'):
        return extract_text_from_pptx(file_path)
    if lowered.endswith('.txt'):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            print(f"Error reading TXT: {e}")
            return None

    print(f"Unsupported file format: {file_path}")
    return None