File size: 2,282 Bytes
70cb71f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import fitz  # pymupdf
from docx import Document
import pptx
import os
from typing import Optional

def extract_text_from_pdf(file_path: str) -> Optional[str]:
    """Extract the text of a PDF file using PyMuPDF (``fitz``).

    The original author's note says PyMuPDF was picked because it is
    faster than tika.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages with leading/trailing
        whitespace removed, or ``None`` when the document yields no text
        or cannot be read (the error is printed, not raised).
    """
    try:
        # The context manager closes the document (and its file handle)
        # even when extracting a page raises.
        with fitz.open(file_path) as doc:
            # join() avoids repeated string concatenation across pages.
            text = "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort boundary: report the failure and signal "no text".
        print(f"Error reading PDF: {e}")
        return None

    # Whitespace-only output is treated the same as an empty document.
    return text.strip() or None

def extract_text_from_docx(file_path: str) -> Optional[str]:
    """Extract the text of a Word (DOCX) file.

    Only the document's paragraphs are read; paragraphs whose text is
    blank are skipped and the rest are joined with newlines.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        The newline-joined paragraph text, or ``None`` when the document
        has no non-blank paragraph or cannot be read (the error is
        printed, not raised).
    """
    try:
        doc = Document(file_path)
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    except Exception as e:
        # Best-effort boundary: report the failure and signal "no text".
        print(f"Error reading DOCX: {e}")
        return None

    # Return None (not "") for an empty document, matching the PDF and
    # PPTX extractors in this module.
    return "\n".join(paragraphs) if paragraphs else None

def extract_text_from_pptx(file_path: str) -> Optional[str]:
    """Extract the text of a PowerPoint (PPTX) file.

    Walks every shape of every slide and collects the ``text`` of each
    shape that exposes one, joined with newlines.

    Args:
        file_path: Path of the PPTX file to read.

    Returns:
        The newline-joined shape text, or ``None`` when no shape carries
        any non-blank text or the file cannot be read (the error is
        printed, not raised).
    """
    try:
        presentation = pptx.Presentation(file_path)
        chunks = []
        for slide in presentation.slides:
            for shape in slide.shapes:
                # Not every shape type exposes a ``text`` attribute.
                if not hasattr(shape, "text"):
                    continue
                shape_text = shape.text
                # Skip blank shapes so that a deck made only of empty
                # placeholders yields None instead of "" / bare newlines.
                if shape_text.strip():
                    chunks.append(shape_text)
    except Exception as e:
        # Best-effort boundary: report the failure and signal "no text".
        print(f"Error reading PPTX: {e}")
        return None

    return "\n".join(chunks) if chunks else None

def extract_text_from_document(file_path: str) -> Optional[str]:
    """Extract text from a document, dispatching on its file extension.

    Supported extensions (matched case-insensitively) are ``.pdf``,
    ``.docx``, ``.pptx`` and ``.txt``. Plain-text files are read as UTF-8.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, or ``None`` when the path does not exist, the
        extension is unsupported, or reading fails (a message is printed
        in each of those cases).
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    # Lowercase once so every extension check is case-insensitive.
    lowered_name = file_path.lower()

    if lowered_name.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered_name.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered_name.endswith('.pptx'):
        return extract_text_from_pptx(file_path)

    if not lowered_name.endswith('.txt'):
        print(f"Unsupported file format: {file_path}")
        return None

    # Plain text: read the whole file, reporting any I/O or decode error.
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except Exception as e:
        print(f"Error reading TXT: {e}")
        return None
    return contents