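"""Arabic text preprocessing utilities for classification and summarization.

Covers orthographic normalization, diacritic and punctuation removal, stopword
filtering, and ISRI stemming, plus an ArabicPreprocessor class that exposes
step-by-step traces and basic text statistics.
"""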
import re
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

# Requires the NLTK stopwords corpus; run nltk.download("stopwords") once.
arabic_stopwords = set(stopwords.words("arabic"))
stemmer = ISRIStemmer()

# Orthographic normalization: unify alef variants, map alef maqsura to ya and
# ta marbuta to ha, fold hamza carriers to their base letters, drop tatweel.
char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)

diacritics_pattern = re.compile(r"[\u064B-\u0652]")  # harakat, tanween, shadda, sukun
punctuation_pattern = re.compile(r"[^\w\s]")  # anything that is not a word char or whitespace
whitespace_pattern = re.compile(r"\s+")
repeated_char_pattern = re.compile(r"(.)\1+")  # runs of the same character (elongation)
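
# Quick illustrations of the table and patterns above:
#   "إسلامية".translate(char_map)                -> "اسلاميه"
#   repeated_char_pattern.sub(r"\1", "رااااائع")  -> "رائع"
# Collapsing repeats is safe for Arabic, which writes doubled consonants with
# shadda rather than repeated letters, but it also collapses legitimate doubles
# in Latin text ("bb" -> "b").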


def normalize_arabic(text: str) -> str:
    """Normalize Arabic characters."""
    return text.translate(char_map)


def remove_diacritics(text: str) -> str:
    """Remove Arabic diacritics."""
    return diacritics_pattern.sub("", text)


def remove_punctuation(text: str) -> str:
    """Remove punctuation marks."""
    return punctuation_pattern.sub(" ", text)


def reduce_repeated_characters(text: str) -> str:
    """Reduce repeated characters to single occurrence."""
    return repeated_char_pattern.sub(r"\1", text)


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Remove Arabic stopwords from tokens."""
    return [word for word in tokens if word not in arabic_stopwords]


def stem_tokens(tokens: list[str]) -> list[str]:
    """Apply ISRI stemming to tokens."""
    return [stemmer.stem(token) for token in tokens]


def preprocess_for_classification(text: str) -> str:
    """Preprocess text for classification: normalize, clean, tokenize, stem."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()  # lower() is a no-op for Arabic; handles mixed Latin text
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuation(text)
    text = reduce_repeated_characters(text)
    text = whitespace_pattern.sub(" ", text).strip()
    text = re.sub(r"\d+", "", text)  # \d is Unicode-aware, so Arabic-Indic digits go too
    tokens = text.split()
    # Stopwords are matched after normalization, so list entries spelled with
    # the original characters (e.g. ta marbuta) may slip through.
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return " ".join(tokens)
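
# Illustrative sketch (exact output depends on NLTK's stopword list and the
# ISRI stemmer): preprocess_for_classification("الطلابُ يذهبونَ إلى المدرسةِ!!")
# yields a space-joined string of stemmed, stopword-free tokens.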


def preprocess_for_summarization(text: str) -> str:
    """Light preprocessing for summarization: remove diacritics and numbers."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = remove_diacritics(text)
    text = whitespace_pattern.sub(" ", text).strip()
    return re.sub(r"\d+", "", text)


class ArabicPreprocessor:
    """Arabic text preprocessor with analysis capabilities."""
    
    def __init__(self):
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map
    
    def preprocess_for_classification(self, text: str) -> str:
        """Preprocess text for classification."""
        return preprocess_for_classification(text)
    
    def preprocess_for_summarization(self, text: str) -> str:
        """Preprocess text for summarization."""
        return preprocess_for_summarization(text)
    
    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
        """Get detailed preprocessing steps for analysis."""
        current = text.strip().lower()
        steps = {
            "original": text,
            "stripped_lowered": current,
        }

        if task_type == "classification":
            # Each stage is computed once and recorded, in pipeline order.
            current = normalize_arabic(current)
            steps["normalized"] = current

            current = remove_diacritics(current)
            steps["diacritics_removed"] = current

            current = remove_punctuation(current)
            steps["punctuation_removed"] = current

            current = reduce_repeated_characters(current)
            steps["repeated_chars_reduced"] = current

            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current

            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current

            tokens = current.split()
            steps["tokenized"] = tokens

            tokens_no_stop = remove_stopwords(tokens)
            steps["stopwords_removed"] = tokens_no_stop

            stemmed_tokens = stem_tokens(tokens_no_stop)
            steps["stemmed"] = stemmed_tokens

            steps["final"] = " ".join(stemmed_tokens)

        elif task_type == "summarization":
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current

            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current

            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            steps["final"] = current
        
        return steps
    
    def analyze_text(self, text: str) -> dict:
        """Analyze text characteristics and statistics."""
        # Split on sentence terminators (including the Arabic question mark) and newlines.
        original_sentences = re.split(r"[.!؟\n]+", text)
        original_sentences = [s.strip() for s in original_sentences if s.strip()]
        
        tokens = text.split()
        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        
        return {
            "character_count": len(text),
            "word_count": len(tokens),
            "sentence_count": len(original_sentences),
            "arabic_character_count": arabic_chars,
            "arabic_character_ratio": arabic_chars / len(text) if len(text) > 0 else 0,
            "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
            "average_sentence_length": len(tokens) / len(original_sentences) if original_sentences else 0,
            "has_diacritics": bool(re.search(r'[\u064B-\u0652]', text)),
            "punctuation_count": len(re.findall(r'[^\w\s]', text))
        }
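

# Minimal usage sketch: the sample sentence is illustrative, the stopwords
# corpus must be installed, and exact stems depend on NLTK's ISRI stemmer.
if __name__ == "__main__":
    sample = "المدرسةُ جميلةٌ جداً والطلاب يحبونهااا!!!"
    preprocessor = ArabicPreprocessor()
    print(preprocessor.preprocess_for_classification(sample))
    print(preprocessor.preprocess_for_summarization(sample))
    for step, value in preprocessor.get_preprocessing_steps(sample).items():
        print(f"{step}: {value}")
    print(preprocessor.analyze_text(sample))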