"""Arabic text preprocessing utilities for classification and summarization."""

import re

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

# Requires the NLTK stopwords corpus; run nltk.download("stopwords") once if missing.
arabic_stopwords = set(stopwords.words("arabic"))
stemmer = ISRIStemmer()

# Character-level normalization: unify alef variants, map alef maqsura to yaa,
# taa marbuta to haa, normalize hamza carriers, and strip tatweel (kashida).
char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)

diacritics_pattern = re.compile(r"[\u064B-\u0652]")  # Arabic tashkeel marks
punctuation_pattern = re.compile(r"[^\w\s]")  # neither word character nor whitespace
whitespace_pattern = re.compile(r"\s+")
repeated_char_pattern = re.compile(r"(.)\1+")  # runs of the same character
def normalize_arabic(text: str) -> str:
    """Normalize Arabic characters."""
    return text.translate(char_map)


def remove_diacritics(text: str) -> str:
    """Remove Arabic diacritics."""
    return diacritics_pattern.sub("", text)


def remove_punctuation(text: str) -> str:
    """Replace punctuation marks with spaces."""
    return punctuation_pattern.sub(" ", text)


def reduce_repeated_characters(text: str) -> str:
    """Reduce character runs to a single occurrence, e.g. 'جمييييل' -> 'جميل'."""
    return repeated_char_pattern.sub(r"\1", text)


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Remove Arabic stopwords from tokens."""
    return [word for word in tokens if word not in arabic_stopwords]


def stem_tokens(tokens: list[str]) -> list[str]:
    """Apply ISRI (root-based) stemming to tokens."""
    return [stemmer.stem(token) for token in tokens]
def preprocess_for_classification(text: str) -> str:
    """Preprocess text for classification: normalize, clean, tokenize, stem."""
    text = text.strip().lower()
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuation(text)
    text = reduce_repeated_characters(text)
    text = whitespace_pattern.sub(" ", text).strip()
    text = re.sub(r"\d+", "", text)
    tokens = text.split()
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return " ".join(tokens)
def preprocess_for_summarization(text: str) -> str:
    """Light preprocessing for summarization: remove diacritics and numbers."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = remove_diacritics(text)
    text = whitespace_pattern.sub(" ", text).strip()
    return re.sub(r"\d+", "", text)
class ArabicPreprocessor:
    """Arabic text preprocessor with analysis capabilities."""

    def __init__(self):
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map

    def preprocess_for_classification(self, text: str) -> str:
        """Preprocess text for classification."""
        return preprocess_for_classification(text)

    def preprocess_for_summarization(self, text: str) -> str:
        """Preprocess text for summarization."""
        return preprocess_for_summarization(text)
    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
        """Get detailed preprocessing steps for analysis."""
        current = text.strip().lower()
        steps = {"original": text, "stripped_lowered": current}
        if task_type == "classification":
            current = normalize_arabic(current)
            steps["normalized"] = current
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current
            current = remove_punctuation(current)
            steps["punctuation_removed"] = current
            current = reduce_repeated_characters(current)
            steps["repeated_chars_reduced"] = current
            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current
            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            tokens = current.split()
            steps["tokenized"] = tokens
            tokens = remove_stopwords(tokens)
            steps["stopwords_removed"] = tokens
            stemmed_tokens = stem_tokens(tokens)
            steps["stemmed"] = stemmed_tokens
            steps["final"] = " ".join(stemmed_tokens)
        elif task_type == "summarization":
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current
            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current
            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            steps["final"] = current
        return steps
    def analyze_text(self, text: str) -> dict:
        """Analyze text characteristics and statistics."""
        sentences = re.split(r"[.!?؟\n]+", text)
        sentences = [s.strip() for s in sentences if s.strip()]
        tokens = text.split()
        arabic_chars = len(re.findall(r"[\u0600-\u06FF]", text))
        return {
            "character_count": len(text),
            "word_count": len(tokens),
            "sentence_count": len(sentences),
            "arabic_character_count": arabic_chars,
            "arabic_character_ratio": arabic_chars / len(text) if text else 0,
            "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
            "average_sentence_length": len(tokens) / len(sentences) if sentences else 0,
            "has_diacritics": bool(diacritics_pattern.search(text)),
            "punctuation_count": len(punctuation_pattern.findall(text)),
        }
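

if __name__ == "__main__":
    # Minimal usage sketch, assuming NLTK's Arabic stopwords corpus is installed.
    # The sample sentence below is illustrative only, not part of the original module:
    # it exercises repeated-character reduction, diacritic removal, and digit stripping.
    sample = "الطقس جمييييل جداً اليوم! درجة الحرارة 25."
    preprocessor = ArabicPreprocessor()
    print(preprocessor.preprocess_for_classification(sample))
    print(preprocessor.preprocess_for_summarization(sample))
    print(preprocessor.analyze_text(sample))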