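"""Arabic text preprocessing utilities for classification and summarization.

Covers orthographic normalization, diacritic and punctuation removal, stopword
filtering, and ISRI stemming, plus an ArabicPreprocessor class that exposes
step-by-step traces and basic text statistics.
"""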
import re
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

# Requires the NLTK stopwords corpus; run nltk.download("stopwords") once.
arabic_stopwords = set(stopwords.words("arabic"))
stemmer = ISRIStemmer()

# Orthographic normalization: unify alef variants, map alef maqsura to ya and
# ta marbuta to ha, fold hamza carriers to their base letters, drop tatweel.
char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)

diacritics_pattern = re.compile(r"[\u064B-\u0652]")  # harakat, tanween, shadda, sukun
punctuation_pattern = re.compile(r"[^\w\s]")  # anything that is not a word char or whitespace
whitespace_pattern = re.compile(r"\s+")
repeated_char_pattern = re.compile(r"(.)\1+")  # runs of the same character (elongation)
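
# Quick illustrations of the table and patterns above:
#   "إسلامية".translate(char_map)                -> "اسلاميه"
#   repeated_char_pattern.sub(r"\1", "رااااائع")  -> "رائع"
# Collapsing repeats is safe for Arabic, which writes doubled consonants with
# shadda rather than repeated letters, but it also collapses legitimate doubles
# in Latin text ("bb" -> "b").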


def normalize_arabic(text: str) -> str:
    """Normalize Arabic characters."""
    return text.translate(char_map)


def remove_diacritics(text: str) -> str:
    """Remove Arabic diacritics."""
    return diacritics_pattern.sub("", text)


def remove_punctuation(text: str) -> str:
    """Remove punctuation marks."""
    return punctuation_pattern.sub(" ", text)


def reduce_repeated_characters(text: str) -> str:
    """Reduce repeated characters to single occurrence."""
    return repeated_char_pattern.sub(r"\1", text)


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Remove Arabic stopwords from tokens."""
    return [word for word in tokens if word not in arabic_stopwords]


def stem_tokens(tokens: list[str]) -> list[str]:
    """Apply ISRI stemming to tokens."""
    return [stemmer.stem(token) for token in tokens]


def preprocess_for_classification(text: str) -> str:
    """Preprocess text for classification: normalize, clean, tokenize, stem."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()  # lower() is a no-op for Arabic; handles mixed Latin text
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuation(text)
    text = reduce_repeated_characters(text)
    text = whitespace_pattern.sub(" ", text).strip()
    text = re.sub(r"\d+", "", text)  # \d is Unicode-aware, so Arabic-Indic digits go too
    tokens = text.split()
    # Stopwords are matched after normalization, so list entries spelled with
    # the original characters (e.g. ta marbuta) may slip through.
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return " ".join(tokens)
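
# Illustrative sketch (exact output depends on NLTK's stopword list and the
# ISRI stemmer): preprocess_for_classification("الطلابُ يذهبونَ إلى المدرسةِ!!")
# yields a space-joined string of stemmed, stopword-free tokens.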


def preprocess_for_summarization(text: str) -> str:
    """Light preprocessing for summarization: remove diacritics and numbers."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = remove_diacritics(text)
    text = whitespace_pattern.sub(" ", text).strip()
    return re.sub(r"\d+", "", text)


class ArabicPreprocessor:
    """Arabic text preprocessor with analysis capabilities."""
    
    def __init__(self):
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map
    
    def preprocess_for_classification(self, text: str) -> str:
        """Preprocess text for classification."""
        return preprocess_for_classification(text)
    
    def preprocess_for_summarization(self, text: str) -> str:
        """Preprocess text for summarization."""
        return preprocess_for_summarization(text)
    
    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
        """Get detailed preprocessing steps for analysis."""
        current = text.strip().lower()
        steps = {
            "original": text,
            "stripped_lowered": current,
        }

        if task_type == "classification":
            # Each stage is computed once and recorded, in pipeline order.
            current = normalize_arabic(current)
            steps["normalized"] = current

            current = remove_diacritics(current)
            steps["diacritics_removed"] = current

            current = remove_punctuation(current)
            steps["punctuation_removed"] = current

            current = reduce_repeated_characters(current)
            steps["repeated_chars_reduced"] = current

            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current

            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current

            tokens = current.split()
            steps["tokenized"] = tokens

            tokens_no_stop = remove_stopwords(tokens)
            steps["stopwords_removed"] = tokens_no_stop

            stemmed_tokens = stem_tokens(tokens_no_stop)
            steps["stemmed"] = stemmed_tokens

            steps["final"] = " ".join(stemmed_tokens)

        elif task_type == "summarization":
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current

            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current

            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            steps["final"] = current
        
        return steps
    
    def analyze_text(self, text: str) -> dict:
        """Analyze text characteristics and statistics."""
        # Split on sentence terminators (including the Arabic question mark) and newlines.
        original_sentences = re.split(r"[.!؟\n]+", text)
        original_sentences = [s.strip() for s in original_sentences if s.strip()]
        
        tokens = text.split()
        arabic_chars = len(re.findall(r'[\u0600-\u06FF]', text))
        
        return {
            "character_count": len(text),
            "word_count": len(tokens),
            "sentence_count": len(original_sentences),
            "arabic_character_count": arabic_chars,
            "arabic_character_ratio": arabic_chars / len(text) if len(text) > 0 else 0,
            "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
            "average_sentence_length": len(tokens) / len(original_sentences) if original_sentences else 0,
            "has_diacritics": bool(re.search(r'[\u064B-\u0652]', text)),
            "punctuation_count": len(re.findall(r'[^\w\s]', text))
        }
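

# Minimal usage sketch: the sample sentence is illustrative, the stopwords
# corpus must be installed, and exact stems depend on NLTK's ISRI stemmer.
if __name__ == "__main__":
    sample = "المدرسةُ جميلةٌ جداً والطلاب يحبونهااا!!!"
    preprocessor = ArabicPreprocessor()
    print(preprocessor.preprocess_for_classification(sample))
    print(preprocessor.preprocess_for_summarization(sample))
    for step, value in preprocessor.get_preprocessing_steps(sample).items():
        print(f"{step}: {value}")
    print(preprocessor.analyze_text(sample))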