import torch
import numpy as np
import re
from typing import Dict, List, Any
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from preprocessor import preprocess_for_summarization


class BERTExtractiveSummarizer:
    def __init__(self, model_name='aubmindlab/bert-base-arabertv02'):
        """Initialize BERT-based Arabic summarizer."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        
        # Load tokenizer and model
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
    
    def get_sentence_embeddings(self, sentences: List[str]) -> np.ndarray:
        """Get BERT embeddings for sentences."""
        embeddings = []
        
        with torch.no_grad():
            for sentence in sentences:
                # Tokenize
                inputs = self.tokenizer(
                    sentence,
                    return_tensors='pt',
                    max_length=512,
                    truncation=True,
                    padding=True
                ).to(self.device)
                
                # Get embeddings
                outputs = self.model(**inputs)
                # Use the final hidden state of the [CLS] token as the
                # sentence representation
                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(embedding.squeeze())
        
        return np.array(embeddings)
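
    # The loop above runs one forward pass per sentence. As a hypothetical
    # optimization sketch (not part of the original interface), the same
    # embeddings can be computed batch-wise, which is usually much faster on
    # GPU; the attention mask keeps padding from affecting the [CLS] output,
    # so the result matches get_sentence_embeddings up to numerical precision.
    def get_sentence_embeddings_batched(self, sentences: List[str],
                                        batch_size: int = 16) -> np.ndarray:
        """Batched sketch of get_sentence_embeddings: same embeddings, fewer forward passes."""
        embeddings = []
        with torch.no_grad():
            for start in range(0, len(sentences), batch_size):
                batch = sentences[start:start + batch_size]
                inputs = self.tokenizer(
                    batch,
                    return_tensors='pt',
                    max_length=512,
                    truncation=True,
                    padding=True  # pad to the longest sentence in the batch
                ).to(self.device)
                outputs = self.model(**inputs)
                # [CLS] embedding for every sentence in the batch
                embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
        return np.concatenate(embeddings, axis=0)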
    
    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """
        Summarize Arabic text using BERT extractive summarization.
        Returns the same structure as other summarizers for consistency.
        """
        print(f"BERT Summarizer: Processing text with {len(text)} characters")
        
        # Use the same preprocessing as TF-IDF for fair comparison
        cleaned_text = preprocess_for_summarization(text)
        print(f"BERT Summarizer: After preprocessing: '{cleaned_text[:100]}...'")
        
        # Split into sentences on '.', '!', the Arabic question mark '؟',
        # and newlines - same approach as the TF-IDF summarizer
        sentences = re.split(r'[.!؟\n]+', cleaned_text)
        sentences = [s.strip() for s in sentences if s.strip()]  # Same as TF-IDF
        
        print(f"BERT Summarizer: Found {len(sentences)} sentences")
        original_sentence_count = len(sentences)
        
        # If we have fewer sentences than requested, return all
        if len(sentences) <= num_sentences:
            print(f"BERT Summarizer: Returning all {len(sentences)} sentences (fewer than requested)")
            return {
                "summary": cleaned_text.strip(),  # Use cleaned text like TF-IDF
                "original_sentence_count": original_sentence_count,
                "summary_sentence_count": len(sentences),
                "sentences": sentences,
                "selected_indices": list(range(len(sentences))),
                "sentence_scores": [1.0] * len(sentences)  # All sentences selected
            }
        
        print("BERT Summarizer: Getting sentence embeddings...")
        # Get sentence embeddings
        sentence_embeddings = self.get_sentence_embeddings(sentences)
        print(f"BERT Summarizer: Got embeddings shape: {sentence_embeddings.shape}")
        
        # Document embedding: the mean of all sentence embeddings
        doc_embedding = np.mean(sentence_embeddings, axis=0)
        
        # Score each sentence by its cosine similarity to the document embedding
        similarities = cosine_similarity(doc_embedding.reshape(1, -1), sentence_embeddings)[0]
        print(f"BERT Summarizer: Similarity scores: {similarities}")
        
        # np.argsort returns ascending order, so the last num_sentences
        # indices are the highest-scoring sentences
        top_indices = np.argsort(similarities)[-num_sentences:]
        print(f"BERT Summarizer: Top indices: {top_indices}")
        
        # Sort the indices to preserve original sentence order in the summary,
        # converting numpy ints to Python ints for JSON serialization
        top_indices_sorted = sorted(int(i) for i in top_indices)
        print(f"BERT Summarizer: Selected indices (in order): {top_indices_sorted}")
        
        # Get selected sentences and their scores
        selected_sentences = [sentences[i] for i in top_indices_sorted]
        selected_scores = [float(similarities[i]) for i in top_indices_sorted]
        
        print(f"BERT Summarizer: Selected sentences: {[s[:50] + '...' for s in selected_sentences]}")
        
        # Create summary by joining selected sentences
        summary = ' '.join(selected_sentences)
        
        return {
            "summary": summary,
            "original_sentence_count": original_sentence_count,
            "summary_sentence_count": len(selected_sentences),
            "sentences": sentences,  # All original sentences
            "selected_indices": top_indices_sorted,
            "sentence_scores": selected_scores,
            "top_sentence_scores": selected_scores  # Additional info
        }
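

if __name__ == '__main__':
    # Minimal usage sketch. The sample text below is a placeholder; this
    # assumes preprocessor.py is on the import path and that the AraBERT
    # weights can be fetched from the Hugging Face hub.
    summarizer = BERTExtractiveSummarizer()
    sample_text = (
        "الذكاء الاصطناعي يغير طريقة عملنا بشكل متسارع. "
        "تستخدم الشركات التعلم الآلي لتحليل كميات كبيرة من البيانات. "
        "يعد تلخيص النصوص من أهم تطبيقات معالجة اللغات الطبيعية. "
        "تساعد نماذج مثل بيرت في فهم معنى الجمل واختيار أهمها."
    )
    result = summarizer.summarize(sample_text, num_sentences=2)
    print(result["summary"])
    print(result["selected_indices"], result["sentence_scores"])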