File size: 20,438 Bytes
dff1572
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
cf24ede
 
 
 
 
 
 
 
dff1572
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
 
 
 
 
 
 
cf24ede
dff1572
 
 
 
cf24ede
 
 
 
 
 
dff1572
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
cf24ede
dff1572
cf24ede
dff1572
cf24ede
 
 
 
dff1572
cf24ede
 
 
 
dff1572
cf24ede
dff1572
cf24ede
 
 
 
dff1572
cf24ede
 
 
 
 
dff1572
cf24ede
dff1572
cf24ede
dff1572
cf24ede
 
dff1572
cf24ede
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
cf24ede
 
 
 
 
 
dff1572
 
 
cf24ede
dff1572
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
 
dff1572
cf24ede
 
 
 
 
 
dff1572
cf24ede
 
 
 
 
dff1572
 
 
 
 
 
 
 
cf24ede
 
 
 
 
dff1572
 
 
 
 
 
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
dff1572
cf24ede
 
 
dff1572
 
 
 
 
 
 
 
cf24ede
dff1572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
cf24ede
dff1572
 
 
 
cf24ede
dff1572
 
 
7670c91
dff1572
 
 
 
 
 
 
 
 
 
 
 
64ff698
 
 
 
 
 
 
 
dff1572
 
 
 
 
 
cf24ede
dff1572
 
cf24ede
dff1572
 
 
 
cf24ede
dff1572
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
 
 
cf24ede
 
 
 
 
 
 
dff1572
cf24ede
 
 
 
 
 
dff1572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
 
 
 
 
 
 
 
cf24ede
 
 
dff1572
cf24ede
dff1572
 
 
 
 
cf24ede
 
 
 
 
 
 
 
 
 
dff1572
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
# File: features/feature_engineer.py
# Enhanced Feature Engineering Pipeline for Priority 6

import json
import joblib
import logging
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime
from scipy.sparse import hstack, csr_matrix
from typing import Dict, List, Any, Optional, Tuple

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.preprocessing import StandardScaler, FunctionTransformer

import warnings
# NOTE(review): with no category/module argument this installs a global filter
# that silences every warning for the whole process, not just this module.
# Importing this file therefore changes warning behavior for all callers.
warnings.filterwarnings('ignore')

# Import feature analyzers
# Project-local extractors; each is instantiated by AdvancedFeatureEngineer
# when the matching enable_* flag is set.
from features.sentiment_analyzer import SentimentAnalyzer
from features.readability_analyzer import ReadabilityAnalyzer
from features.entity_analyzer import EntityAnalyzer
from features.linguistic_analyzer import LinguisticAnalyzer

# Configure logging
# NOTE(review): basicConfig runs at import time and configures the ROOT logger
# (it is a no-op if the root logger already has handlers). Applications that
# set up their own logging should do so before importing this module.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)


class AdvancedFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Advanced feature engineering pipeline combining multiple NLP feature extractors
    for enhanced fake news detection performance.

    Columns are always laid out as ``[TF-IDF columns | analyzer columns]``.
    When labels are given to ``fit`` and there are more columns than
    ``feature_selection_k``, a chi-squared ``SelectKBest`` prunes the columns.
    Whichever analyzer columns survive are standardized with a
    ``StandardScaler``; TF-IDF columns are never rescaled.

    ``transform`` always returns a dense ``numpy`` array.
    """

    # (log label, attribute name) for each optional analyzer, in the order
    # their columns are stacked after the TF-IDF block.
    _ANALYZER_SLOTS = (
        ("sentiment", "sentiment_analyzer"),
        ("readability", "readability_analyzer"),
        ("entity", "entity_analyzer"),
        ("linguistic", "linguistic_analyzer"),
    )

    def __init__(self,
                 enable_sentiment: bool = True,
                 enable_readability: bool = True,
                 enable_entities: bool = True,
                 enable_linguistic: bool = True,
                 feature_selection_k: int = 5000,
                 tfidf_max_features: int = 10000,
                 ngram_range: Tuple[int, int] = (1, 3),
                 min_df: int = 2,
                 max_df: float = 0.95):
        """
        Initialize the advanced feature engineering pipeline.

        Args:
            enable_sentiment: Enable sentiment analysis features
            enable_readability: Enable readability/complexity features
            enable_entities: Enable named entity recognition features
            enable_linguistic: Enable advanced linguistic features
            feature_selection_k: Number of features to select
            tfidf_max_features: Maximum TF-IDF features
            ngram_range: N-gram range for TF-IDF
            min_df: Minimum document frequency for TF-IDF
            max_df: Maximum document frequency for TF-IDF
        """
        self.enable_sentiment = enable_sentiment
        self.enable_readability = enable_readability
        self.enable_entities = enable_entities
        self.enable_linguistic = enable_linguistic
        self.feature_selection_k = feature_selection_k
        self.tfidf_max_features = tfidf_max_features
        self.ngram_range = ngram_range
        self.min_df = min_df
        self.max_df = max_df

        # Initialize feature extractors (None when the feature family is disabled)
        self.sentiment_analyzer = SentimentAnalyzer() if enable_sentiment else None
        self.readability_analyzer = ReadabilityAnalyzer() if enable_readability else None
        self.entity_analyzer = EntityAnalyzer() if enable_entities else None
        self.linguistic_analyzer = LinguisticAnalyzer() if enable_linguistic else None

        # Components created during fit()
        self.tfidf_vectorizer = None
        self.feature_selector = None
        self.feature_scaler = None

        # Feature metadata
        self.feature_names_ = []
        self.feature_importance_ = {}
        self.is_fitted_ = False

        # Column bookkeeping recorded by fit() and checked by transform()
        self.n_tfidf_features_ = 0        # width of the TF-IDF block
        self.n_additional_features_ = 0   # width of the analyzer block before selection
        self.n_scaled_features_ = 0       # analyzer columns that survive selection

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_array(X):
        """Convert a pandas Series or list to a numpy array; pass anything else through."""
        if isinstance(X, pd.Series):
            return X.values
        if isinstance(X, list):
            return np.array(X)
        return X

    @staticmethod
    def _to_dense(matrix):
        """Return a dense array for sparse input; dense input is returned unchanged."""
        return matrix.toarray() if hasattr(matrix, 'toarray') else matrix

    @staticmethod
    def _stack(tfidf_features, additional_features):
        """Place analyzer columns to the right of the TF-IDF columns.

        The result is requested in CSR format because the default COO result
        of ``scipy.sparse.hstack`` does not support the column slicing done
        later in ``fit`` and ``transform``.
        """
        if additional_features.shape[1] > 0:
            return hstack([tfidf_features, additional_features], format='csr')
        return tfidf_features

    def _active_analyzers(self):
        """Return ``(label, analyzer)`` pairs for the enabled analyzers, in column order."""
        candidates = ((label, getattr(self, attr)) for label, attr in self._ANALYZER_SLOTS)
        return [(label, analyzer) for label, analyzer in candidates if analyzer is not None]

    # ------------------------------------------------------------------
    # Estimator API
    # ------------------------------------------------------------------
    def fit(self, X, y=None):
        """
        Fit the feature engineering pipeline.

        Args:
            X: Text data (array-like of strings)
            y: Target labels (optional, for supervised feature selection)

        Returns:
            self

        Raises:
            ValueError: If X is empty.
        """
        logger.info("Fitting advanced feature engineering pipeline...")

        X = self._to_array(X)

        # Validate input
        if len(X) == 0:
            raise ValueError("Cannot fit on empty data")

        # Drop everything learned by a previous fit so that a refit (for
        # example one without labels) cannot keep using a stale selector,
        # scaler or importance table.
        self.feature_selector = None
        self.feature_scaler = None
        self.feature_names_ = []
        self.feature_importance_ = {}
        self.is_fitted_ = False

        # Initialize TF-IDF vectorizer
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=self.tfidf_max_features,
            ngram_range=self.ngram_range,
            min_df=self.min_df,
            max_df=self.max_df,
            stop_words='english',
            sublinear_tf=True,
            norm='l2',
            lowercase=True
        )

        # Fit TF-IDF on text data
        logger.info("Fitting TF-IDF vectorizer...")
        tfidf_features = self.tfidf_vectorizer.fit_transform(X)

        # Extract additional features (zero columns if extraction failed)
        additional_features = self._extract_additional_features(X, fit=True)

        self.n_tfidf_features_ = tfidf_features.shape[1]
        self.n_additional_features_ = additional_features.shape[1]

        # Combine all features
        all_features = self._stack(tfidf_features, additional_features)
        logger.info(f"Total features before selection: {all_features.shape[1]}")

        # Feature selection
        if y is not None and self.feature_selection_k < all_features.shape[1]:
            logger.info(f"Performing feature selection (k={self.feature_selection_k})...")

            self.feature_selector = SelectKBest(
                score_func=chi2,
                k=self.feature_selection_k
            )

            # chi2 requires non-negative input, so negative analyzer values
            # are clamped to zero. transform() applies the same clamp, which
            # means the selected output carries the clamped values.
            features_dense = np.maximum(self._to_dense(all_features), 0)

            self.feature_selector.fit(features_dense, y)
            selected_features = self.feature_selector.transform(features_dense)

            # Selection keeps column order, so the analyzer columns that
            # survive are exactly the trailing columns of the selected matrix.
            support = self.feature_selector.get_support()
            n_scaled = int(support[self.n_tfidf_features_:].sum())

            logger.info(f"Selected {selected_features.shape[1]} features")
        else:
            selected_features = all_features
            n_scaled = self.n_additional_features_

        self.n_scaled_features_ = n_scaled

        # Scale numerical features (surviving analyzer columns only)
        if n_scaled > 0:
            split = selected_features.shape[1] - n_scaled
            self.feature_scaler = StandardScaler()
            # Densify: StandardScaler with centering rejects sparse input.
            self.feature_scaler.fit(self._to_dense(selected_features[:, split:]))

        # Generate feature names
        self._generate_feature_names()

        # Calculate feature importance if possible
        if y is not None and self.feature_selector is not None:
            self._calculate_feature_importance()

        self.is_fitted_ = True
        logger.info("Feature engineering pipeline fitted successfully")

        return self

    def transform(self, X):
        """
        Transform text data into enhanced feature vectors.

        Args:
            X: Text data (array-like of strings)

        Returns:
            Dense feature matrix with the column layout learned by ``fit``.

        Raises:
            ValueError: If the pipeline is not fitted, or if the analyzers
                produce a different number of columns than they did during fit.
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted before transforming")

        X = self._to_array(X)

        # Extract TF-IDF features
        tfidf_features = self.tfidf_vectorizer.transform(X)

        # Extract additional features
        additional_features = self._extract_additional_features(X, fit=False)

        # A width mismatch (typically an analyzer failing on one side only)
        # would otherwise produce a silently misaligned or wrongly sized
        # matrix. Instances pickled before this attribute existed skip the check.
        expected_width = getattr(self, 'n_additional_features_', None)
        if expected_width is not None and additional_features.shape[1] != expected_width:
            raise ValueError(
                f"Expected {expected_width} additional features (as seen during fit) "
                f"but extracted {additional_features.shape[1]}"
            )

        # Combine features
        all_features = self._stack(tfidf_features, additional_features)

        # Apply feature selection
        if self.feature_selector is not None:
            # Same non-negative clamp that fit() applied before scoring.
            features_dense = np.maximum(self._to_dense(all_features), 0)
            selected_features = self.feature_selector.transform(features_dense)
        else:
            selected_features = all_features

        final_features = self._to_dense(selected_features)

        # Legacy pickles have no n_scaled_features_; fall back to the old
        # "last N analyzer columns" rule so they behave as when they were saved.
        n_scaled = getattr(self, 'n_scaled_features_', additional_features.shape[1])

        if self.feature_scaler is not None and n_scaled > 0:
            # max() guards the legacy case where fewer columns were selected
            # than analyzer columns exist.
            split = max(final_features.shape[1] - n_scaled, 0)
            scaled_tail = self.feature_scaler.transform(final_features[:, split:])
            final_features = np.hstack([final_features[:, :split], scaled_tail])

        return final_features

    def _extract_additional_features(self, X, fit=False):
        """Extract additional features beyond TF-IDF.

        Runs every enabled analyzer (``fit_transform`` when ``fit`` is true,
        otherwise ``transform``) and stacks the results column-wise.

        Extraction is best-effort: if any analyzer raises, a warning is
        logged and an array with zero columns is returned.

        Returns:
            Array with one row per document.
        """
        feature_arrays = []

        try:
            for label, analyzer in self._active_analyzers():
                logger.info(f"Extracting {label} features...")
                extract = analyzer.fit_transform if fit else analyzer.transform
                # assumes each analyzer returns a 2-D array with len(X) rows,
                # as np.hstack below requires - TODO confirm against analyzers
                feature_arrays.append(extract(X))

            # Combine all additional features
            if feature_arrays:
                additional_features = np.hstack(feature_arrays)
                logger.info(f"Extracted {additional_features.shape[1]} additional features")
            else:
                additional_features = np.empty((len(X), 0))

        except Exception as e:
            # Deliberate best-effort boundary: fall back to TF-IDF only.
            logger.warning(f"Error extracting additional features: {e}")
            additional_features = np.empty((len(X), 0))

        return additional_features

    def _generate_feature_names(self):
        """Generate comprehensive feature names aligned with the output columns."""
        names = []

        # TF-IDF feature names
        if self.tfidf_vectorizer is not None:
            names.extend(
                f"tfidf_{term}" for term in self.tfidf_vectorizer.get_feature_names_out()
            )

        # Analyzer names are only added when analyzer columns are actually
        # present. If extraction failed during fit there are none, and adding
        # names anyway would misalign names with columns.
        if self.n_additional_features_ > 0:
            for _, analyzer in self._active_analyzers():
                names.extend(analyzer.get_feature_names())

        # Apply feature selection to names if applicable
        if self.feature_selector is not None:
            support = self.feature_selector.get_support()
            if len(support) != len(names):
                logger.warning(f"Mismatch: {len(support)} selected_indices vs {len(names)} feature_names")
            # zip stops at the shorter sequence, so a mismatch cannot raise.
            names = [name for name, keep in zip(names, support) if keep]

        self.feature_names_ = names

    def _calculate_feature_importance(self):
        """Calculate feature importance scores from the fitted selector.

        Stores ``{feature name: chi2 score}`` in ``feature_importance_``,
        ordered from highest to lowest score.
        """
        if self.feature_selector is None:
            return

        support = self.feature_selector.get_support()
        selected_scores = self.feature_selector.scores_[support]

        importance = {
            name: float(score) for name, score in zip(self.feature_names_, selected_scores)
        }

        def sort_key(item):
            # NaN compares false with everything, which makes the sort order
            # ill-defined. Rank NaN scores last.
            score = item[1]
            return float('-inf') if np.isnan(score) else score

        # Sort by importance
        self.feature_importance_ = dict(sorted(importance.items(), key=sort_key, reverse=True))

    def get_feature_names(self):
        """Get names of output features.

        Raises:
            ValueError: If the pipeline has not been fitted.
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted first")
        return self.feature_names_

    def get_feature_importance(self, top_k=None):
        """Get feature importance scores.

        Args:
            top_k: If given, return only the first ``top_k`` entries of the
                importance table (which is ordered best-first).

        Returns:
            Dict of feature name to score; empty when no importance was computed.
        """
        if not self.feature_importance_:
            return {}

        if top_k is not None:
            return dict(list(self.feature_importance_.items())[:top_k])

        return self.feature_importance_

    def get_feature_metadata(self):
        """Get comprehensive feature metadata.

        Returns:
            Dict with the total feature count, per-family counts, the
            constructor configuration, whether importances are available, and
            a timestamp.

        Raises:
            ValueError: If the pipeline has not been fitted.
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted first")

        # Families are recognized purely by name prefix. Presumably each
        # analyzer prefixes its feature names this way - verify against the
        # analyzers' get_feature_names().
        families = ('tfidf', 'sentiment', 'readability', 'entity', 'linguistic')
        feature_types = {
            f"{family}_features": sum(
                1 for name in self.feature_names_ if name.startswith(f"{family}_")
            )
            for family in families
        }

        metadata = {
            'total_features': len(self.feature_names_),
            'feature_types': feature_types,
            'configuration': {
                'enable_sentiment': self.enable_sentiment,
                'enable_readability': self.enable_readability,
                'enable_entities': self.enable_entities,
                'enable_linguistic': self.enable_linguistic,
                'feature_selection_k': self.feature_selection_k,
                'tfidf_max_features': self.tfidf_max_features,
                'ngram_range': self.ngram_range,
                'min_df': self.min_df,
                'max_df': self.max_df
            },
            'feature_importance_available': bool(self.feature_importance_),
            'timestamp': datetime.now().isoformat()
        }

        return metadata

    def save_pipeline(self, filepath):
        """Save the fitted pipeline with joblib.

        The saved object is a dict holding this instance plus its metadata,
        feature names and feature importance.

        Raises:
            ValueError: If the pipeline has not been fitted.
        """
        if not self.is_fitted_:
            raise ValueError("Pipeline must be fitted before saving")

        save_data = {
            'feature_engineer': self,
            'metadata': self.get_feature_metadata(),
            'feature_names': self.feature_names_,
            'feature_importance': self.feature_importance_
        }

        joblib.dump(save_data, filepath)
        logger.info(f"Feature engineering pipeline saved to {filepath}")

    @classmethod
    def load_pipeline(cls, filepath):
        """Load a fitted pipeline previously written by ``save_pipeline``.

        Returns:
            The object stored under the ``'feature_engineer'`` key.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code. Only load pipeline files from trusted sources.
        save_data = joblib.load(filepath)
        feature_engineer = save_data['feature_engineer']

        logger.info(f"Feature engineering pipeline loaded from {filepath}")
        return feature_engineer


def create_enhanced_pipeline(X_train, y_train,
                             enable_sentiment=True,
                             enable_readability=True,
                             enable_entities=True,
                             enable_linguistic=True,
                             feature_selection_k=5000):
    """
    Build an AdvancedFeatureEngineer with the given options and fit it.

    All other constructor options (TF-IDF size, n-gram range, document
    frequency bounds) are left at the class defaults.

    Args:
        X_train: Training text data
        y_train: Training labels
        enable_sentiment: Enable sentiment analysis features
        enable_readability: Enable readability features
        enable_entities: Enable entity features
        enable_linguistic: Enable linguistic features
        feature_selection_k: Number of features to select

    Returns:
        Fitted AdvancedFeatureEngineer instance
    """
    logger.info("Creating enhanced feature engineering pipeline...")

    # Gather the constructor options in one place, then build and fit.
    engineer_options = dict(
        enable_sentiment=enable_sentiment,
        enable_readability=enable_readability,
        enable_entities=enable_entities,
        enable_linguistic=enable_linguistic,
        feature_selection_k=feature_selection_k,
    )
    engineer = AdvancedFeatureEngineer(**engineer_options)
    engineer.fit(X_train, y_train)

    # Report what the fitted pipeline produced.
    summary = engineer.get_feature_metadata()
    logger.info(f"Enhanced pipeline created with {summary['total_features']} features")
    logger.info(f"Feature breakdown: {summary['feature_types']}")

    return engineer


def analyze_feature_importance(feature_engineer, top_k=20):
    """
    Summarize the most important features of a fitted feature engineer.

    Each of the top features is assigned to a family by its name prefix
    (``tfidf_``, ``sentiment_``, ``readability_``, ``entity_``,
    ``linguistic_``); names matching none of these are counted as ``other``.

    Args:
        feature_engineer: Fitted AdvancedFeatureEngineer instance
        top_k: Number of top features to analyze

    Returns:
        Dictionary with the top features, their family distribution, the
        total feature count, the per-family breakdown and a timestamp.

    Raises:
        ValueError: If the feature engineer has not been fitted.
    """
    if not feature_engineer.is_fitted_:
        raise ValueError("Feature engineer must be fitted first")

    ranked = feature_engineer.get_feature_importance(top_k=top_k)
    meta = feature_engineer.get_feature_metadata()

    known_families = ('tfidf', 'sentiment', 'readability', 'entity', 'linguistic')

    def family_of(feature_name):
        # First family whose "<family>_" prefix matches, else 'other'.
        return next(
            (family for family in known_families
             if feature_name.startswith(f"{family}_")),
            'other',
        )

    # Tally families in order of first appearance among the top features.
    distribution = {}
    for feature_name in ranked:
        family = family_of(feature_name)
        distribution[family] = distribution.get(family, 0) + 1

    return {
        'top_features': ranked,
        'feature_type_distribution': distribution,
        'total_features': meta['total_features'],
        'feature_breakdown': meta['feature_types'],
        'analysis_timestamp': datetime.now().isoformat(),
    }