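"""Initialization script for the fake-news detection service.

On startup this script creates the /tmp working directories, copies the
Kaggle and LIAR datasets from /app when they are available (falling back to a
small built-in dataset otherwise), trains and saves a TF-IDF +
LogisticRegression pipeline if no model artifacts exist yet, writes the
initial log files, and validates the resulting installation.
"""
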
import re
import shutil
import pandas as pd
import json
from pathlib import Path
from datetime import datetime


def log_step(message):
    """Log initialization steps"""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


def check_model_exists():
    """Check if trained model already exists"""
    model_files = [
        Path("/tmp/pipeline.pkl"),
        Path("/tmp/model.pkl"),
        Path("/tmp/vectorizer.pkl"),
        Path("/tmp/metadata.json")
    ]
    
    existing_files = [f for f in model_files if f.exists()]
    
    if len(existing_files) >= 2:  # Heuristic: any two artifacts (e.g. pipeline + metadata, or model + vectorizer) imply a prior install
        log_step(f"βœ… Found {len(existing_files)} existing model files")
        return True, existing_files
    else:
        log_step(f"❌ Missing model files - only found {len(existing_files)}")
        return False, existing_files


def check_training_data_exists():
    """Check if training data is available"""
    data_files = [
        Path("/tmp/data/combined_dataset.csv"),
        Path("/app/data/combined_dataset.csv"),
        Path("/tmp/data/kaggle/Fake.csv"),
        Path("/tmp/data/kaggle/True.csv")
    ]
    
    existing_data = [f for f in data_files if f.exists()]
    
    if existing_data:
        log_step(f"βœ… Found training data: {[str(f) for f in existing_data]}")
        return True, existing_data
    else:
        log_step("❌ No training data found")
        return False, []


def create_directories():
    """Create necessary directories"""
    log_step("Creating directory structure...")

    directories = [
        "/tmp/data",
        "/tmp/data/kaggle",
        "/tmp/model",
        "/tmp/logs",
        "/tmp/results",
        "/tmp/backups"
    ]

    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"βœ… Created {dir_path}")


def copy_original_datasets():
    """Copy original datasets from /app to /tmp"""
    log_step("Copying original datasets...")

    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
    ]

    copied_count = 0
    for source, dest in source_files:
        if Path(source).exists():
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
            log_step(f"βœ… Copied {source} to {dest}")
            copied_count += 1
        else:
            log_step(f"⚠️ Source file not found: {source}")

    return copied_count > 0


def create_minimal_dataset():
    """Create a minimal dataset if original doesn't exist"""
    log_step("Creating minimal dataset...")

    combined_path = Path("/tmp/data/combined_dataset.csv")

    if combined_path.exists():
        log_step("βœ… Combined dataset already exists")
        return True

    # Create minimal training data with more samples for better training
    minimal_data = pd.DataFrame({
        'text': [
            # Real news samples
            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
            'Local authorities report significant improvements in air quality following new environmental regulations',
            'Research published in Nature journal shows promising results for renewable energy storage technology',
            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
            'Transportation department announces infrastructure improvements to major highways across the region',
            'Educational institutions implement new digital learning platforms to enhance student engagement',
            'Agricultural studies reveal improved crop yields through sustainable farming practices',
            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
            'Municipal government approves budget for public transportation expansion project in urban areas',
            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
            'International trade agreements show positive impact on local businesses and job creation',
            'Environmental protection agency releases report on water quality improvements in major rivers',
            
            # Fake news samples
            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
        ],
        'label': [
            # Real news labels (0)
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            # Fake news labels (1)
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    })

    minimal_data.to_csv(combined_path, index=False)
    log_step(f"βœ… Created enhanced minimal dataset with {len(minimal_data)} samples")
    log_step(f"   - Real news samples: {sum(minimal_data['label'] == 0)}")
    log_step(f"   - Fake news samples: {sum(minimal_data['label'] == 1)}")
    return True


# Text preprocessing function (same as in train.py). Defined at module level so
# the fitted pipeline can be pickled with joblib; a function nested inside
# run_initial_training() cannot be serialized by pickle.
def preprocess_text_function(texts):
    """Clean and normalize a list of raw texts for the TF-IDF pipeline."""
    def clean_single_text(text):
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # strip URLs
        text = re.sub(r'\S+@\S+', '', text)                   # strip e-mail addresses
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)            # keep letters and basic punctuation
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()

    return [clean_single_text(text) for text in texts]


def run_initial_training():
    """Run comprehensive model training for first-time setup"""
    log_step("πŸš€ Starting comprehensive model training for first-time setup...")

    try:
        # Import training modules
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest, chi2
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.metrics import accuracy_score, f1_score
        import joblib

        # Load dataset
        dataset_path = Path("/tmp/data/combined_dataset.csv")
        if not dataset_path.exists():
            log_step("❌ No dataset available for training")
            return False

        df = pd.read_csv(dataset_path)
        log_step(f"πŸ“Š Loaded dataset with {len(df)} samples")

        # Data validation and cleaning
        df = df.dropna(subset=['text', 'label'])
        df = df[df['text'].astype(str).str.len() > 10]
        
        log_step(f"πŸ“Š After cleaning: {len(df)} samples")
        log_step(f"πŸ“Š Class distribution: {df['label'].value_counts().to_dict()}")

        # Prepare data
        X = df['text'].values
        y = df['label'].values

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        log_step(f"πŸ“Š Data split: {len(X_train)} train, {len(X_test)} test")

        # Create comprehensive pipeline
        text_preprocessor = FunctionTransformer(
            func=preprocess_text_function,
            validate=False
        )

        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=1,
            max_df=0.95,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True,
            norm='l2'
        )

        # Cap feature selection so k never exceeds the vocabulary size; on the
        # small fallback dataset the TF-IDF vocabulary is well under 2000 terms
        # and SelectKBest raises if k > n_features.
        feature_selector = SelectKBest(
            score_func=chi2,
            k=2000 if len(X_train) > 100 else 'all'
        )

        # Create pipeline with Logistic Regression
        pipeline = Pipeline([
            ('preprocess', text_preprocessor),
            ('vectorize', vectorizer),
            ('feature_select', feature_selector),
            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
        ])

        log_step("πŸ”§ Training model with optimized pipeline...")

        # Hyperparameter tuning for datasets with sufficient samples
        if len(X_train) >= 20:
            log_step("βš™οΈ Performing hyperparameter tuning...")
            param_grid = {
                'model__C': [0.1, 1, 10],
                'model__penalty': ['l2']
            }

            cv_folds = max(2, min(3, len(X_train) // 10))
            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
                scoring='f1_weighted',
                n_jobs=1
            )

            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_
            
            log_step(f"βœ… Best parameters: {grid_search.best_params_}")
            log_step(f"βœ… Best CV score: {grid_search.best_score_:.4f}")
        else:
            log_step("βš™οΈ Using simple training for small dataset...")
            pipeline.fit(X_train, y_train)
            best_pipeline = pipeline

        # Evaluate model
        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        log_step(f"πŸ“ˆ Model Performance:")
        log_step(f"   - Accuracy: {accuracy:.4f}")
        log_step(f"   - F1 Score: {f1:.4f}")

        # Save model artifacts
        log_step("πŸ’Ύ Saving model artifacts...")

        # Save the complete pipeline
        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
        log_step("βœ… Saved complete pipeline")

        # Save individual components for compatibility
        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
        log_step("βœ… Saved individual model components")

        # Generate comprehensive metadata
        metadata = {
            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            "model_type": "logistic_regression",
            "training_method": "initial_setup",
            "dataset_size": len(df),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
            "hyperparameter_tuning": len(X_train) >= 20,
            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
            # cast numpy ints to plain ints so json.dump can serialize them
            "class_distribution": {int(k): int(v) for k, v in df['label'].value_counts().items()},
            "training_config": {
                "max_features": 5000,
                "ngram_range": [1, 2],
                "feature_selection_k": 2000,
                "test_size": 0.2
            },
            "timestamp": datetime.now().isoformat(),
            "initialization_notes": "Model trained during system initialization",
            "ready_for_production": True
        }

        # Save metadata
        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)

        log_step("βœ… Saved comprehensive metadata")
        log_step(f"πŸŽ‰ Initial model training completed successfully!")
        log_step(f"πŸ“Š Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

        return True

    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        import traceback
        log_step(f"πŸ” Error details: {traceback.format_exc()}")
        return False


def create_initial_logs():
    """Create initial log files"""
    log_step("Creating initial log files...")

    try:
        # Activity log
        activity_log = [{
            "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
            "event": "System initialized successfully with trained model",
            "level": "INFO"
        }]

        with open("/tmp/activity_log.json", 'w') as f:
            json.dump(activity_log, f, indent=2)

        # Create empty monitoring logs
        log_dirs = ["/tmp/logs"]
        for log_dir in log_dirs:
            Path(log_dir).mkdir(parents=True, exist_ok=True)

        with open("/tmp/logs/monitoring_log.json", 'w') as f:
            json.dump([], f)

        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
            json.dump([], f)

        log_step("βœ… Initial log files created")
        return True

    except Exception as e:
        log_step(f"❌ Log creation failed: {str(e)}")
        return False


def validate_installation():
    """Validate that the system is properly set up"""
    log_step("πŸ” Validating system installation...")

    validation_checks = []

    # Check model files
    model_exists, model_files = check_model_exists()
    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))

    # Check data files
    data_exists, data_files = check_training_data_exists()
    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))

    # Check directories
    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
    dirs_exist = all(Path(d).exists() for d in required_dirs)
    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))

    # Check logs
    log_exists = Path("/tmp/activity_log.json").exists()
    validation_checks.append(("Log Files", log_exists, "Activity log created"))

    # Test model loading
    try:
        import joblib
        pipeline = joblib.load("/tmp/pipeline.pkl")
        test_prediction = pipeline.predict(["This is a test news article"])
        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
    except Exception as e:
        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))

    # Print validation results
    log_step("πŸ“‹ Validation Results:")
    all_passed = True
    for check_name, passed, details in validation_checks:
        status = "βœ… PASS" if passed else "❌ FAIL"
        log_step(f"   {status} {check_name}: {details}")
        if not passed:
            all_passed = False

    return all_passed, validation_checks


def main():
    """Main initialization function with smart training logic"""
    log_step("πŸš€ Starting intelligent system initialization...")

    # Check if model already exists
    model_exists, existing_model_files = check_model_exists()
    
    if model_exists:
        log_step("🎯 EXISTING INSTALLATION DETECTED")
        log_step("πŸ“„ Found existing model files - skipping training")
        
        # Load existing metadata to show info
        try:
            with open("/tmp/metadata.json", 'r') as f:
                metadata = json.load(f)
            
            log_step(f"πŸ“Š Existing Model Info:")
            log_step(f"   - Version: {metadata.get('model_version', 'Unknown')}")
            log_step(f"   - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
            log_step(f"   - F1 Score: {metadata.get('test_f1', 'Unknown')}")
            log_step(f"   - Created: {metadata.get('timestamp', 'Unknown')}")
            
        except Exception as e:
            log_step(f"⚠️ Could not read existing metadata: {e}")
    
    else:
        log_step("πŸ†• FIRST-TIME INSTALLATION DETECTED")
        log_step("πŸ”§ No existing model found - will train new model")

    # Run initialization steps
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Dataset Preparation", create_minimal_dataset),
        ("Log Creation", create_initial_logs)
    ]

    # Add training step only if model doesn't exist
    if not model_exists:
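        # insert(-1, ...) places training just before "Log Creation", so the
        # initial activity log is only written once a model has been produced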
        steps.insert(-1, ("πŸ€– Model Training", run_initial_training))

    failed_steps = []

    for step_name, step_function in steps:
        try:
            log_step(f"▢️ Starting: {step_name}")
            if step_function():
                log_step(f"βœ… {step_name} completed")
            else:
                log_step(f"❌ {step_name} failed")
                failed_steps.append(step_name)
        except Exception as e:
            log_step(f"❌ {step_name} failed: {str(e)}")
            failed_steps.append(step_name)

    # Final validation
    log_step("πŸ” Running final system validation...")
    validation_passed, validation_results = validate_installation()

    # Summary
    log_step("=" * 60)
    if failed_steps:
        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
    else:
        log_step("πŸŽ‰ System initialization completed successfully!")

    if validation_passed:
        log_step("βœ… All validation checks passed!")
        log_step("πŸš€ System is ready for use!")
        
        if not model_exists:
            log_step("πŸ€– NEW MODEL TRAINED AND READY")
            log_step("πŸ“Š You can now start making predictions!")
        else:
            log_step("πŸ”„ EXISTING MODEL VALIDATED AND READY")
            log_step("πŸ“Š System restored from previous installation!")
            
    else:
        log_step("❌ Some validation checks failed")
        log_step("πŸ”§ Manual intervention may be required")

    log_step("=" * 60)


if __name__ == "__main__":
    main()
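

# A minimal usage sketch (comments only, not executed): once initialization has
# completed, other components can load the saved artifacts with joblib, e.g.
#
#   import joblib
#   pipeline = joblib.load("/tmp/pipeline.pkl")
#   label = pipeline.predict(["Some article text to classify"])[0]  # 0 = real, 1 = fake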