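"""Initialization script for the fake-news detection service.

On startup this script creates the /tmp working directories, copies the
Kaggle and LIAR datasets from /app when they are available (falling back to a
small built-in dataset otherwise), trains and saves a TF-IDF +
LogisticRegression pipeline if no model artifacts exist yet, writes the
initial log files, and validates the resulting installation.
"""
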
import re
import shutil
import pandas as pd
import json
from pathlib import Path
from datetime import datetime


def log_step(message):
    """Log initialization steps"""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {message}")


def check_model_exists():
    """Check if trained model already exists"""
    model_files = [
        Path("/tmp/pipeline.pkl"),
        Path("/tmp/model.pkl"),
        Path("/tmp/vectorizer.pkl"),
        Path("/tmp/metadata.json")
    ]
    
    existing_files = [f for f in model_files if f.exists()]
    
    if len(existing_files) >= 2:  # Heuristic: any two artifacts (e.g. pipeline + metadata, or model + vectorizer) imply a prior install
        log_step(f"βœ… Found {len(existing_files)} existing model files")
        return True, existing_files
    else:
        log_step(f"❌ Missing model files - only found {len(existing_files)}")
        return False, existing_files


def check_training_data_exists():
    """Check if training data is available"""
    data_files = [
        Path("/tmp/data/combined_dataset.csv"),
        Path("/app/data/combined_dataset.csv"),
        Path("/tmp/data/kaggle/Fake.csv"),
        Path("/tmp/data/kaggle/True.csv")
    ]
    
    existing_data = [f for f in data_files if f.exists()]
    
    if existing_data:
        log_step(f"βœ… Found training data: {[str(f) for f in existing_data]}")
        return True, existing_data
    else:
        log_step("❌ No training data found")
        return False, []


def create_directories():
    """Create necessary directories"""
    log_step("Creating directory structure...")

    directories = [
        "/tmp/data",
        "/tmp/data/kaggle",
        "/tmp/model",
        "/tmp/logs",
        "/tmp/results",
        "/tmp/backups"
    ]

    for dir_path in directories:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
        log_step(f"βœ… Created {dir_path}")


def copy_original_datasets():
    """Copy original datasets from /app to /tmp"""
    log_step("Copying original datasets...")

    source_files = [
        ("/app/data/kaggle/Fake.csv", "/tmp/data/kaggle/Fake.csv"),
        ("/app/data/kaggle/True.csv", "/tmp/data/kaggle/True.csv"),
        ("/app/data/combined_dataset.csv", "/tmp/data/combined_dataset.csv"),
        ("/app/data/liar/train.tsv", "/tmp/data/liar/train.tsv"),
        ("/app/data/liar/test.tsv", "/tmp/data/liar/test.tsv"),
        ("/app/data/liar/valid.tsv", "/tmp/data/liar/valid.tsv")
    ]

    copied_count = 0
    for source, dest in source_files:
        if Path(source).exists():
            Path(dest).parent.mkdir(parents=True, exist_ok=True)
            shutil.copy(source, dest)
            log_step(f"βœ… Copied {source} to {dest}")
            copied_count += 1
        else:
            log_step(f"⚠️ Source file not found: {source}")

    return copied_count > 0


def create_minimal_dataset():
    """Create a minimal dataset if original doesn't exist"""
    log_step("Creating minimal dataset...")

    combined_path = Path("/tmp/data/combined_dataset.csv")

    if combined_path.exists():
        log_step("βœ… Combined dataset already exists")
        return True

    # Create minimal training data with more samples for better training
    minimal_data = pd.DataFrame({
        'text': [
            # Real news samples
            'Scientists at Stanford University have developed a new method for detecting cancer cells using artificial intelligence',
            'The Federal Reserve announced today a decision to maintain current interest rates amid economic uncertainty',
            'Local authorities report significant improvements in air quality following new environmental regulations',
            'Research published in Nature journal shows promising results for renewable energy storage technology',
            'The United Nations climate summit concluded with new commitments from world leaders on carbon reduction',
            'Economic indicators suggest steady growth in the manufacturing sector according to latest government data',
            'Healthcare workers receive additional training on new medical procedures approved by regulatory agencies',
            'Transportation department announces infrastructure improvements to major highways across the region',
            'Educational institutions implement new digital learning platforms to enhance student engagement',
            'Agricultural studies reveal improved crop yields through sustainable farming practices',
            'Technology companies invest heavily in cybersecurity measures to protect user data and privacy',
            'Municipal government approves budget for public transportation expansion project in urban areas',
            'Medical researchers make breakthrough in understanding genetic factors contributing to heart disease',
            'International trade agreements show positive impact on local businesses and job creation',
            'Environmental protection agency releases report on water quality improvements in major rivers',
            
            # Fake news samples
            'SHOCKING: Government secretly controls weather using hidden technology, whistleblower reveals truth',
            'EXPOSED: Celebrities caught in massive conspiracy to manipulate public opinion through social media',
            'URGENT: New study proves that drinking water causes immediate memory loss in 99% of population',
            'BREAKING: Scientists discover that smartphones are actually mind control devices from aliens',
            'EXCLUSIVE: Secret documents reveal that all elections have been predetermined by shadow organization',
            'ALERT: Doctors confirm that eating vegetables makes people 500% more likely to develop rare diseases',
            'LEAKED: Underground network of billionaires planning to replace all humans with artificial intelligence',
            'CONSPIRACY: Major corporations hiding cure for aging to maintain population control and profits',
            'REVEALED: Government admits that gravity is fake and Earth is actually moving upward constantly',
            'WARNING: New technology allows complete thought reading through WiFi signals in your home',
            'BOMBSHELL: Ancient aliens return to Earth disguised as tech executives to harvest human energy',
            'UNCOVERED: All news media controlled by single person living in secret underground bunker',
            'PROOF: Time travel already exists but only available to wealthy elite who control world events',
            'SCANDAL: Pharmaceutical companies intentionally create diseases to sell more expensive treatments',
            'EXPOSED: Education system designed to suppress human creativity and independent thinking abilities'
        ],
        'label': [
            # Real news labels (0)
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            # Fake news labels (1)
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
    })

    minimal_data.to_csv(combined_path, index=False)
    log_step(f"βœ… Created enhanced minimal dataset with {len(minimal_data)} samples")
    log_step(f"   - Real news samples: {sum(minimal_data['label'] == 0)}")
    log_step(f"   - Fake news samples: {sum(minimal_data['label'] == 1)}")
    return True


# Text preprocessing function (same as in train.py). Defined at module level so
# the fitted pipeline can be pickled with joblib; a function nested inside
# run_initial_training() cannot be serialized by pickle.
def preprocess_text_function(texts):
    """Clean and normalize a list of raw texts for the TF-IDF pipeline."""
    def clean_single_text(text):
        text = str(text)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # strip URLs
        text = re.sub(r'\S+@\S+', '', text)                   # strip e-mail addresses
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        text = re.sub(r'[.]{3,}', '...', text)
        text = re.sub(r'[^a-zA-Z\s.!?]', '', text)            # keep letters and basic punctuation
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()

    return [clean_single_text(text) for text in texts]


def run_initial_training():
    """Run comprehensive model training for first-time setup"""
    log_step("πŸš€ Starting comprehensive model training for first-time setup...")

    try:
        # Import training modules
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
        from sklearn.pipeline import Pipeline
        from sklearn.feature_selection import SelectKBest, chi2
        from sklearn.preprocessing import FunctionTransformer
        from sklearn.metrics import accuracy_score, f1_score
        import joblib

        # Load dataset
        dataset_path = Path("/tmp/data/combined_dataset.csv")
        if not dataset_path.exists():
            log_step("❌ No dataset available for training")
            return False

        df = pd.read_csv(dataset_path)
        log_step(f"πŸ“Š Loaded dataset with {len(df)} samples")

        # Data validation and cleaning
        df = df.dropna(subset=['text', 'label'])
        df = df[df['text'].astype(str).str.len() > 10]
        
        log_step(f"πŸ“Š After cleaning: {len(df)} samples")
        log_step(f"πŸ“Š Class distribution: {df['label'].value_counts().to_dict()}")

        # Prepare data
        X = df['text'].values
        y = df['label'].values

        # Train-test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        log_step(f"πŸ“Š Data split: {len(X_train)} train, {len(X_test)} test")

        # Create comprehensive pipeline
        text_preprocessor = FunctionTransformer(
            func=preprocess_text_function,
            validate=False
        )

        vectorizer = TfidfVectorizer(
            max_features=5000,
            min_df=1,
            max_df=0.95,
            ngram_range=(1, 2),
            stop_words='english',
            sublinear_tf=True,
            norm='l2'
        )

        # Cap feature selection so k never exceeds the vocabulary size; on the
        # small fallback dataset the TF-IDF vocabulary is well under 2000 terms
        # and SelectKBest raises if k > n_features.
        feature_selector = SelectKBest(
            score_func=chi2,
            k=2000 if len(X_train) > 100 else 'all'
        )

        # Create pipeline with Logistic Regression
        pipeline = Pipeline([
            ('preprocess', text_preprocessor),
            ('vectorize', vectorizer),
            ('feature_select', feature_selector),
            ('model', LogisticRegression(max_iter=500, class_weight='balanced', random_state=42))
        ])

        log_step("πŸ”§ Training model with optimized pipeline...")

        # Hyperparameter tuning for datasets with sufficient samples
        if len(X_train) >= 20:
            log_step("βš™οΈ Performing hyperparameter tuning...")
            param_grid = {
                'model__C': [0.1, 1, 10],
                'model__penalty': ['l2']
            }

            cv_folds = max(2, min(3, len(X_train) // 10))
            grid_search = GridSearchCV(
                pipeline,
                param_grid,
                cv=StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42),
                scoring='f1_weighted',
                n_jobs=1
            )

            grid_search.fit(X_train, y_train)
            best_pipeline = grid_search.best_estimator_
            
            log_step(f"βœ… Best parameters: {grid_search.best_params_}")
            log_step(f"βœ… Best CV score: {grid_search.best_score_:.4f}")
        else:
            log_step("βš™οΈ Using simple training for small dataset...")
            pipeline.fit(X_train, y_train)
            best_pipeline = pipeline

        # Evaluate model
        y_pred = best_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        log_step(f"πŸ“ˆ Model Performance:")
        log_step(f"   - Accuracy: {accuracy:.4f}")
        log_step(f"   - F1 Score: {f1:.4f}")

        # Save model artifacts
        log_step("πŸ’Ύ Saving model artifacts...")

        # Save the complete pipeline
        joblib.dump(best_pipeline, "/tmp/pipeline.pkl")
        log_step("βœ… Saved complete pipeline")

        # Save individual components for compatibility
        joblib.dump(best_pipeline.named_steps['model'], "/tmp/model.pkl")
        joblib.dump(best_pipeline.named_steps['vectorize'], "/tmp/vectorizer.pkl")
        log_step("βœ… Saved individual model components")

        # Generate comprehensive metadata
        metadata = {
            "model_version": f"v1.0_init_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
            "model_type": "logistic_regression",
            "training_method": "initial_setup",
            "dataset_size": len(df),
            "train_size": len(X_train),
            "test_size": len(X_test),
            "test_accuracy": float(accuracy),
            "test_f1": float(f1),
            "hyperparameter_tuning": len(X_train) >= 20,
            "cv_folds": cv_folds if len(X_train) >= 20 else "not_used",
            # cast numpy ints to plain ints so json.dump can serialize them
            "class_distribution": {int(k): int(v) for k, v in df['label'].value_counts().items()},
            "training_config": {
                "max_features": 5000,
                "ngram_range": [1, 2],
                "feature_selection_k": 2000,
                "test_size": 0.2
            },
            "timestamp": datetime.now().isoformat(),
            "initialization_notes": "Model trained during system initialization",
            "ready_for_production": True
        }

        # Save metadata
        with open("/tmp/metadata.json", 'w') as f:
            json.dump(metadata, f, indent=2)

        log_step("βœ… Saved comprehensive metadata")
        log_step(f"πŸŽ‰ Initial model training completed successfully!")
        log_step(f"πŸ“Š Final Performance - Accuracy: {accuracy:.4f}, F1: {f1:.4f}")

        return True

    except Exception as e:
        log_step(f"❌ Training failed: {str(e)}")
        import traceback
        log_step(f"πŸ” Error details: {traceback.format_exc()}")
        return False


def create_initial_logs():
    """Create initial log files"""
    log_step("Creating initial log files...")

    try:
        # Activity log
        activity_log = [{
            "timestamp": datetime.now().strftime("%Y-%m-%d %I:%M %p"),
            "event": "System initialized successfully with trained model",
            "level": "INFO"
        }]

        with open("/tmp/activity_log.json", 'w') as f:
            json.dump(activity_log, f, indent=2)

        # Create empty monitoring logs
        log_dirs = ["/tmp/logs"]
        for log_dir in log_dirs:
            Path(log_dir).mkdir(parents=True, exist_ok=True)

        with open("/tmp/logs/monitoring_log.json", 'w') as f:
            json.dump([], f)

        with open("/tmp/logs/scheduler_execution.json", 'w') as f:
            json.dump([], f)

        log_step("βœ… Initial log files created")
        return True

    except Exception as e:
        log_step(f"❌ Log creation failed: {str(e)}")
        return False


def validate_installation():
    """Validate that the system is properly set up"""
    log_step("πŸ” Validating system installation...")

    validation_checks = []

    # Check model files
    model_exists, model_files = check_model_exists()
    validation_checks.append(("Model Files", model_exists, f"Found: {[str(f.name) for f in model_files]}"))

    # Check data files
    data_exists, data_files = check_training_data_exists()
    validation_checks.append(("Training Data", data_exists, f"Found: {len(data_files)} files"))

    # Check directories
    required_dirs = ["/tmp/data", "/tmp/model", "/tmp/logs"]
    dirs_exist = all(Path(d).exists() for d in required_dirs)
    validation_checks.append(("Directories", dirs_exist, f"Required dirs: {required_dirs}"))

    # Check logs
    log_exists = Path("/tmp/activity_log.json").exists()
    validation_checks.append(("Log Files", log_exists, "Activity log created"))

    # Test model loading
    try:
        import joblib
        pipeline = joblib.load("/tmp/pipeline.pkl")
        test_prediction = pipeline.predict(["This is a test news article"])
        validation_checks.append(("Model Loading", True, f"Test prediction: {test_prediction[0]}"))
    except Exception as e:
        validation_checks.append(("Model Loading", False, f"Error: {str(e)}"))

    # Print validation results
    log_step("πŸ“‹ Validation Results:")
    all_passed = True
    for check_name, passed, details in validation_checks:
        status = "βœ… PASS" if passed else "❌ FAIL"
        log_step(f"   {status} {check_name}: {details}")
        if not passed:
            all_passed = False

    return all_passed, validation_checks


def main():
    """Main initialization function with smart training logic"""
    log_step("πŸš€ Starting intelligent system initialization...")

    # Check if model already exists
    model_exists, existing_model_files = check_model_exists()
    
    if model_exists:
        log_step("🎯 EXISTING INSTALLATION DETECTED")
        log_step("πŸ“„ Found existing model files - skipping training")
        
        # Load existing metadata to show info
        try:
            with open("/tmp/metadata.json", 'r') as f:
                metadata = json.load(f)
            
            log_step(f"πŸ“Š Existing Model Info:")
            log_step(f"   - Version: {metadata.get('model_version', 'Unknown')}")
            log_step(f"   - Accuracy: {metadata.get('test_accuracy', 'Unknown')}")
            log_step(f"   - F1 Score: {metadata.get('test_f1', 'Unknown')}")
            log_step(f"   - Created: {metadata.get('timestamp', 'Unknown')}")
            
        except Exception as e:
            log_step(f"⚠️ Could not read existing metadata: {e}")
    
    else:
        log_step("πŸ†• FIRST-TIME INSTALLATION DETECTED")
        log_step("πŸ”§ No existing model found - will train new model")

    # Run initialization steps
    steps = [
        ("Directory Creation", create_directories),
        ("Dataset Copy", copy_original_datasets),
        ("Dataset Preparation", create_minimal_dataset),
        ("Log Creation", create_initial_logs)
    ]

    # Add training step only if model doesn't exist
    if not model_exists:
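        # insert(-1, ...) places training just before "Log Creation", so the
        # initial activity log is only written once a model has been produced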
        steps.insert(-1, ("πŸ€– Model Training", run_initial_training))

    failed_steps = []

    for step_name, step_function in steps:
        try:
            log_step(f"▢️ Starting: {step_name}")
            if step_function():
                log_step(f"βœ… {step_name} completed")
            else:
                log_step(f"❌ {step_name} failed")
                failed_steps.append(step_name)
        except Exception as e:
            log_step(f"❌ {step_name} failed: {str(e)}")
            failed_steps.append(step_name)

    # Final validation
    log_step("πŸ” Running final system validation...")
    validation_passed, validation_results = validate_installation()

    # Summary
    log_step("=" * 60)
    if failed_steps:
        log_step(f"⚠️ Initialization completed with {len(failed_steps)} issues")
        log_step(f"❌ Failed steps: {', '.join(failed_steps)}")
    else:
        log_step("πŸŽ‰ System initialization completed successfully!")

    if validation_passed:
        log_step("βœ… All validation checks passed!")
        log_step("πŸš€ System is ready for use!")
        
        if not model_exists:
            log_step("πŸ€– NEW MODEL TRAINED AND READY")
            log_step("πŸ“Š You can now start making predictions!")
        else:
            log_step("πŸ”„ EXISTING MODEL VALIDATED AND READY")
            log_step("πŸ“Š System restored from previous installation!")
            
    else:
        log_step("❌ Some validation checks failed")
        log_step("πŸ”§ Manual intervention may be required")

    log_step("=" * 60)


if __name__ == "__main__":
    main()
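

# A minimal usage sketch (comments only, not executed): once initialization has
# completed, other components can load the saved artifacts with joblib, e.g.
#
#   import joblib
#   pipeline = joblib.load("/tmp/pipeline.pkl")
#   label = pipeline.predict(["Some article text to classify"])[0]  # 0 = real, 1 = fake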