# Hugging Face page header captured with this file: Spaces: vishwak1 / disease_prediction · Configuration error · File size: 6,067 Bytes · fb61aba

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import gc
import psutil
from typing import Dict, List, Tuple

# Apply seaborn's "whitegrid" style once at import time; this is a
# module-level side effect that runs before any figure below is created.
sns.set(style="whitegrid")

# Set up memory monitoring
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in megabytes."""
    bytes_per_mb = 1024 * 1024
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_mb = rss_bytes / bytes_per_mb
    print(f"Current memory usage: {rss_mb:.2f} MB")

def _model_slug(name):
    """Return the lowercase, underscore-separated file stem used for *name*."""
    return name.replace(" ", "_").lower()


def _plot_actual_vs_predicted(name, y_test, y_pred):
    """Save a scatter plot of predicted vs. actual targets for one model."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, y_pred, alpha=0.5)
    # Diagonal reference line: points on it are perfect predictions.
    lo, hi = np.min(y_test), np.max(y_test)
    ax.plot([lo, hi], [lo, hi], 'r--')
    ax.set_title(f'{name} - Actual vs Predicted')
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    fig.savefig(f'{_model_slug(name)}_predictions.png')
    # Close explicitly; pyplot keeps every figure alive until it is closed.
    plt.close(fig)


def _plot_feature_importance(name, importances, feature_names, max_features=15):
    """Save a horizontal bar chart of the most important features of one model."""
    top_k = min(max_features, len(importances))
    # argsort is ascending, so the last top_k indices are the largest values.
    top_idx = np.argsort(importances)[-top_k:]
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(range(top_k), importances[top_idx])
    ax.set_yticks(list(range(top_k)))
    ax.set_yticklabels([feature_names[i] for i in top_idx])
    ax.set_title(f'{name} - Top {top_k} Feature Importance')
    fig.tight_layout()
    fig.savefig(f'{_model_slug(name)}_feature_importance.png')
    plt.close(fig)


def _plot_model_comparison(metrics_df):
    """Save the cross-model comparison charts (RMSE/MAE and R²)."""
    # Grouped bars: RMSE and MAE side by side for each model.
    bar_width = 0.35
    positions = np.arange(len(metrics_df['Model']))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(positions, metrics_df['RMSE'], bar_width, label='RMSE')
    ax.bar(positions + bar_width, metrics_df['MAE'], bar_width, label='MAE')
    ax.set_xlabel('Model')
    ax.set_ylabel('Error')
    ax.set_title('Model Comparison - RMSE and MAE')
    # Centre each tick label under its pair of bars.
    ax.set_xticks(positions + bar_width / 2)
    ax.set_xticklabels(metrics_df['Model'], rotation=45)
    ax.legend()
    fig.tight_layout()
    fig.savefig('model_comparison_error.png')
    plt.close(fig)

    # Single-series bar chart for R².
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(metrics_df['Model'], metrics_df['R²'], color='skyblue')
    ax.set_xlabel('Model')
    ax.set_ylabel('R²')
    ax.set_title('Model Comparison - R²')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig('model_comparison_r2.png')
    plt.close(fig)


def train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names=None) -> Tuple[Dict[str, object], pd.DataFrame]:
    """
    Train and evaluate multiple regression models for COVID-19 prediction.

    Four regressors (linear regression, RBF support vector regression,
    gradient boosting and random forest) are fitted on the training split
    and scored on the test split. As side effects, written to the current
    working directory:

    - ``<model>_model.pkl``: each fitted model, serialized with joblib
    - ``<model>_predictions.png``: actual vs. predicted scatter plot
    - ``<model>_feature_importance.png``: for models exposing
      ``feature_importances_``, when usable feature names are given
    - ``model_comparison_error.png`` and ``model_comparison_r2.png``

    Parameters:
    - X_train, X_test, y_train, y_test: Training and testing data
    - feature_names: Sequence of feature names used to label the feature
      importance plots. ``None`` or an empty sequence skips those plots;
      so does a sequence shorter than the number of features.

    Returns:
    - models: Dictionary mapping model name to the fitted estimator
    - metrics_df: DataFrame with columns ``Model``, ``RMSE``, ``MAE`` and
      ``R²``, one row per model
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Support Vector Regression': SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
        'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    }

    # An empty list is treated like None: there is nothing to label bars with.
    has_feature_names = feature_names is not None and len(feature_names) > 0

    rows = []
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rows.append({'Model': name, 'RMSE': rmse, 'MAE': mae, 'R²': r2})

        print(f"{name} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

        # Persist the fitted model next to the plots.
        joblib.dump(model, f'{_model_slug(name)}_model.pkl')

        _plot_actual_vs_predicted(name, y_test, y_pred)

        # Only the tree ensembles (Random Forest, Gradient Boosting) expose
        # feature_importances_; the other two models are skipped here.
        importances = getattr(model, 'feature_importances_', None)
        if importances is not None and has_feature_names:
            if len(feature_names) < len(importances):
                # Too few names would raise IndexError while labelling bars.
                print(
                    f"Skipping feature importance plot for {name}: "
                    f"{len(feature_names)} feature names given for "
                    f"{len(importances)} features."
                )
            else:
                _plot_feature_importance(name, importances, feature_names)

    metrics_df = pd.DataFrame(rows, columns=['Model', 'RMSE', 'MAE', 'R²'])
    _plot_model_comparison(metrics_df)

    print("\nModel training and evaluation complete!")
    # Build the list outside the f-string: reusing the same quote character
    # inside a nested f-string is a SyntaxError before Python 3.12.
    saved_files = ', '.join(f'{_model_slug(name)}_model.pkl' for name in models)
    print(f"Models saved as: {saved_files}")

    return models, metrics_df

def main():
    """
    Main function to train and evaluate models.

    Loads the preprocessed splits (``X_train.npy``, ``X_test.npy``,
    ``y_train.npy``, ``y_test.npy``) from the current working directory,
    optionally reads one feature name per line from ``features.txt``,
    trains and evaluates the models, then prints the comparison table and
    writes it to ``model_comparison.csv``.

    Returns None. If any of the four data files is missing, a hint is
    printed and the function returns without training.
    """
    data_files = ['X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy']

    # Check if preprocessed data exists
    if not all(os.path.exists(path) for path in data_files):
        print("Preprocessed data not found. Please run preprocess_data.py first.")
        return

    # Load preprocessed data
    X_train, X_test, y_train, y_test = (np.load(path) for path in data_files)

    # Load feature names. None (not an empty list) signals "no names
    # available", so the feature-importance plots are skipped instead of
    # failing with IndexError while labelling bars.
    feature_names = None
    if os.path.exists('features.txt'):
        with open('features.txt', 'r') as f:
            names = [line.strip() for line in f]
        # Trailing blank lines are not feature names; drop them.
        while names and not names[-1]:
            names.pop()
        feature_names = names or None

    print("Data loaded successfully!")
    print(f"Training data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")

    # Train and evaluate models
    models, metrics = train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names)

    # Display and save comparison table
    print("\nModel Comparison:")
    print(metrics)
    metrics.to_csv('model_comparison.csv', index=False)

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()