import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import gc
import psutil
from typing import Dict, List, Tuple
# Set the style for plots
sns.set(style="whitegrid")
# Set up memory monitoring
def print_memory_usage():
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / (1024 * 1024)  # Convert bytes to MB
    print(f"Current memory usage: {memory_usage:.2f} MB")
def train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names=None):
"""
Train and evaluate multiple regression models for COVID-19 prediction
Parameters:
- X_train, X_test, y_train, y_test: Training and testing data
- feature_names: List of feature names (for feature importance)
Returns:
- models: Dictionary of trained models
- metrics: Dictionary of evaluation metrics for each model
"""
models = {
'Linear Regression': LinearRegression(),
'Support Vector Regression': SVR(kernel='rbf', gamma='scale', C=1.0, epsilon=0.1),
'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
}
metrics = {
'Model': [],
'RMSE': [],
'MAE': [],
'R²': []
}
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        # Predict on the held-out test set
        y_pred = model.predict(X_test)
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        # Store metrics
        metrics['Model'].append(name)
        metrics['RMSE'].append(rmse)
        metrics['MAE'].append(mae)
        metrics['R²'].append(r2)
        print(f"{name} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")
        # Save the trained model to disk
        joblib.dump(model, f'{name.replace(" ", "_").lower()}_model.pkl')
        # Plot actual vs predicted
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test, y_pred, alpha=0.5)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
        plt.title(f'{name} - Actual vs Predicted')
        plt.xlabel('Actual')
        plt.ylabel('Predicted')
        plt.savefig(f'{name.replace(" ", "_").lower()}_predictions.png')
        plt.close()  # Release the figure to keep memory usage low
        # For the tree-based models, plot feature importance (requires non-empty feature_names)
        if name in ['Random Forest', 'Gradient Boosting'] and feature_names:
            plt.figure(figsize=(12, 8))
            feature_importance = model.feature_importances_
            sorted_idx = np.argsort(feature_importance)
            # Select the top 15 features for better visualization
            top_k = min(15, len(feature_importance))
            plt.barh(range(top_k), feature_importance[sorted_idx][-top_k:])
            plt.yticks(range(top_k), [feature_names[i] for i in sorted_idx[-top_k:]])
            plt.title(f'{name} - Top {top_k} Feature Importance')
            plt.tight_layout()
            plt.savefig(f'{name.replace(" ", "_").lower()}_feature_importance.png')
            plt.close()
    # Plot comparison of models
    metrics_df = pd.DataFrame(metrics)
    # Create grouped bar plot for RMSE and MAE
    plt.figure(figsize=(12, 6))
    bar_width = 0.35
    index = np.arange(len(metrics_df['Model']))
    plt.bar(index, metrics_df['RMSE'], bar_width, label='RMSE')
    plt.bar(index + bar_width, metrics_df['MAE'], bar_width, label='MAE')
    plt.xlabel('Model')
    plt.ylabel('Error')
    plt.title('Model Comparison - RMSE and MAE')
    plt.xticks(index + bar_width / 2, metrics_df['Model'], rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.savefig('model_comparison_error.png')
    plt.close()
    # Create bar plot for R²
    plt.figure(figsize=(12, 6))
    plt.bar(metrics_df['Model'], metrics_df['R²'], color='skyblue')
    plt.xlabel('Model')
    plt.ylabel('R²')
    plt.title('Model Comparison - R²')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('model_comparison_r2.png')
    plt.close()
    print("\nModel training and evaluation complete!")
    # Avoid nesting quotes of the same kind inside an f-string (a SyntaxError before Python 3.12)
    model_files = [f"{name.replace(' ', '_').lower()}_model.pkl" for name in models]
    print(f"Models saved as: {', '.join(model_files)}")
    return models, metrics_df
def main():
    """
    Main function to train and evaluate models.
    """
    # Check if preprocessed data exists
    if not all(os.path.exists(f) for f in ['X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy']):
        print("Preprocessed data not found. Please run preprocess_data.py first.")
        return
    # Load preprocessed data
    X_train = np.load('X_train.npy')
    X_test = np.load('X_test.npy')
    y_train = np.load('y_train.npy')
    y_test = np.load('y_test.npy')
    # Load feature names
    feature_names = []
    if os.path.exists('features.txt'):
        with open('features.txt', 'r') as f:
            feature_names = [line.strip() for line in f.readlines()]
    print("Data loaded successfully!")
    print(f"Training data shape: {X_train.shape}")
    print(f"Testing data shape: {X_test.shape}")
    # Train and evaluate models
    models, metrics = train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names)
    # Display and save the comparison table
    print("\nModel Comparison:")
    print(metrics)
    metrics.to_csv('model_comparison.csv', index=False)

if __name__ == "__main__":
    main()
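
# A minimal sketch (not part of the original script) of how the saved artifacts could be
# reused later for prediction; the file names follow the pattern written by
# train_and_evaluate_models above, and 'random_forest_model.pkl' is just one of them.
#
#   import joblib
#   import numpy as np
#   model = joblib.load('random_forest_model.pkl')
#   X_new = np.load('X_test.npy')  # or any array with the same feature layout as training
#   predictions = model.predict(X_new)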