# File size: 4,628 Bytes
# 4ec7aed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Import necessary modules
import sys
import os
import numpy as np
from dataclasses import dataclass

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.kernel_ridge import KernelRidge

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models

# Define a configuration class for model trainer settings
@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model-training step."""

    # Default location of the trained-model file: the relative path
    # "artifacts/model.pkl", built with the platform's path separator.
    trained_model_file_path: str = os.path.join(
        "artifacts",
        "model.pkl",
    )

# Define the main class responsible for model training
class ModelTrainer:
    """Train the candidate regressors, save the best one, and report its test metrics."""

    def __init__(self):
        # Configuration object that carries the output path of the saved model.
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        """
        Train and evaluate the candidate models and keep the one with the highest test R2 score.

        Parameters:
        train_array: 2-D array supporting NumPy-style slicing; the last column is
            the target and all preceding columns are the features.
        test_array: 2-D array with the same column layout as ``train_array``.

        Returns:
        tuple: ``(r2, mae, mse, rmse, best_model_name)`` where the four metrics are
            computed from the best model's predictions on the test data.

        Raises:
        CustomException: Wraps any exception raised while splitting, evaluating,
            fitting, saving or scoring.
        """

        try:
            logging.info("Splitting Training and Testing Input Data.")

            # Split the train and test arrays into features (all but the last
            # column) and target (last column).
            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            # Candidate models. Uncomment further entries (together with the
            # matching grid below) to widen the search.
            models = {
                'Random Forest': RandomForestRegressor(random_state=42),
                #'Gradient Boosting': GradientBoostingRegressor(random_state=6112024),
                #'XGBoost': XGBRegressor(random_state=6112024),
                #'AdaBoost': AdaBoostRegressor(random_state=6112024),
                #'Bagging': BaggingRegressor(random_state=6112024),
                #'Kernel Ridge': KernelRidge(),
                #'Hist Gradient Boosting': HistGradientBoostingRegressor(random_state=6112024)
            }

            # Parameter grids for hyperparameter tuning, keyed by the same names
            # as ``models``. The trailing comma keeps the dict valid when more
            # entries are uncommented.
            param_grids = {
                'Random Forest': {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [None, 10, 20],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                },
                #'Gradient Boosting': {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5]},
                #'XGBoost': {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 4, 5]},
                #'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
                #'Bagging': {'n_estimators': [10, 50, 100], 'max_samples': [0.5, 0.75, 1.0], 'max_features': [0.5, 0.75, 1.0]},
                #'Kernel Ridge': {'alpha': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': [0.01, 0.1, 1.0, None]},
                #'Hist Gradient Boosting': {'learning_rate': [0.01, 0.1, 0.2], 'max_iter': [100, 200, 300], 'max_depth': [None, 10, 20], 'min_samples_leaf': [20, 50, 100]}
            }

            # Evaluate models and get a report of their performance.
            model_report: dict = evaluate_models(
                X_train=X_train,
                y_train=y_train,
                X_test=X_test,
                y_test=y_test,
                models=models,
                param=param_grids,
            )

            # f-string so the report contents are logged, not the literal
            # placeholder text.
            logging.info(f"Report on All Models: {model_report}")

            # Pick the model name whose report entry has the highest test R2 score.
            best_model_name = max(
                model_report,
                key=lambda k: model_report[k]['Test Metrics']['R2 Score'],
            )
            best_model = models[best_model_name]

            # Fit the best model on the training data before saving it.
            # NOTE(review): evaluate_models may already have tuned/fitted this
            # instance; confirm whether this refit is still required.
            best_model.fit(X_train, y_train)

            # Save the best model to a file.
            save_object(file_path=self.model_trainer_config.trained_model_file_path, obj=best_model)

            logging.info(f"Best Model found: {best_model_name}")

            # Score the fitted best model on the held-out test data.
            predicted = best_model.predict(X_test)
            r2 = r2_score(y_test, predicted)
            mae = mean_absolute_error(y_test, predicted)
            mse = mean_squared_error(y_test, predicted)
            rmse = np.sqrt(mse)

            return (r2, mae, mse, rmse, best_model_name)

        except Exception as e:
            # Re-raise as the project's custom exception, keeping the original
            # error attached as the explicit cause.
            raise CustomException(e, sys) from e