"""Train a random-forest regressor on ``insurance.csv`` and save the pipeline.

The script reads the dataset, holds out 20% of the rows for testing, fits a
preprocessing + ``RandomForestRegressor`` pipeline on the remainder, prints
MAE / MSE / RMSE / R² for the held-out rows, and serializes the fitted
pipeline to ``random_forest_pipeline.pkl`` with joblib.
"""
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# --- Data ------------------------------------------------------------------
# The CSV path is relative, so it is resolved against the working directory.
df = pd.read_csv("insurance.csv")

# Column roles: one regression target, three numeric and three categorical
# predictors.
target = "charges"
numerical_features = ["age", "bmi", "children"]
categorical_features = ["sex", "smoker", "region"]

print("Creating data subsets")

feature_columns = numerical_features + categorical_features
X = df[feature_columns]
y = df[target]

# Fixed seed keeps the 80/20 split reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Preprocessing ---------------------------------------------------------
# Numeric columns: fill gaps with the column median, then standardize.
numerical_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at predict time instead
# of raising.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each sub-pipeline to its own column group.
transformers = (
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)
preprocessor = make_column_transformer(*transformers)

# --- Model -----------------------------------------------------------------
# Hyperparameters are hard-coded here; presumably they come from an earlier
# search that is not part of this script.
# NOTE(review): with min_samples_leaf=4 a node needs at least 8 samples to be
# split, so min_samples_split=3 likely never binds - confirm before retuning.
model_random_forest = RandomForestRegressor(
    n_estimators=125,
    min_samples_split=3,
    min_samples_leaf=4,
    max_depth=25,
    random_state=42,
    n_jobs=-1,
)

print("Estimating Best Model Pipeline")

# Preprocessing is fitted together with the regressor, so the saved artifact
# accepts raw feature frames.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", model_random_forest),
]
model_pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training rows only, then score the held-out rows.
model_pipeline.fit(Xtrain, ytrain)
y_pred = model_pipeline.predict(Xtest)

# --- Evaluation ------------------------------------------------------------
mae = mean_absolute_error(ytest, y_pred)
mse = mean_squared_error(ytest, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE rather than recomputed
r2 = r2_score(ytest, y_pred)

print("Logging Metrics")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")

# --- Persistence -----------------------------------------------------------
print("Serializing Model")

# The whole pipeline (preprocessor + regressor) goes into a single file.
saved_model_path = "random_forest_pipeline.pkl"
joblib.dump(model_pipeline, saved_model_path)

print(f"Model saved as {saved_model_path}")