# Insurance2 / train.py
# VinnoGS's picture
# Upload 5 files
# ad2de74 verified
# raw
# history blame contribute delete
# No virus
# 2.42 kB
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
# Read the insurance records; the relative path is resolved against the
# current working directory.
df = pd.read_csv("insurance.csv")

# Column roles: one regression target, plus the numeric and categorical
# predictors that feed the preprocessing step.
target = 'charges'
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

print("Creating data subsets")
# Feature frame keeps numeric columns first, then categorical ones.
X = df[[*numerical_features, *categorical_features]]
y = df[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
# Numeric columns: fill missing values with the column median, then
# standardise with StandardScaler.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps prediction from failing
# on categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its matching sub-pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)
# Random forest regressor with fixed hyperparameters.
# NOTE(review): these are described as the "best parameters"; presumably they
# come from a tuning run that is not part of this script -- confirm the source.
model_random_forest = RandomForestRegressor(
    n_estimators=125,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=4,
    n_jobs=-1,        # use every available core
    random_state=42,  # reproducible forest
)

print("Estimating Best Model Pipeline")
# Single estimator object: preprocessing followed by the regressor, so the
# same transformations are applied at fit time and at predict time.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model_random_forest),
])
# Fit on the training split and immediately score the held-out rows
# (Pipeline.fit returns the fitted pipeline itself, so the calls can chain).
y_pred = model_pipeline.fit(Xtrain, ytrain).predict(Xtest)

# Evaluate the predictions against the held-out targets.
mae, mse, r2 = (
    metric(ytest, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)  # RMSE is reported in the same units as the target

print("Logging Metrics")
# One print call, one metric per line.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)
# Destination of the serialised pipeline, relative to the working directory.
saved_model_path = "random_forest_pipeline.pkl"

print("Serializing Model")
# Persist the whole fitted pipeline (preprocessing + regressor) so it can be
# reloaded later with joblib.load and used directly on raw feature frames.
joblib.dump(value=model_pipeline, filename=saved_model_path)
print(f"Model saved as {saved_model_path}")