Spaces:

VinnoGS
/

Insurance7

Sleeping

App Files Files Community

Insurance7 / train.py

VinnoGS

Upload 5 files

9252403 verified 12 months ago

raw

history blame contribute delete

2.42 kB


	import joblib
	import pandas as pd
	import numpy as np
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import make_column_transformer
	from sklearn.impute import SimpleImputer
	from sklearn.pipeline import Pipeline
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

	# Read the insurance records from the CSV in the working directory.
	df = pd.read_csv("insurance.csv")

	# Column roles: the regression target, plus the numeric and categorical
	# predictor groups (each group gets its own preprocessing later on).
	target = "charges"
	numerical_features = ["age", "bmi", "children"]
	categorical_features = ["sex", "smoker", "region"]

	print("Creating data subsets")

	# Feature matrix (numeric columns first, then categorical) and target vector.
	X = df[[*numerical_features, *categorical_features]]
	y = df[target]

	# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
	# reproducible from run to run.
	Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

	# Numeric columns: fill missing values with the column median, then scale
	# with StandardScaler.
	numerical_pipeline = Pipeline(
		steps=[
			("imputer", SimpleImputer(strategy="median")),
			("scaler", StandardScaler()),
		]
	)

	# Categorical columns: fill missing values with the most frequent category,
	# then one-hot encode. handle_unknown="ignore" keeps prediction from failing
	# on a category that was not present when the encoder was fitted.
	categorical_pipeline = Pipeline(
		steps=[
			("imputer", SimpleImputer(strategy="most_frequent")),
			("onehot", OneHotEncoder(handle_unknown="ignore")),
		]
	)

	# Route each column group through its own sub-pipeline.
	preprocessor = make_column_transformer(
		(numerical_pipeline, numerical_features),
		(categorical_pipeline, categorical_features),
	)

	# Random forest regressor using the hyperparameters the script labels as
	# "best" (how they were selected is not shown in this file). The fixed seed
	# makes training repeatable; n_jobs=-1 uses all available cores.
	model_random_forest = RandomForestRegressor(
		n_estimators=125,
		max_depth=25,
		min_samples_split=3,
		min_samples_leaf=4,
		n_jobs=-1,
		random_state=42,
	)

	print("Estimating Best Model Pipeline")

	# Chain preprocessing and the regressor so both are fitted, applied and
	# later serialized as a single estimator.
	model_pipeline = Pipeline(
		steps=[
			("preprocessor", preprocessor),
			("regressor", model_random_forest),
		]
	)

	# Fit preprocessing + forest on the training split, then score the held-out
	# rows.
	model_pipeline.fit(Xtrain, ytrain)
	y_pred = model_pipeline.predict(Xtest)

	# Regression metrics on the test split; RMSE is derived from MSE.
	mae = mean_absolute_error(ytest, y_pred)
	mse = mean_squared_error(ytest, y_pred)
	r2 = r2_score(ytest, y_pred)
	rmse = np.sqrt(mse)

	print("Logging Metrics")
	# One metric per output line.
	print(
		f"Mean Absolute Error (MAE): {mae}",
		f"Mean Squared Error (MSE): {mse}",
		f"Root Mean Squared Error (RMSE): {rmse}",
		f"R-squared (R²): {r2}",
		sep="\n",
	)

	print("Serializing Model")

	# Persist the whole fitted pipeline (preprocessing + regressor) with joblib,
	# writing to a path relative to the current working directory.
	saved_model_path = "random_forest_pipeline.pkl"
	joblib.dump(value=model_pipeline, filename=saved_model_path)
	print(f"Model saved as {saved_model_path}")