Spaces:

kgauvin603
/

HealthyLife-Insurance-Charge-Prediction

Runtime error

App Files Files Community

HealthyLife-Insurance-Charge-Prediction / train.py

kgauvin603

Rename 19MAY24-1.train.py to train.py

29c8870 verified 10 months ago

raw

history blame

1.75 kB

	import joblib
	from sklearn.datasets import fetch_openml
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import make_column_transformer
	from sklearn.pipeline import make_pipeline
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LinearRegression
	from sklearn.metrics import mean_squared_error, r2_score
	import numpy as np
	import pandas as pd

	# Training script: fit a linear-regression pipeline on the insurance data and
	# serialize the fitted pipeline with joblib.
	# Reads "insurance.csv" from the current working directory and writes
	# "model.joblib" next to it.

	# Read data
	df = pd.read_csv("insurance.csv")

	# Name of the continuous target column. Defined before its first use so the
	# column name is spelled out in exactly one place.
	target = 'charges'

	print("Creating data subsets")

	# Split the frame into the target (y) and the remaining features (X).
	y = df[target]
	X = df.drop(target, axis=1)

	print("Data subsets created")

	# Partition the feature columns by dtype: numeric columns are scaled and every
	# other column is one-hot encoded. Selecting on 'number' (rather than the
	# literal int64/float64/object dtypes) keeps the two lists exhaustive, so no
	# column is silently discarded by the column transformer's default
	# remainder='drop' when it arrives as int32, float32, bool or a string dtype.
	numeric_features = X.select_dtypes(include='number').columns.tolist()
	categorical_features = X.select_dtypes(exclude='number').columns.tolist()

	# Hold out 20% of the rows for evaluation; random_state=42 makes the split
	# reproducible.
	Xtrain, Xtest, ytrain, ytest = train_test_split(
	    X, y,
	    test_size=0.2,
	    random_state=42,
	)

	print("Preprocessing Data")

	# handle_unknown='ignore' lets the encoder accept categories at prediction
	# time that were not present in the training split.
	preprocessor = make_column_transformer(
	    (StandardScaler(), numeric_features),
	    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
	)

	model_linear_regression = LinearRegression(n_jobs=-1)

	print("Estimating Model Pipeline")

	# Bundle preprocessing and the estimator so the serialized artifact accepts
	# raw feature rows directly.
	model_pipeline = make_pipeline(
	    preprocessor,
	    model_linear_regression,
	)

	model_pipeline.fit(Xtrain, ytrain)

	print("Logging Metrics")

	# Predict once on the held-out split and reuse the result for every metric.
	test_predictions = model_pipeline.predict(Xtest)
	print(f"R-squared: {r2_score(ytest, test_predictions)}")
	# RMSE is in the same units as the target; the square root is taken with
	# numpy so the call does not depend on a particular scikit-learn version.
	print(f"RMSE: {np.sqrt(mean_squared_error(ytest, test_predictions))}")

	print("Serializing Model")

	saved_model_path = "model.joblib"

	joblib.dump(model_pipeline, saved_model_path)