# NOTE: file exported from the Hugging Face web viewer (upload "Upload 2 files",
# revision 9c09830); viewer chrome removed so the file parses as Python.
# The libraries - pandas & numpy helps with reading and manipulating data
import pandas as pd
import numpy as np
# Importing train_test_split
from sklearn.model_selection import train_test_split
# Importing preprocessing modules from sklearn
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Importing make_pipeline function from pipeline module
from sklearn.pipeline import make_pipeline
import joblib
# Load the insurance dataset into a pandas DataFrame.
path = '/content/sample_data/insurance.csv'
df_raw = pd.read_csv(path)

# Remove exact duplicate rows so the same record is not learned twice.
df_raw.drop_duplicates(inplace=True)

# Drop the bookkeeping 'index' column — it carries no predictive signal.
# errors='ignore' keeps this robust if a fresh export lacks the column.
data_df = df_raw.drop(columns=['index'], errors='ignore')
data_df.info()

print("Creating data subsets")
# Target is the continuous 'charges' column; everything else is a feature.
y = data_df[['charges']]
X = data_df.drop('charges', axis=1)

# Split feature names by dtype so each group gets its own preprocessing.
# Derive BOTH lists from X (not df_raw) so the target and the dropped
# 'index' column can never leak into the feature lists.
num_var = X.select_dtypes(include=np.number).columns.tolist()
cat_var = X.select_dtypes(include=object).columns.tolist()

# Hold out 20% for evaluation; fixed seed makes the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)
# Numerical features: impute missing values with the column mean, then
# standardize to zero mean / unit variance (helps the linear model).
# (SimpleImputer was imported but never wired in, although the comments
# promised imputation — without it any NaN crashes the scaler/encoder.)
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical features: impute missing values with the most frequent
# category, then one-hot encode; handle_unknown='ignore' prevents errors
# at predict time when an unseen category appears.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its matching pipeline;
# make_column_transformer auto-names the transformers.
preprocessor = make_column_transformer(
    (numerical_pipeline, num_var),
    (categorical_pipeline, cat_var)
)
# Linear regression; n_jobs=-1 uses all available cores for the solve.
model_linear_regression = LinearRegression(n_jobs=-1)

print("Estimating Model Pipeline")
# Single end-to-end pipeline: preprocessing followed by linear regression.
# (The original built this identical pipeline twice and left a bare no-op
# `model_pipeline` expression — once is enough.)
model_pipeline = make_pipeline(
    preprocessor,
    model_linear_regression
)

# Fit on the training split only, so the scaler/encoder statistics are
# learned without peeking at the test data.
model_pipeline.fit(Xtrain, ytrain)
print("Logging Metrics")
# Predict once on the held-out split and reuse the result for both metrics
# (the original recomputed predict() three times and discarded the first).
y_pred = model_pipeline.predict(Xtest)
# squared=False returns the root mean squared error in the target's units.
print(f"RMSE: {mean_squared_error(ytest, y_pred, squared=False)}")
print(f"R-squared: {r2_score(ytest, y_pred)}")

print("Serializing Model")
# Persist the whole pipeline (preprocessing + model) so inference can be
# run on raw feature rows without re-fitting the transformers.
saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)