"""Train and serialize a linear-regression model for insurance charges.

Reads the insurance dataset, cleans it, preprocesses numerical and
categorical features, fits a LinearRegression pipeline, reports RMSE and
R-squared on a held-out test split, and dumps the fitted pipeline with
joblib.
"""

# pandas & numpy handle reading and manipulating the data.
import pandas as pd
import numpy as np

import joblib
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the data into a DataFrame.
path = '/content/sample_data/insurance.csv'
df_raw = pd.read_csv(path)

# Remove duplicated rows so identical records cannot straddle the
# train/test split and inflate the evaluation.
df_raw.drop_duplicates(inplace=True)

# Drop the 'index' column, which is not required for modelling.
# errors='ignore' keeps the script working if the column is absent.
data_df = df_raw.drop(columns=['index'], errors='ignore')
data_df.info()

print("Creating data subsets")
# Target is the insurance charge; every other column is a feature.
y = data_df[['charges']]
X = data_df.drop('charges', axis=1)

# Split feature names into numerical and categorical groups.
# Both lists are derived from X (the original took cat_var from df_raw),
# so dropped/target columns cannot leak into the column transformer.
num_var = X.select_dtypes(include=np.number).columns.tolist()
cat_var = X.select_dtypes(include=object).columns.tolist()

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numerical features: impute missing values with the mean, then
# standard-scale. (The imputer was documented but missing before.)
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: impute with the most frequent value, then
# one-hot encode; categories unseen at fit time are ignored at predict.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each pipeline to its own group of columns.
preprocessor = make_column_transformer(
    (numerical_pipeline, num_var),
    (categorical_pipeline, cat_var),
)

# n_jobs=-1 uses all available cores for the least-squares solve.
model_linear_regression = LinearRegression(n_jobs=-1)

print("Estimating Model Pipeline")
# Preprocessing + estimator combined into one fit/predict object.
# (Built once; the original constructed the identical pipeline twice.)
model_pipeline = make_pipeline(preprocessor, model_linear_regression)
model_pipeline.fit(Xtrain, ytrain)

print("Logging Metrics")
# Predict once and reuse the result for every metric (the original
# re-ran predict three times).
y_pred = model_pipeline.predict(Xtest)
# np.sqrt(MSE) rather than squared=False: that keyword is deprecated
# and removed in scikit-learn >= 1.6, while this form works everywhere.
print(f"RMSE: {np.sqrt(mean_squared_error(ytest, y_pred))}")
print(f"R-squared: {r2_score(ytest, y_pred)}")

print("Serializing Model")
saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)