ravi6k committed
Commit 9c09830
1 Parent(s): 8a1e680

Upload 2 files

Files changed (2)
  1. app.py +1 -1
  2. train.py +89 -0
app.py CHANGED
@@ -14,7 +14,7 @@ from pathlib import Path
  # model with the filename 'model.joblib'
 
  # Load the freshly trained model from disk
- insurance_charge_mlops = joblib.load('/content/model.joblib')
+ insurance_charge_mlops = joblib.load('model.joblib')
 
  # Prepare the logging functionality
  log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
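Note: removing the hard-coded /content/ prefix makes app.py load the model from the process's working directory, which matches how a Hugging Face Space runs app.py from the repository root. If the working directory cannot be guaranteed, a minimal sketch like the following resolves the path relative to the script itself (the insurance_charge_mlops name is taken from the diff; that model.joblib sits next to app.py is an assumption):

import joblib
from pathlib import Path

# Resolve model.joblib relative to this script instead of relying on
# the current working directory (assumes the model file is committed
# alongside app.py).
MODEL_PATH = Path(__file__).resolve().parent / "model.joblib"
insurance_charge_mlops = joblib.load(MODEL_PATH)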
train.py ADDED
@@ -0,0 +1,89 @@
+
+ # The libraries - pandas & numpy help with reading and manipulating data
+ import pandas as pd
+ import numpy as np
+
+ # Importing train_test_split
+ from sklearn.model_selection import train_test_split
+
+ # Importing preprocessing modules from sklearn
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
+ from sklearn.compose import make_column_transformer
+ from sklearn.impute import SimpleImputer
+ from sklearn.pipeline import Pipeline, make_pipeline
+
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import mean_squared_error, r2_score
+
+ import joblib
+
+ # Read the data into a pandas DataFrame
+ path = '/content/sample_data/insurance.csv'
+ df_raw = pd.read_csv(path)
+
+ # Remove duplicated rows
+ df_raw.drop_duplicates(inplace=True)
+
+ # Drop the index column, which is not required for modelling
+ data_df = df_raw.drop(columns=['index'])
+ data_df.info()
+
+ print("Creating data subsets")
+ # Separate the target ('charges') from the features
+ y = data_df[['charges']]
+ X = data_df.drop('charges', axis=1)
+
+ # Split the feature columns into numerical and categorical
+ num_var = [var for var in X.select_dtypes(include=np.number)]
+ cat_var = [var for var in X.select_dtypes(include=object)]
+
+ Xtrain, Xtest, ytrain, ytest = train_test_split(
+     X, y,
+     test_size=0.2,
+     random_state=42
+ )
+
+ # Pipeline for numerical features: impute missing values with the mean,
+ # then apply standard scaling
+ numerical_pipeline = Pipeline([
+     ('imputer', SimpleImputer(strategy='mean')),
+     ('scaler', StandardScaler())
+ ])
+
+ # Pipeline for categorical features: impute missing values with the most
+ # frequent value, then one-hot encode with handling of unknown categories
+ categorical_pipeline = Pipeline([
+     ('imputer', SimpleImputer(strategy='most_frequent')),
+     ('onehot', OneHotEncoder(handle_unknown='ignore'))
+ ])
+
+ # Column transformer that applies each pipeline to its feature subset
+ preprocessor = make_column_transformer(
+     (numerical_pipeline, num_var),
+     (categorical_pipeline, cat_var)
+ )
+
+ # Linear regression with parallel processing enabled
+ # (n_jobs=-1 uses all available cores)
+ model_linear_regression = LinearRegression(n_jobs=-1)
+
+ print("Estimating Model Pipeline")
+
+ # Pipeline combining the preprocessing steps (imputation, scaling and
+ # encoding) with the linear regression model
+ model_pipeline = make_pipeline(
+     preprocessor,              # apply preprocessing steps
+     model_linear_regression    # train the linear regression model
+ )
+ model_pipeline.fit(Xtrain, ytrain)
+
+ print("Logging Metrics")
+ # Make predictions on the test data and report the metrics
+ ypred = model_pipeline.predict(Xtest)
+
+ print(f"RMSE: {mean_squared_error(ytest, ypred, squared=False)}")
+ print(f"R-squared: {r2_score(ytest, ypred)}")
+
+ print("Serializing Model")
+ saved_model_path = "model.joblib"
+ joblib.dump(model_pipeline, saved_model_path)
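As a quick smoke test of the serialized pipeline, a sketch along these lines can be run after train.py completes. Only model.joblib and the 'charges' target come from the script above; the feature column names (age, sex, bmi, children, smoker, region) are an assumption based on the standard insurance dataset and should be checked against insurance.csv:

import joblib
import pandas as pd

# Load the pipeline that train.py serialized.
pipeline = joblib.load("model.joblib")

# One hypothetical input row; the column names assume the usual
# insurance.csv schema and are not confirmed by the diff above.
sample = pd.DataFrame([{
    "age": 35, "sex": "male", "bmi": 27.5,
    "children": 1, "smoker": "no", "region": "southeast"
}])

# The fitted column transformer selects columns by name, so the sample
# DataFrame must carry the same feature columns as Xtrain.
print(pipeline.predict(sample))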