ravi6k committed
Commit 9c09830
1 Parent(s): 8a1e680

Upload 2 files

Files changed (2)
  1. app.py +1 -1
  2. train.py +89 -0
app.py CHANGED
@@ -14,7 +14,7 @@ from pathlib import Path
  # model with the filename 'model.joblib'
 
  # Load the freshly trained model from disk
- insurance_charge_mlops = joblib.load('/content/model.joblib')
+ insurance_charge_mlops = joblib.load('model.joblib')
 
  # Prepare the logging functionality
  log_file = Path("logs/") / f"data_{uuid.uuid4()}.json"
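Note: removing the hard-coded /content/ prefix makes app.py load the model from the process's working directory, which matches how a Hugging Face Space runs app.py from the repository root. If the working directory cannot be guaranteed, a minimal sketch like the following resolves the path relative to the script itself (the insurance_charge_mlops name is taken from the diff; that model.joblib sits next to app.py is an assumption):

import joblib
from pathlib import Path

# Resolve model.joblib relative to this script instead of relying on
# the current working directory (assumes the model file is committed
# alongside app.py).
MODEL_PATH = Path(__file__).resolve().parent / "model.joblib"
insurance_charge_mlops = joblib.load(MODEL_PATH)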
train.py ADDED
@@ -0,0 +1,89 @@
+
+ # The libraries - pandas & numpy help with reading and manipulating data
+ import pandas as pd
+ import numpy as np
+
+ # Importing train_test_split
+ from sklearn.model_selection import train_test_split
+
+ # Importing preprocessing modules from sklearn
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
+ from sklearn.compose import make_column_transformer
+ from sklearn.impute import SimpleImputer
+ from sklearn.pipeline import Pipeline, make_pipeline
+
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import mean_squared_error, r2_score
+
+ import joblib
+
+ # Read the data into a pandas DataFrame
+ path = '/content/sample_data/insurance.csv'
+ df_raw = pd.read_csv(path)
+
+ # Remove duplicated rows
+ df_raw.drop_duplicates(inplace=True)
+
+ # Drop the index column, which is not required for modelling
+ data_df = df_raw.drop(columns=['index'])
+ data_df.info()
+
+ print("Creating data subsets")
+ # Separate the target ('charges') from the features
+ y = data_df[['charges']]
+ X = data_df.drop('charges', axis=1)
+
+ # Split the feature columns into numerical and categorical
+ num_var = [var for var in X.select_dtypes(include=np.number)]
+ cat_var = [var for var in X.select_dtypes(include=object)]
+
+ Xtrain, Xtest, ytrain, ytest = train_test_split(
+     X, y,
+     test_size=0.2,
+     random_state=42
+ )
+
+ # Pipeline for numerical features: impute missing values with the mean,
+ # then apply standard scaling
+ numerical_pipeline = Pipeline([
+     ('imputer', SimpleImputer(strategy='mean')),
+     ('scaler', StandardScaler())
+ ])
+
+ # Pipeline for categorical features: impute missing values with the most
+ # frequent value, then one-hot encode with handling of unknown categories
+ categorical_pipeline = Pipeline([
+     ('imputer', SimpleImputer(strategy='most_frequent')),
+     ('onehot', OneHotEncoder(handle_unknown='ignore'))
+ ])
+
+ # Column transformer that applies each pipeline to its feature subset
+ preprocessor = make_column_transformer(
+     (numerical_pipeline, num_var),
+     (categorical_pipeline, cat_var)
+ )
+
+ # Linear regression with parallel processing enabled
+ # (n_jobs=-1 uses all available cores)
+ model_linear_regression = LinearRegression(n_jobs=-1)
+
+ print("Estimating Model Pipeline")
+
+ # Pipeline combining the preprocessing steps (imputation, scaling and
+ # encoding) with the linear regression model
+ model_pipeline = make_pipeline(
+     preprocessor,              # apply preprocessing steps
+     model_linear_regression    # train the linear regression model
+ )
+ model_pipeline.fit(Xtrain, ytrain)
+
+ print("Logging Metrics")
+ # Make predictions on the test data and report the metrics
+ ypred = model_pipeline.predict(Xtest)
+
+ print(f"RMSE: {mean_squared_error(ytest, ypred, squared=False)}")
+ print(f"R-squared: {r2_score(ytest, ypred)}")
+
+ print("Serializing Model")
+ saved_model_path = "model.joblib"
+ joblib.dump(model_pipeline, saved_model_path)
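As a quick smoke test of the serialized pipeline, a sketch along these lines can be run after train.py completes. Only model.joblib and the 'charges' target come from the script above; the feature column names (age, sex, bmi, children, smoker, region) are an assumption based on the standard insurance dataset and should be checked against insurance.csv:

import joblib
import pandas as pd

# Load the pipeline that train.py serialized.
pipeline = joblib.load("model.joblib")

# One hypothetical input row; the column names assume the usual
# insurance.csv schema and are not confirmed by the diff above.
sample = pd.DataFrame([{
    "age": 35, "sex": "male", "bmi": 27.5,
    "children": 1, "smoker": "no", "region": "southeast"
}])

# The fitted column transformer selects columns by name, so the sample
# DataFrame must carry the same feature columns as Xtrain.
print(pipeline.predict(sample))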