mjocp54 committed on
Commit
56ef21c
1 Parent(s): a3cc6e8

Upload train.py

Browse files
Files changed (1) hide show
  1. train.py +59 -0
train.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Training script: fits a linear-regression pipeline that predicts the
# `charges` column of insurance.csv, prints its R-squared on a held-out
# split, and serializes the fitted pipeline to model.joblib.

import joblib

import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the raw dataset from the current working directory.
df = pd.read_csv("insurance.csv")

# Column roles: the regression target, then the numeric and categorical
# predictor columns.
target = 'charges'
numeric_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region']

print("Creating Data Subsets")

# Feature matrix (numeric columns first, then categorical) and target vector.
X = df[[*numeric_features, *categorical_features]]
y = df[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown='ignore'), categorical_features),
)

model_linear_regression = LinearRegression(n_jobs=-1)

print("Estimating Model Pipeline")

# Chain preprocessing and the regressor so both are fitted on the training
# split only and are serialized together as one object.
model_pipeline = make_pipeline(preprocessor, model_linear_regression)
model_pipeline.fit(Xtrain, ytrain)

print("Logging Metrics")
# Score the fitted pipeline on the held-out split.
test_predictions = model_pipeline.predict(Xtest)
print(f"R-squared: {r2_score(ytest, test_predictions)}")

print("Serializing Model")

# Persist the whole fitted pipeline (preprocessing + model).
saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)