abhishekrs4 committed
Commit 77b575f
1 Parent(s): 76a5f1b

updated scripts in the modeling module

modeling/__init__.py CHANGED
@@ -1 +1,3 @@
- import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+ import os, sys
+
+ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
modeling/data_utils.py CHANGED
@@ -1,16 +1,72 @@
  import numpy as np
  import pandas as pd

- def read_csv_file(file_csv):
-     df_csv = pd.read_csv(file_csv)
-     return df_csv
+ from sklearn.model_selection import train_test_split
+
+
+ class WaterPotabilityDataLoader:
+     def __init__(self, file_csv, test_size=0.1, random_state=4):
+         self.file_csv = file_csv
+         self.test_size = test_size
+         self.random_state = random_state
+
+         self.df_csv = None
+         self.df_train = None
+         self.df_test = None
+         self.X_train = None
+         self.Y_train = None
+         self.X_test = None
+         self.Y_test = None
+
+     def read_csv_file(self):
+         self.df_csv = pd.read_csv(self.file_csv)
+         return
+
+     def split_data(self):
+         self.df_train, self.df_test = train_test_split(
+             self.df_csv, test_size=self.test_size, random_state=self.random_state
+         )
+         return
+
+     def get_data_from_data_frame(self, which_set="train"):
+         """
+         ---------
+         Arguments
+         ---------
+         which_set : str
+             a string indicating for which set the data arrays should be returned
+
+         -------
+         Returns
+         -------
+         (X_arr, Y_arr) : tuple
+             a tuple of numpy arrays of features and labels for the appropriate set
+         """
+         if which_set == "train":
+             data_frame = self.df_train
+         else:
+             data_frame = self.df_test
+         arr = data_frame.to_numpy()
+         X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
+         return X_arr, Y_arr
+

  def get_dict_nan_counts_per_col(data_frame):
+     """
+     ---------
+     Arguments
+     ---------
+     data_frame : pd.DataFrame
+         a pandas dataframe of some dataset
+
+     -------
+     Returns
+     -------
+     dict_nan_counts_per_col : dict
+         a dictionary of NaN counts per column
+     """
      dict_nan_counts_per_col = data_frame.isna().sum().to_dict()
-     dict_nan_counts_per_col = dict(sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True))
+     dict_nan_counts_per_col = dict(
+         sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True)
+     )
      return dict_nan_counts_per_col
-
- def get_data_from_data_frame(data_frame):
-     arr = data_frame.to_numpy()
-     X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
-     return X_arr, Y_arr
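
A minimal usage sketch for the new loader, assuming the repo-default CSV path and the method behavior defined in the diff above:

    from data_utils import WaterPotabilityDataLoader

    loader = WaterPotabilityDataLoader("dataset/water_potability.csv")
    loader.read_csv_file()   # populates loader.df_csv
    loader.split_data()      # populates loader.df_train and loader.df_test
    X_train, Y_train = loader.get_data_from_data_frame(which_set="train")
    X_test, Y_test = loader.get_data_from_data_frame(which_set="test")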
modeling/eda.py CHANGED
@@ -6,10 +6,13 @@ import numpy as np
  import seaborn as sns
  import matplotlib.pyplot as plt

- from data_utils import read_csv_file, get_data_from_data_frame
+ from data_utils import WaterPotabilityDataLoader
+

  def do_eda(ARGS):
-     data_frame = read_csv_file(ARGS.file_csv)
+     water_pot_dataset = WaterPotabilityDataLoader(ARGS.file_csv)
+     water_pot_dataset.read_csv_file()
+     data_frame = water_pot_dataset.df_csv
      label_counts = dict(data_frame[ARGS.target_column].value_counts())
      # print(label_counts)

@@ -54,6 +57,7 @@ def do_eda(ARGS):

      return

+
  def main():
      file_csv = "dataset/water_potability.csv"
      target_column = "Potability"
@@ -61,13 +65,19 @@ def main():
      parser = argparse.ArgumentParser(
          formatter_class=argparse.ArgumentDefaultsHelpFormatter
      )
-     parser.add_argument("--file_csv", default=file_csv,
-         type=str, help="full path to dataset csv file")
-     parser.add_argument("--target_column", default=target_column,
-         type=str, help="target label for which the EDA needs to be done")
+     parser.add_argument(
+         "--file_csv", default=file_csv, type=str, help="full path to dataset csv file"
+     )
+     parser.add_argument(
+         "--target_column",
+         default=target_column,
+         type=str,
+         help="target label for which the EDA needs to be done",
+     )
      ARGS, unparsed = parser.parse_known_args()
      do_eda(ARGS)
      return

+
  if __name__ == "__main__":
      main()
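
A sketch of invoking the updated EDA script, assuming it is run from the repository root (the defaults already point at the water potability dataset):

    python modeling/eda.py --file_csv dataset/water_potability.csv --target_column Potability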
modeling/ml_model_dev.py CHANGED
@@ -12,144 +12,395 @@ import lightgbm as lgbm
  from sklearn.svm import SVC
  from sklearn.decomposition import PCA
  from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import StandardScaler
  from sklearn.linear_model import LogisticRegression
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.metrics import accuracy_score, f1_score, make_scorer
  from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
  from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
- from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, KFold
-
- from data_utils import read_csv_file, get_data_from_data_frame
-
-
- def load_ml_model(pkl_file_name):
-     model_pipeline = mlflow.sklearn.load_model(pkl_file_name)
+ from sklearn.model_selection import (
+     cross_validate,
+     GridSearchCV,
+     KFold,
+ )
+ from sklearn.preprocessing import (
+     StandardScaler,
+     MinMaxScaler,
+     Normalizer,
+     PolynomialFeatures,
+     PowerTransformer,
+     RobustScaler,
+ )
+
+ from data_utils import WaterPotabilityDataLoader
+
+
+ def load_mlflow_model(dir_mlflow_model):
+     """
+     ---------
+     Arguments
+     ---------
+     dir_mlflow_model : str
+         full directory path of the mlflow model
+
+     -------
+     Returns
+     -------
+     model_pipeline : object
+         an object of the mlflow sklearn model pipeline
+     """
+     model_pipeline = mlflow.sklearn.load_model(dir_mlflow_model)
      return model_pipeline

- def get_imputer(imputer_type):
-     # setup parameter search space for different imputers
-     imputer, imputer_params = None, None
-     if imputer_type == "simple":
-         imputer = SimpleImputer()
-         imputer_params = {
-             "imputer__strategy": ["mean", "median", "most_frequent"],
-         }
-     elif imputer_type == "knn":
-         imputer = KNNImputer()
-         imputer_params = {
-             "imputer__n_neighbors": [5, 7],
-             "imputer__weights": ["uniform", "distance"],
-         }
-     elif imputer_type == "iterative":
-         imputer = IterativeImputer()
-         imputer_params = {
-             "imputer__initial_strategy": ["mean", "median", "most_frequent"],
-             "imputer__imputation_order": ["ascending", "descending"],
-         }
-     else:
-         print(f"unidentified option for arg, imputer_type: {imputer_type}")
-         sys.exit(0)
-     return imputer, imputer_params
-
- def get_scaler():
-     scaler = StandardScaler()
-     return scaler
-
- def get_pca(max_num_feats):
-     pca = PCA()
-     pca_params = {
-         "pca__n_components": np.arange(2, max_num_feats+1),
-     }
-     return pca, pca_params
-
- def get_classifier(classifier_type):
-     # setup parameter search space for different classifiers
-
-     classifier, classifier_params = None, None
-     if classifier_type == "ada_boost":
-         classifier = AdaBoostClassifier()
-         classifier_params = {
-             "classifier__learning_rate": [0.5, 1, 1.5, 2, 2.5, 3],
-             "classifier__n_estimators": [100, 200, 500],
-         }
-     elif classifier_type == "log_reg":
-         classifier = LogisticRegression(max_iter=200, solver="saga")
-         classifier_params = {
-             "classifier__penalty": ["l1", "l2", "elasticnet"],
-             "classifier__class_weight": [None, "balanced"],
-             "classifier__C": [0.1, 0.5, 1, 2],
-             "classifier__l1_ratio": np.arange(0.1, 1, 0.1),
-         }
-     elif classifier_type == "random_forest":
-         classifier = RandomForestClassifier()
-         classifier_params = {
-             "classifier__n_estimators": [100, 250],
-             "classifier__criterion": ["gini", "entropy"],
-             "classifier__max_depth": [None, 10, 25, 50, 75],
-             "classifier__min_samples_leaf": [1, 5, 10, 20],
-             "classifier__min_samples_split": [2, 3, 4, 5],
-         }
-     elif classifier_type == "svc":
-         classifier = SVC()
-         classifier_params = {
-             "classifier__C": [0.5, 1, 1.5, 2, 2.5],
-             "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
-             "classifier__degree": [2, 3, 4],
-         }
-     elif classifier_type == "light_gbm":
-         classifier = lgbm.LGBMClassifier(
-             boosting_type="gbdt", objective="binary", metric="auc", verbosity=-1)
-         classifier_params = {
-             "classifier__num_leaves": [15, 31, 63, 127, 255],
-             "classifier__learning_rate": [0.1, 0.5, 1, 2],
-             "classifier__n_estimators": [100, 500, 1000],
-             "classifier__reg_lambda": [0.1, 0.5, 1],
-             "classifier__min_data_in_leaf": [10, 20, 30, 50],
-         }
-     else:
-         print(f"unidentified option for arg, classifier_type: {classifier_type}")
-         sys.exit(0)
-
-     return classifier, classifier_params
-
- def get_pipeline_params(imputer_params, classifier_params):
-     pipeline_params = {**imputer_params, **classifier_params}
-     return pipeline_params
-
- def train_model(df_train, df_test, imputer_type, classifier_type):
+
+ class ClassificationPipeline:
+     def __init__(self):
+         self._imputer = None
+         self._imputer_params = None
+         self._preprocessor = None
+         self._preprocessor_params = None
+         self._transformer = None
+         self._pca = None
+         self._pca_params = None
+         self._classifier = None
+         self._classifier_params = None
+
+         self.clf_pipeline = None
+         self.clf_pipeline_params = None
+
+     def set_imputer(self, imputer_type):
+         """
+         ---------
+         Arguments
+         ---------
+         imputer_type : str
+             a string indicating the type of imputer to be used in the ML pipeline
+         """
+
+         # setup parameter search space for different imputers
+         imputer, imputer_params = None, None
+         if imputer_type == "simple":
+             imputer = SimpleImputer()
+             imputer_params = {
+                 "imputer__strategy": ["mean", "median", "most_frequent"],
+             }
+         elif imputer_type == "knn":
+             imputer = KNNImputer()
+             imputer_params = {
+                 "imputer__n_neighbors": [5, 7],
+                 "imputer__weights": ["uniform", "distance"],
+             }
+         elif imputer_type == "iterative":
+             imputer = IterativeImputer()
+             imputer_params = {
+                 "imputer__initial_strategy": ["mean", "median", "most_frequent"],
+                 "imputer__imputation_order": ["ascending", "descending"],
+             }
+         else:
+             print(f"unidentified option for arg, imputer_type: {imputer_type}")
+             sys.exit(0)
+
+         self._imputer = imputer
+         self._imputer_params = imputer_params
+         return
+
+     def set_preprocessor(self, preprocessor_type):
+         """
+         ---------
+         Arguments
+         ---------
+         preprocessor_type : str
+             a string indicating the type of preprocessor to be used in the ML pipeline
+         """
+         if preprocessor_type == "std":
+             preprocessor = StandardScaler()
+             preprocessor_params = None
+         elif preprocessor_type == "min_max":
+             # range must be positive for box-cox transformation,
+             # so use the min max scaler with a proper range;
+             # use the same range even without that transformation
+             preprocessor = MinMaxScaler(feature_range=(1, 2), clip=True)
+             preprocessor_params = None
+         elif preprocessor_type == "norm":
+             preprocessor = Normalizer()
+             preprocessor_params = {
+                 "preprocessor__norm": ["l1", "l2", "max"],
+             }
+         elif preprocessor_type == "poly":
+             preprocessor = PolynomialFeatures()
+             preprocessor_params = {
+                 "preprocessor__degree": [2],
+                 "preprocessor__interaction_only": [True, False],
+                 "preprocessor__include_bias": [True, False],
+             }
+         elif preprocessor_type == "robust":
+             preprocessor = RobustScaler()
+             preprocessor_params = None
+         else:
+             print(
+                 f"unidentified option for arg, preprocessor_type: {preprocessor_type}"
+             )
+             sys.exit(0)
+
+         self._preprocessor = preprocessor
+         self._preprocessor_params = preprocessor_params
+         return
+
+     def set_transformer(self, transformer_type):
+         """
+         ---------
+         Arguments
+         ---------
+         transformer_type : str
+             a string indicating the transformer type to be used in the ML pipeline
+         """
+         if transformer_type == "power_box_cox":
+             self._transformer = PowerTransformer(method="box-cox")
+         elif transformer_type == "power_yeo_johnson":
+             self._transformer = PowerTransformer(method="yeo-johnson")
+         else:
+             print(f"unidentified option for arg, transformer_type: {transformer_type}")
+             sys.exit(0)
+         return
+
+     def set_pca(self, max_num_feats):
+         """
+         ---------
+         Arguments
+         ---------
+         max_num_feats : int
+             an integer indicating the maximum number of features in the dataset
+         """
+         self._pca = PCA()
+         self._pca_params = {
+             "pca__n_components": np.arange(2, max_num_feats + 1),
+         }
+         return
+
+     def set_classifier(self, classifier_type):
+         """
+         ---------
+         Arguments
+         ---------
+         classifier_type : str
+             a string indicating the type of classifier to be used in the ML pipeline
+         """
+
+         # setup parameter search space for different classifiers
+         classifier, classifier_params = None, None
+         if classifier_type == "ada_boost":
+             classifier = AdaBoostClassifier()
+             classifier_params = {
+                 "classifier__learning_rate": [0.5, 1, 1.5, 2, 2.5, 3],
+                 "classifier__n_estimators": [100, 200, 500],
+             }
+         elif classifier_type == "log_reg":
+             classifier = LogisticRegression(max_iter=200, solver="saga")
+             classifier_params = {
+                 "classifier__penalty": ["l1", "l2", "elasticnet"],
+                 "classifier__class_weight": [None, "balanced"],
+                 "classifier__C": [0.1, 0.5, 1, 2],
+                 "classifier__l1_ratio": np.arange(0.1, 1, 0.1),
+             }
+         elif classifier_type == "random_forest":
+             classifier = RandomForestClassifier()
+             classifier_params = {
+                 "classifier__n_estimators": [100, 250],
+                 "classifier__criterion": ["gini", "entropy"],
+                 "classifier__max_depth": [None, 10, 25, 50, 75],
+                 "classifier__min_samples_leaf": [1, 5, 10, 20],
+                 "classifier__min_samples_split": [2, 3, 4, 5],
+             }
+         elif classifier_type == "svc":
+             classifier = SVC()
+             classifier_params = {
+                 "classifier__C": [0.5, 1, 1.5, 2, 2.5],
+                 "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
+                 "classifier__degree": [2, 3, 4],
+             }
+         elif classifier_type == "light_gbm":
+             classifier = lgbm.LGBMClassifier(
+                 boosting_type="gbdt", objective="binary", metric="auc", verbosity=-1
+             )
+             classifier_params = {
+                 "classifier__num_leaves": [15, 31, 63, 127, 255],
+                 "classifier__learning_rate": [0.1, 0.5, 1, 2],
+                 "classifier__n_estimators": [100, 500, 1000],
+                 "classifier__reg_lambda": [0.1, 0.5, 1],
+                 "classifier__min_data_in_leaf": [10, 20, 30, 50],
+             }
+         else:
+             print(f"unidentified option for arg, classifier_type: {classifier_type}")
+             sys.exit(0)
+
+         self._classifier = classifier
+         self._classifier_params = classifier_params
+         return
+
+     def build_pipeline(self):
+         if self._pca == None:
+             if self._preprocessor == None:
+                 self.clf_pipeline = Pipeline(
+                     [("imputer", self._imputer), ("classifier", self._classifier)]
+                 )
+                 list_pipeline_params = [self._imputer_params, self._classifier_params]
+             else:
+                 if self._transformer == None:
+                     self.clf_pipeline = Pipeline(
+                         [
+                             ("imputer", self._imputer),
+                             ("preprocessor", self._preprocessor),
+                             ("classifier", self._classifier),
+                         ]
+                     )
+                 else:
+                     self.clf_pipeline = Pipeline(
+                         [
+                             ("imputer", self._imputer),
+                             ("preprocessor", self._preprocessor),
+                             ("transformer", self._transformer),
+                             ("classifier", self._classifier),
+                         ]
+                     )
+                 if self._preprocessor_params is not None:
+                     list_pipeline_params = [
+                         self._imputer_params,
+                         self._preprocessor_params,
+                         self._classifier_params,
+                     ]
+                 else:
+                     list_pipeline_params = [
+                         self._imputer_params,
+                         self._classifier_params,
+                     ]
+         else:
+             # preprocessing is a must for applying PCA
+             self.clf_pipeline = Pipeline(
+                 [
+                     ("imputer", self._imputer),
+                     ("preprocessor", self._preprocessor),
+                     ("pca", self._pca),
+                     ("classifier", self._classifier),
+                 ]
+             )
+             if self._preprocessor_params is not None:
+                 list_pipeline_params = [
+                     self._imputer_params,
+                     self._preprocessor_params,
+                     self._pca_params,
+                     self._classifier_params,
+                 ]
+             else:
+                 list_pipeline_params = [
+                     self._imputer_params,
+                     self._pca_params,
+                     self._classifier_params,
+                 ]
+         self._set_pipeline_params(list_pipeline_params)
+         return
+
+     def _set_pipeline_params(self, list_pipeline_params):
+         """
+         ---------
+         Arguments
+         ---------
+         list_pipeline_params : list
+             a list of dictionaries of pipeline params
+         """
+         final_pipeline_params = {}
+         for _index in range(len(list_pipeline_params)):
+             temp_pipeline_params = {
+                 **final_pipeline_params,
+                 **list_pipeline_params[_index],
+             }
+             final_pipeline_params = temp_pipeline_params
+         self.clf_pipeline_params = final_pipeline_params
+         return
+
+
+ def train_model(
+     water_pot_dataset,
+     imputer_type,
+     preprocessor_type,
+     transformer_type,
+     classifier_type,
+     is_pca=False,
+ ):
+     """
+     ---------
+     Arguments
+     ---------
+     water_pot_dataset : object
+         an object of type WaterPotabilityDataLoader class
+     imputer_type : str
+         a string indicating the imputer type to be used in the ML pipeline
+     preprocessor_type : str
+         a string indicating the preprocessor type to be used in the ML pipeline
+     transformer_type : str
+         a string indicating the additional transformer type to be used in the ML pipeline
+     classifier_type : str
+         a string indicating the classifier type to be used in the ML pipeline
+     is_pca : bool
+         a boolean indicating whether to use PCA or not in the ML pipeline
+     """
      # get data arrays from the data frame for train and test sets
-     X_train, Y_train = get_data_from_data_frame(df_train)
-     X_test, Y_test = get_data_from_data_frame(df_test)
-
-     # get imputer and its params
-     imputer, imputer_params = get_imputer(imputer_type)
-
-     # get classifier and its params
-     classifier, classifier_params = get_classifier(classifier_type)
-
-     # get the pipeline params
-     pipeline_params = get_pipeline_params(imputer_params, classifier_params)
-
-     print("\n" + "-"*100)
+     X_train, Y_train = water_pot_dataset.get_data_from_data_frame(which_set="train")
+     X_test, Y_test = water_pot_dataset.get_data_from_data_frame(which_set="test")
+
+     pca_str = "no_pca"
+     preprocessor_str = "no_preproc"
+     transformer_str = "no_transform"
+
+     clf_pipeline = ClassificationPipeline()
+
+     # set imputer and its params
+     clf_pipeline.set_imputer(imputer_type)
+
+     # set preprocessor and its params
+     if preprocessor_type != "none":
+         clf_pipeline.set_preprocessor(preprocessor_type)
+         preprocessor_str = preprocessor_type
+
+     # set the additional transformer if needed
+     if transformer_type != "none":
+         transformer_str = transformer_type
+         if transformer_type == "power_box_cox":
+             # range must be positive for box-cox transformation,
+             # so use min max scaler and make sure the range is proper
+             clf_pipeline.set_preprocessor("min_max")
+             clf_pipeline.set_transformer(transformer_type)
+             preprocessor_str = "min_max"
+         else:
+             # std scaler for yeo-johnson transformation yields better results
+             clf_pipeline.set_preprocessor("std")
+             clf_pipeline.set_transformer(transformer_type)
+             preprocessor_str = "std"
+
+     # to use PCA or not
+     if is_pca == True:
+         clf_pipeline.set_pca(X_train.shape[1])
+         pca_str = "pca"
+
+     # set classifier and its params
+     clf_pipeline.set_classifier(classifier_type)
+
+     print("\n" + "-" * 100)
      # build the model pipeline
-     if classifier_type == "svc" or classifier_type == "log_reg":
-         scaler = get_scaler()
-         pca, pca_params = get_pca(X_train.shape[1])
-         print(f"Started training the model with the imputer: {imputer_type}, preprocessing: std_scaler + pca, classifier: {classifier_type}")
-
-         pipeline = Pipeline([("imputer", imputer), ("scaler", scaler), ("pca", pca), ("classifier", classifier)])
-         pipeline_params = get_pipeline_params(pipeline_params, pca_params)
-     else:
-         print(f"Started training the model with the imputer: {imputer_type}, classifier: {classifier_type}")
-         pipeline = Pipeline([("imputer", imputer), ("classifier", classifier)])
+     clf_pipeline.build_pipeline()
+     print(clf_pipeline.clf_pipeline)
+
+     print("\n" + "-" * 100)
      print("Model pipeline params space: ")
-     print(pipeline_params)
-     print("-"*100)
+     print(clf_pipeline.clf_pipeline_params)
+     print("-" * 100)

      # setup grid search with k-fold cross validation
      k_fold_cv = KFold(n_splits=5, shuffle=True, random_state=4)
-     grid_cv = GridSearchCV(pipeline, pipeline_params, scoring="f1", cv=k_fold_cv)
+     grid_cv = GridSearchCV(
+         clf_pipeline.clf_pipeline,
+         clf_pipeline.clf_pipeline_params,
+         scoring="f1",
+         cv=k_fold_cv,
+     )
      grid_cv.fit(X_train, Y_train)

      # get the cross validation score and the params for the best estimator
@@ -167,15 +418,15 @@ def train_model(df_train, df_test, imputer_type, classifier_type):
      test_f1 = f1_score(Y_test, Y_test_pred)
      test_acc = accuracy_score(Y_test, Y_test_pred)

-     print("\n" + "-"*50)
+     print("\n" + "-" * 50)
      # begin mlflow logging for the best estimator
      mlflow.set_experiment("water_potability")
      experiment = mlflow.get_experiment_by_name("water_potability")
      print(f"Started mlflow logging for the best estimator")
+     model_log_str = f"{imputer_type}_{preprocessor_str}_{transformer_str}_{pca_str}_{classifier_type}"
      with mlflow.start_run(experiment_id=experiment.experiment_id):
          # log the model and the metrics
-         mlflow.sklearn.log_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
-         mlflow.sklearn.save_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
+         mlflow.sklearn.log_model(cv_best_estimator, model_log_str)
          mlflow.log_params(cv_best_params)
          mlflow.log_metric("cv_f1_score", cv_best_f1)
          mlflow.log_metric("train_f1_score", train_f1)
@@ -185,54 +436,99 @@ def train_model(df_train, df_test, imputer_type, classifier_type):
      # end mlflow logging
      mlflow.end_run()
      print(f"Completed mlflow logging for the best estimator")
-     print("-"*50)
+     print("-" * 50)
      return

+
  def init_and_train_model(ARGS):
-     df_csv = read_csv_file(ARGS.file_csv)
-     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
+     water_pot_dataset = WaterPotabilityDataLoader(ARGS.file_csv)
+     water_pot_dataset.read_csv_file()
+     water_pot_dataset.split_data()

-     num_samples_train = df_train.shape[0]
-     num_samples_test = df_test.shape[0]
+     num_samples_train = water_pot_dataset.df_train.shape[0]
+     num_samples_test = water_pot_dataset.df_test.shape[0]

-     print("\n" + "-"*40)
+     print("\n" + "-" * 40)
      print("Num samples after splitting the dataset")
-     print("-"*40)
+     print("-" * 40)
      print(f"train: {num_samples_train}, test: {num_samples_test}")

-     print("\n" + "-"*40)
+     print("\n" + "-" * 40)
      print("A few samples from train data")
-     print("-"*40)
-     print(df_train.head())
+     print("-" * 40)
+     print(water_pot_dataset.df_train.head())

      if ARGS.is_train:
-         train_model(df_train, df_test, ARGS.imputer_type, ARGS.classifier_type)
+         train_model(
+             water_pot_dataset,
+             ARGS.imputer_type,
+             ARGS.preprocessor_type,
+             ARGS.transformer_type,
+             ARGS.classifier_type,
+             bool(ARGS.is_pca),
+         )
      return

+
  def main():
      file_csv = "dataset/water_potability.csv"
      classifier_type = "ada_boost"
      imputer_type = "knn"
+     preprocessor_type = "none"
+     transformer_type = "none"
      is_train = 1
+     is_pca = 0

      parser = argparse.ArgumentParser(
          formatter_class=argparse.ArgumentDefaultsHelpFormatter
      )

-     parser.add_argument("--file_csv", default=file_csv,
-         type=str, help="full path to dataset csv file")
-     parser.add_argument("--is_train", default=is_train,
-         type=int, choices=[0, 1], help="to train or not")
-     parser.add_argument("--classifier_type", default=classifier_type,
-         type=str, choices=["ada_boost", "log_reg", "random_forest", "svc", "light_gbm"],
-         help="classifier to be used in the training model pipeline")
-     parser.add_argument("--imputer_type", default=imputer_type,
-         type=str, choices=["simple", "knn", "iterative"],
-         help="imputer to be used in the training model pipeline")
+     parser.add_argument(
+         "--file_csv", default=file_csv, type=str, help="full path to dataset csv file"
+     )
+     parser.add_argument(
+         "--is_train", default=is_train, type=int, choices=[0, 1], help="to train or not"
+     )
+     parser.add_argument(
+         "--classifier_type",
+         default=classifier_type,
+         type=str,
+         choices=["ada_boost", "log_reg", "random_forest", "svc", "light_gbm"],
+         help="classifier to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--imputer_type",
+         default=imputer_type,
+         type=str,
+         choices=["simple", "knn", "iterative"],
+         help="imputer to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--preprocessor_type",
+         default=preprocessor_type,
+         type=str,
+         choices=["none", "std", "min_max", "norm", "poly", "robust"],
+         help="preprocessor to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--transformer_type",
+         default=transformer_type,
+         type=str,
+         choices=["none", "power_box_cox", "power_yeo_johnson"],
+         help="additional transformer to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--is_pca",
+         default=is_pca,
+         type=int,
+         choices=[0, 1],
+         help="indicates if pca should be used in the ML model pipeline (0: False, 1: True)",
+     )

      ARGS, unparsed = parser.parse_known_args()
      init_and_train_model(ARGS)
      return

+
  if __name__ == "__main__":
      main()
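
With the new arguments, a grid-search run can be configured entirely from the command line; a sketch, assuming execution from the repository root:

    python modeling/ml_model_dev.py --imputer_type knn --classifier_type light_gbm --transformer_type power_yeo_johnson --is_pca 0

Note that train_model overrides --preprocessor_type whenever a power transformer is requested: min_max for box-cox and std for yeo-johnson, matching the comments in the code above.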
modeling/ml_model_test.py CHANGED
@@ -4,35 +4,44 @@ import mlflow
  import numpy as np
  from sklearn.metrics import classification_report

- from ml_model_dev import load_ml_model, train_test_split, read_csv_file
+ from data_utils import WaterPotabilityDataLoader
+ from ml_model_dev import load_mlflow_model
+

  def test_ml_pipeline(ARGS):
-     df_csv = read_csv_file(ARGS.file_csv)
-     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
-     arr_test = df_test.to_numpy()
-     X_test, Y_test = arr_test[:, :-1], arr_test[:, -1:].reshape(-1)
+     water_pot_dataset = WaterPotabilityDataLoader(ARGS.file_csv)
+     water_pot_dataset.read_csv_file()
+     water_pot_dataset.split_data()
+     X_test, Y_test = water_pot_dataset.get_data_from_data_frame(which_set="test")

-     model_pipeline = load_ml_model(ARGS.pkl_file_name)
+     model_pipeline = load_mlflow_model(ARGS.dir_mlflow_model)
      Y_pred_test = model_pipeline.predict(X_test)
      print(classification_report(Y_test, Y_pred_test))
      return

+
  def main():
      file_csv = "dataset/water_potability.csv"
-     pkl_file_name = "trained_models/knn_ada_boost"
+     dir_mlflow_model = "model_for_production"

      parser = argparse.ArgumentParser(
          formatter_class=argparse.ArgumentDefaultsHelpFormatter
      )

-     parser.add_argument("--file_csv", default=file_csv,
-         type=str, help="full path to dataset csv file")
-     parser.add_argument("--pkl_file_name", default=pkl_file_name,
-         type=str, help="full path to ml model pkl file")
+     parser.add_argument(
+         "--file_csv", default=file_csv, type=str, help="full path to dataset csv file"
+     )
+     parser.add_argument(
+         "--dir_mlflow_model",
+         default=dir_mlflow_model,
+         type=str,
+         help="full path to directory containing mlflow model",
+     )

      ARGS, unparsed = parser.parse_known_args()
      test_ml_pipeline(ARGS)
      return

+
  if __name__ == "__main__":
      main()
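
A sketch of testing a saved pipeline, assuming an mlflow model was previously saved under the default directory used here (model_for_production):

    python modeling/ml_model_test.py --file_csv dataset/water_potability.csv --dir_mlflow_model model_for_production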