abhishekrs4 committed
Commit
5fbf3c7
1 Parent(s): 5f32fa0

added modeling module

modeling/__init__.py ADDED
@@ -0,0 +1 @@
+ import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
modeling/data_utils.py ADDED
@@ -0,0 +1,16 @@
+ import numpy as np
+ import pandas as pd
+
+ def read_csv_file(file_csv):
+     df_csv = pd.read_csv(file_csv)
+     return df_csv
+
+ def get_dict_nan_counts_per_col(data_frame):
+     dict_nan_counts_per_col = data_frame.isna().sum().to_dict()
+     dict_nan_counts_per_col = dict(sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True))
+     return dict_nan_counts_per_col
+
+ def get_data_from_data_frame(data_frame):
+     arr = data_frame.to_numpy()
+     X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
+     return X_arr, Y_arr
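
A minimal usage sketch for these helpers, assuming the repository root as the working directory and the dataset at dataset/water_potability.csv (the target is the last column):

import modeling  # __init__.py puts modeling/ on sys.path, so the sibling modules import by bare name
from data_utils import read_csv_file, get_dict_nan_counts_per_col, get_data_from_data_frame

df = read_csv_file("dataset/water_potability.csv")
print(get_dict_nan_counts_per_col(df))   # NaN counts per column, sorted in descending order
X, Y = get_data_from_data_frame(df)      # features = all columns except the last, target = last column
print(X.shape, Y.shape)
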
modeling/eda.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import sys
+ import argparse
+
+ import numpy as np
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+ from data_utils import read_csv_file, get_data_from_data_frame
+
+ def do_eda(ARGS):
+     data_frame = read_csv_file(ARGS.file_csv)
+     label_counts = dict(data_frame[ARGS.target_column].value_counts())
+     # print(label_counts)
+
+     # plot a histogram
+     plt.figure(figsize=(12, 12))
+     plt.bar([str(l) for l in label_counts.keys()], label_counts.values(), width=0.5)
+     plt.xlabel(f"{ARGS.target_column}", fontsize=20)
+     plt.ylabel("Number of samples", fontsize=20)
+     plt.title("Distribution of samples in the dataset", fontsize=20)
+     plt.grid()
+     plt.xticks(fontsize=20)
+     plt.yticks(fontsize=20)
+     plt.show()
+
+     """
+     feat_cols = data_frame.columns[:-1]
+     num_feat_cols = len(feat_cols)
+
+     fig, axs = plt.subplots(num_feat_cols)
+     fig.suptitle("Distribution of features")
+     #axs.set_xlabel(ARGS.target_column)
+
+     for col_index in range(num_feat_cols):
+         column = feat_cols[col_index]
+         not_nan_indices = list(data_frame[column].notna())
+         lbl_with_not_nans = data_frame[ARGS.target_column][not_nan_indices]
+         col_with_not_nans = data_frame[column][not_nan_indices]
+         print(column, len(lbl_with_not_nans), len(col_with_not_nans))
+
+         axs[col_index].scatter(lbl_with_not_nans, col_with_not_nans)
+         axs[col_index].set(ylabel=column)
+     plt.show()
+     """
+
+     plt.figure()
+     corr_mat = data_frame.corr()
+     sns.heatmap(corr_mat)
+     plt.title("Feature correlation matrix", fontsize=20)
+     plt.xticks(fontsize=20)
+     plt.yticks(fontsize=20)
+     plt.show()
+
+     return
+
+ def main():
+     file_csv = "dataset/water_potability.csv"
+     target_column = "Potability"
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+     parser.add_argument("--file_csv", default=file_csv,
+         type=str, help="full path to dataset csv file")
+     parser.add_argument("--target_column", default=target_column,
+         type=str, help="target label for which the EDA needs to be done")
+     ARGS, unparsed = parser.parse_known_args()
+     do_eda(ARGS)
+     return
+
+ if __name__ == "__main__":
+     main()
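
The script above is intended to be run as python modeling/eda.py; it can also be driven programmatically by passing an argparse.Namespace. A sketch under the same default paths (the plots open in matplotlib windows via plt.show()):

import argparse
from modeling import eda  # importing the package first puts modeling/ on sys.path for the sibling imports

args = argparse.Namespace(
    file_csv="dataset/water_potability.csv",
    target_column="Potability",
)
eda.do_eda(args)  # class-balance bar chart, then the feature-correlation heatmap
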
modeling/ml_model_dev.py ADDED
@@ -0,0 +1,238 @@
+ import os
+ import sys
+ import joblib
+ import argparse
+ import collections
+
+ import mlflow
+ import numpy as np
+
+ import lightgbm as lgbm
+
+ from sklearn.svm import SVC
+ from sklearn.decomposition import PCA
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.metrics import accuracy_score, f1_score, make_scorer
+ from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+ from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, KFold
+
+ from data_utils import read_csv_file, get_data_from_data_frame
+
+
+ def load_ml_model(pkl_file_name):
+     model_pipeline = mlflow.sklearn.load_model(pkl_file_name)
+     return model_pipeline
+
+ def get_imputer(imputer_type):
+     # setup parameter search space for different imputers
+     imputer, imputer_params = None, None
+     if imputer_type == "simple":
+         imputer = SimpleImputer()
+         imputer_params = {
+             "imputer__strategy": ["mean", "median", "most_frequent"],
+         }
+     elif imputer_type == "knn":
+         imputer = KNNImputer()
+         imputer_params = {
+             "imputer__n_neighbors": [5, 7],
+             "imputer__weights": ["uniform", "distance"],
+         }
+     elif imputer_type == "iterative":
+         imputer = IterativeImputer()
+         imputer_params = {
+             "imputer__initial_strategy": ["mean", "median", "most_frequent"],
+             "imputer__imputation_order": ["ascending", "descending"],
+         }
+     else:
+         print(f"unidentified option for arg, imputer_type: {imputer_type}")
+         sys.exit(0)
+     return imputer, imputer_params
+
+ def get_scaler():
+     scaler = StandardScaler()
+     return scaler
+
+ def get_pca(max_num_feats):
+     pca = PCA()
+     pca_params = {
+         "pca__n_components": np.arange(2, max_num_feats+1),
+     }
+     return pca, pca_params
+
+ def get_classifier(classifier_type):
+     # setup parameter search space for different classifiers
+
+     classifier, classifier_params = None, None
+     if classifier_type == "ada_boost":
+         classifier = AdaBoostClassifier()
+         classifier_params = {
+             "classifier__learning_rate": [0.5, 1, 1.5, 2, 2.5, 3],
+             "classifier__n_estimators": [100, 200, 500],
+         }
+     elif classifier_type == "log_reg":
+         classifier = LogisticRegression(max_iter=200, solver="saga")
+         classifier_params = {
+             "classifier__penalty": ["l1", "l2", "elasticnet"],
+             "classifier__class_weight": [None, "balanced"],
+             "classifier__C": [0.1, 0.5, 1, 2],
+             "classifier__l1_ratio": np.arange(0.1, 1, 0.1),
+         }
+     elif classifier_type == "random_forest":
+         classifier = RandomForestClassifier()
+         classifier_params = {
+             "classifier__n_estimators": [100, 250],
+             "classifier__criterion": ["gini", "entropy"],
+             "classifier__max_depth": [None, 10, 25, 50, 75],
+             "classifier__min_samples_leaf": [1, 5, 10, 20],
+             "classifier__min_samples_split": [2, 3, 4, 5],
+         }
+     elif classifier_type == "svc":
+         classifier = SVC()
+         classifier_params = {
+             "classifier__C": [0.5, 1, 1.5, 2, 2.5],
+             "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
+             "classifier__degree": [2, 3, 4],
+         }
+     elif classifier_type == "light_gbm":
+         classifier = lgbm.LGBMClassifier(
+             boosting_type="gbdt", objective="binary", metric="auc", verbosity=-1)
+         classifier_params = {
+             "classifier__num_leaves": [15, 31, 63, 127, 255],
+             "classifier__learning_rate": [0.1, 0.5, 1, 2],
+             "classifier__n_estimators": [100, 500, 1000],
+             "classifier__reg_lambda": [0.1, 0.5, 1],
+             "classifier__min_data_in_leaf": [10, 20, 30, 50],
+         }
+     else:
+         print(f"unidentified option for arg, classifier_type: {classifier_type}")
+         sys.exit(0)
+
+     return classifier, classifier_params
+
+ def get_pipeline_params(imputer_params, classifier_params):
+     pipeline_params = {**imputer_params, **classifier_params}
+     return pipeline_params
+
+ def train_model(df_train, df_test, imputer_type, classifier_type):
+     # get data arrays from the data frame for train and test sets
+     X_train, Y_train = get_data_from_data_frame(df_train)
+     X_test, Y_test = get_data_from_data_frame(df_test)
+
+     # get imputer and its params
+     imputer, imputer_params = get_imputer(imputer_type)
+
+     # get classifier and its params
+     classifier, classifier_params = get_classifier(classifier_type)
+
+     # get the pipeline params
+     pipeline_params = get_pipeline_params(imputer_params, classifier_params)
+
+     print("\n" + "-"*100)
+     # build the model pipeline
+     if classifier_type == "svc" or classifier_type == "log_reg":
+         scaler = get_scaler()
+         pca, pca_params = get_pca(X_train.shape[1])
+         print(f"Started training the model with the imputer: {imputer_type}, preprocessing: std_scaler + pca, classifier: {classifier_type}")
+
+         pipeline = Pipeline([("imputer", imputer), ("scaler", scaler), ("pca", pca), ("classifier", classifier)])
+         pipeline_params = get_pipeline_params(pipeline_params, pca_params)
+     else:
+         print(f"Started training the model with the imputer: {imputer_type}, classifier: {classifier_type}")
+         pipeline = Pipeline([("imputer", imputer), ("classifier", classifier)])
+     print("Model pipeline params space: ")
+     print(pipeline_params)
+     print("-"*100)
+
+     # setup grid search with k-fold cross validation
+     k_fold_cv = KFold(n_splits=5, shuffle=True, random_state=4)
+     grid_cv = GridSearchCV(pipeline, pipeline_params, scoring="f1", cv=k_fold_cv)
+     grid_cv.fit(X_train, Y_train)
+
+     # get the cross validation score and the params for the best estimator
+     cv_best_estimator = grid_cv.best_estimator_
+     cv_best_f1 = grid_cv.best_score_
+     cv_best_params = grid_cv.best_params_
+
+     # predict and compute train set metrics
+     Y_train_pred = cv_best_estimator.predict(X_train)
+     train_f1 = f1_score(Y_train, Y_train_pred)
+     train_acc = accuracy_score(Y_train, Y_train_pred)
+
+     # predict and compute test set metrics
+     Y_test_pred = cv_best_estimator.predict(X_test)
+     test_f1 = f1_score(Y_test, Y_test_pred)
+     test_acc = accuracy_score(Y_test, Y_test_pred)
+
+     print("\n" + "-"*50)
+     # begin mlflow logging for the best estimator
+     mlflow.set_experiment("water_potability")
+     experiment = mlflow.get_experiment_by_name("water_potability")
+     print(f"Started mlflow logging for the best estimator")
+     with mlflow.start_run(experiment_id=experiment.experiment_id):
+         # log the model and the metrics
+         mlflow.sklearn.log_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
+         mlflow.sklearn.save_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
+         mlflow.log_params(cv_best_params)
+         mlflow.log_metric("cv_f1_score", cv_best_f1)
+         mlflow.log_metric("train_f1_score", train_f1)
+         mlflow.log_metric("train_acc_score", train_acc)
+         mlflow.log_metric("test_f1_score", test_f1)
+         mlflow.log_metric("test_acc_score", test_acc)
+         # end mlflow logging
+         mlflow.end_run()
+     print(f"Completed mlflow logging for the best estimator")
+     print("-"*50)
+     return
+
+ def init_and_train_model(ARGS):
+     df_csv = read_csv_file(ARGS.file_csv)
+     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
+
+     num_samples_train = df_train.shape[0]
+     num_samples_test = df_test.shape[0]
+
+     print("\n" + "-"*40)
+     print("Num samples after splitting the dataset")
+     print("-"*40)
+     print(f"train: {num_samples_train}, test: {num_samples_test}")
+
+     print("\n" + "-"*40)
+     print("A few samples from train data")
+     print("-"*40)
+     print(df_train.head())
+
+     if ARGS.is_train:
+         train_model(df_train, df_test, ARGS.imputer_type, ARGS.classifier_type)
+     return
+
+ def main():
+     file_csv = "dataset/water_potability.csv"
+     classifier_type = "ada_boost"
+     imputer_type = "knn"
+     is_train = 1
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument("--file_csv", default=file_csv,
+         type=str, help="full path to dataset csv file")
+     parser.add_argument("--is_train", default=is_train,
+         type=int, choices=[0, 1], help="to train or not")
+     parser.add_argument("--classifier_type", default=classifier_type,
+         type=str, choices=["ada_boost", "log_reg", "random_forest", "svc", "light_gbm"],
+         help="classifier to be used in the training model pipeline")
+     parser.add_argument("--imputer_type", default=imputer_type,
+         type=str, choices=["simple", "knn", "iterative"],
+         help="imputer to be used in the training model pipeline")
+
+     ARGS, unparsed = parser.parse_known_args()
+     init_and_train_model(ARGS)
+     return
+
+ if __name__ == "__main__":
+     main()
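
The parameter grids above follow scikit-learn's step__param naming convention: each key prefix must match a named step in the Pipeline so that GridSearchCV can route the candidate values to the right estimator. A self-contained sketch of the same pattern on synthetic data (illustrative only; the real script tunes the grids defined in get_imputer and get_classifier):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, KFold

# synthetic binary-classification data with roughly 10% missing values
rng = np.random.default_rng(4)
X = rng.normal(size=(200, 5))
X[rng.random(X.shape) < 0.1] = np.nan
Y = (rng.random(200) > 0.5).astype(int)

# step names ("imputer", "classifier") must match the param-grid key prefixes
pipeline = Pipeline([("imputer", KNNImputer()), ("classifier", AdaBoostClassifier())])
param_grid = {
    "imputer__n_neighbors": [5, 7],
    "classifier__n_estimators": [50, 100],
}
grid_cv = GridSearchCV(pipeline, param_grid, scoring="f1",
                       cv=KFold(n_splits=5, shuffle=True, random_state=4))
grid_cv.fit(X, Y)
print(grid_cv.best_params_, grid_cv.best_score_)
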
modeling/ml_model_test.py ADDED
@@ -0,0 +1,38 @@
+ import argparse
+
+ import mlflow
+ import numpy as np
+ from sklearn.metrics import classification_report
+
+ from ml_model_dev import load_ml_model, train_test_split, read_csv_file
+
+ def test_ml_pipeline(ARGS):
+     df_csv = read_csv_file(ARGS.file_csv)
+     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
+     arr_test = df_test.to_numpy()
+     X_test, Y_test = arr_test[:, :-1], arr_test[:, -1:].reshape(-1)
+
+     model_pipeline = load_ml_model(ARGS.pkl_file_name)
+     Y_pred_test = model_pipeline.predict(X_test)
+     print(classification_report(Y_test, Y_pred_test))
+     return
+
+ def main():
+     file_csv = "dataset/water_potability.csv"
+     pkl_file_name = "trained_models/knn_ada_boost"
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument("--file_csv", default=file_csv,
+         type=str, help="full path to dataset csv file")
+     parser.add_argument("--pkl_file_name", default=pkl_file_name,
+         type=str, help="full path to ml model pkl file")
+
+     ARGS, unparsed = parser.parse_known_args()
+     test_ml_pipeline(ARGS)
+     return
+
+ if __name__ == "__main__":
+     main()
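
Note that train_model saves the best estimator with mlflow.sklearn.save_model to a local directory named {imputer_type}_{classifier_type} (e.g. knn_ada_boost), while the default --pkl_file_name here is trained_models/knn_ada_boost; this assumes the saved model directory has been moved under trained_models/. A minimal load-and-predict sketch under that assumption (the feature values below are made up; NaNs would be handled by the pipeline's imputer step):

import numpy as np
import mlflow.sklearn

model_pipeline = mlflow.sklearn.load_model("trained_models/knn_ada_boost")  # hypothetical path

# one row with the nine feature columns of water_potability.csv, in dataset order
sample = np.array([[7.0, 200.0, 20000.0, 7.0, 330.0, 420.0, 14.0, 66.0, 4.0]])
print(model_pipeline.predict(sample))
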