abhishekrs4 committed
Commit 77b575f
1 Parent(s): 76a5f1b

updated scripts in the modeling module

modeling/__init__.py CHANGED
@@ -1 +1,3 @@
- import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+ import os, sys
+
+ sys.path.append(os.path.dirname(os.path.realpath(__file__)))
modeling/data_utils.py CHANGED
@@ -1,16 +1,72 @@
  import numpy as np
  import pandas as pd

- def read_csv_file(file_csv):
-     df_csv = pd.read_csv(file_csv)
-     return df_csv
+ from sklearn.model_selection import train_test_split
+
+
+ class WaterPotabilityDataLoader:
+     def __init__(self, file_csv, test_size=0.1, random_state=4):
+         self.file_csv = file_csv
+         self.test_size = test_size
+         self.random_state = random_state
+
+         self.df_csv = None
+         self.df_train = None
+         self.df_test = None
+         self.X_train = None
+         self.Y_train = None
+         self.X_test = None
+         self.Y_test = None
+
+     def read_csv_file(self):
+         self.df_csv = pd.read_csv(self.file_csv)
+         return
+
+     def split_data(self):
+         self.df_train, self.df_test = train_test_split(
+             self.df_csv, test_size=self.test_size, random_state=self.random_state
+         )
+         return
+
+     def get_data_from_data_frame(self, which_set="train"):
+         """
+         ---------
+         Arguments
+         ---------
+         which_set : str
+             a string indicating for which set the data arrays should be returned
+
+         -------
+         Returns
+         -------
+         (X_arr, Y_arr) : tuple
+             a tuple of numpy arrays of features and labels for the appropriate set
+         """
+         if which_set == "train":
+             data_frame = self.df_train
+         else:
+             data_frame = self.df_test
+         arr = data_frame.to_numpy()
+         X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
+         return X_arr, Y_arr
+

  def get_dict_nan_counts_per_col(data_frame):
+     """
+     ---------
+     Arguments
+     ---------
+     data_frame : pd.DataFrame
+         a pandas dataframe of some dataset
+
+     -------
+     Returns
+     -------
+     dict_nan_counts_per_col : dict
+         a dictionary of NaN counts per column
+     """
      dict_nan_counts_per_col = data_frame.isna().sum().to_dict()
-     dict_nan_counts_per_col = dict(sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True))
+     dict_nan_counts_per_col = dict(
+         sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True)
+     )
      return dict_nan_counts_per_col
-
- def get_data_from_data_frame(data_frame):
-     arr = data_frame.to_numpy()
-     X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
-     return X_arr, Y_arr
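
A minimal usage sketch for the new loader, assuming the repo-default CSV path and the method behavior defined in the diff above:

    from data_utils import WaterPotabilityDataLoader

    loader = WaterPotabilityDataLoader("dataset/water_potability.csv")
    loader.read_csv_file()   # populates loader.df_csv
    loader.split_data()      # populates loader.df_train and loader.df_test
    X_train, Y_train = loader.get_data_from_data_frame(which_set="train")
    X_test, Y_test = loader.get_data_from_data_frame(which_set="test")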
modeling/eda.py CHANGED
@@ -6,10 +6,13 @@ import numpy as np
  import seaborn as sns
  import matplotlib.pyplot as plt

- from data_utils import read_csv_file, get_data_from_data_frame
+ from data_utils import WaterPotabilityDataLoader
+

  def do_eda(ARGS):
-     data_frame = read_csv_file(ARGS.file_csv)
+     water_pot_dataset = WaterPotabilityDataLoader(ARGS.file_csv)
+     water_pot_dataset.read_csv_file()
+     data_frame = water_pot_dataset.df_csv
      label_counts = dict(data_frame[ARGS.target_column].value_counts())
      # print(label_counts)

@@ -54,6 +57,7 @@ def do_eda(ARGS):

      return

+
  def main():
      file_csv = "dataset/water_potability.csv"
      target_column = "Potability"
@@ -61,13 +65,19 @@ def main():
      parser = argparse.ArgumentParser(
          formatter_class=argparse.ArgumentDefaultsHelpFormatter
      )
-     parser.add_argument("--file_csv", default=file_csv,
-         type=str, help="full path to dataset csv file")
-     parser.add_argument("--target_column", default=target_column,
-         type=str, help="target label for which the EDA needs to be done")
+     parser.add_argument(
+         "--file_csv", default=file_csv, type=str, help="full path to dataset csv file"
+     )
+     parser.add_argument(
+         "--target_column",
+         default=target_column,
+         type=str,
+         help="target label for which the EDA needs to be done",
+     )
      ARGS, unparsed = parser.parse_known_args()
      do_eda(ARGS)
      return

+
  if __name__ == "__main__":
      main()
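
A sketch of invoking the updated EDA script, assuming it is run from the repository root (the defaults already point at the water potability dataset):

    python modeling/eda.py --file_csv dataset/water_potability.csv --target_column Potability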
modeling/ml_model_dev.py CHANGED
@@ -12,144 +12,395 @@ import lightgbm as lgbm
  from sklearn.svm import SVC
  from sklearn.decomposition import PCA
  from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import StandardScaler
  from sklearn.linear_model import LogisticRegression
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.metrics import accuracy_score, f1_score, make_scorer
  from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
  from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
- from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, KFold
-
- from data_utils import read_csv_file, get_data_from_data_frame
-
-
- def load_ml_model(pkl_file_name):
-     model_pipeline = mlflow.sklearn.load_model(pkl_file_name)
+ from sklearn.model_selection import (
+     cross_validate,
+     GridSearchCV,
+     KFold,
+ )
+ from sklearn.preprocessing import (
+     StandardScaler,
+     MinMaxScaler,
+     Normalizer,
+     PolynomialFeatures,
+     PowerTransformer,
+     RobustScaler,
+ )
+
+ from data_utils import WaterPotabilityDataLoader
+
+
+ def load_mlflow_model(dir_mlflow_model):
+     """
+     ---------
+     Arguments
+     ---------
+     dir_mlflow_model : str
+         full directory path of the mlflow model
+
+     -------
+     Returns
+     -------
+     model_pipeline : object
+         an object of the mlflow sklearn model pipeline
+     """
+     model_pipeline = mlflow.sklearn.load_model(dir_mlflow_model)
      return model_pipeline

- def get_imputer(imputer_type):
-     # setup parameter search space for different imputers
-     imputer, imputer_params = None, None
-     if imputer_type == "simple":
-         imputer = SimpleImputer()
-         imputer_params = {
-             "imputer__strategy": ["mean", "median", "most_frequent"],
-         }
-     elif imputer_type == "knn":
-         imputer = KNNImputer()
-         imputer_params = {
-             "imputer__n_neighbors": [5, 7],
-             "imputer__weights": ["uniform", "distance"],
-         }
-     elif imputer_type == "iterative":
-         imputer = IterativeImputer()
-         imputer_params = {
-             "imputer__initial_strategy": ["mean", "median", "most_frequent"],
-             "imputer__imputation_order": ["ascending", "descending"],
-         }
-     else:
-         print(f"unidentified option for arg, imputer_type: {imputer_type}")
-         sys.exit(0)
-     return imputer, imputer_params
-
- def get_scaler():
-     scaler = StandardScaler()
-     return scaler
-
- def get_pca(max_num_feats):
-     pca = PCA()
-     pca_params = {
-         "pca__n_components": np.arange(2, max_num_feats+1),
-     }
-     return pca, pca_params
-
- def get_classifier(classifier_type):
-     # setup parameter search space for different classifiers
-
-     classifier, classifier_params = None, None
-     if classifier_type == "ada_boost":
-         classifier = AdaBoostClassifier()
-         classifier_params = {
-             "classifier__learning_rate": [0.5, 1, 1.5, 2, 2.5, 3],
-             "classifier__n_estimators": [100, 200, 500],
-         }
-     elif classifier_type == "log_reg":
-         classifier = LogisticRegression(max_iter=200, solver="saga")
-         classifier_params = {
-             "classifier__penalty": ["l1", "l2", "elasticnet"],
-             "classifier__class_weight": [None, "balanced"],
-             "classifier__C": [0.1, 0.5, 1, 2],
-             "classifier__l1_ratio": np.arange(0.1, 1, 0.1),
-         }
-     elif classifier_type == "random_forest":
-         classifier = RandomForestClassifier()
-         classifier_params = {
-             "classifier__n_estimators": [100, 250],
-             "classifier__criterion": ["gini", "entropy"],
-             "classifier__max_depth": [None, 10, 25, 50, 75],
-             "classifier__min_samples_leaf": [1, 5, 10, 20],
-             "classifier__min_samples_split": [2, 3, 4, 5],
-         }
-     elif classifier_type == "svc":
-         classifier = SVC()
-         classifier_params = {
-             "classifier__C": [0.5, 1, 1.5, 2, 2.5],
-             "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
-             "classifier__degree": [2, 3, 4],
-         }
-     elif classifier_type == "light_gbm":
-         classifier = lgbm.LGBMClassifier(
-             boosting_type="gbdt", objective="binary", metric="auc", verbosity=-1)
-         classifier_params = {
-             "classifier__num_leaves": [15, 31, 63, 127, 255],
-             "classifier__learning_rate": [0.1, 0.5, 1, 2],
-             "classifier__n_estimators": [100, 500, 1000],
-             "classifier__reg_lambda": [0.1, 0.5, 1],
-             "classifier__min_data_in_leaf": [10, 20, 30, 50],
-         }
-     else:
-         print(f"unidentified option for arg, classifier_type: {classifier_type}")
-         sys.exit(0)
-
-     return classifier, classifier_params
-
- def get_pipeline_params(imputer_params, classifier_params):
-     pipeline_params = {**imputer_params, **classifier_params}
-     return pipeline_params
-
- def train_model(df_train, df_test, imputer_type, classifier_type):
+
+ class ClassificationPipeline:
+     def __init__(self):
+         self._imputer = None
+         self._imputer_params = None
+         self._preprocessor = None
+         self._preprocessor_params = None
+         self._transformer = None
+         self._pca = None
+         self._pca_params = None
+         self._classifier = None
+         self._classifier_params = None
+
+         self.clf_pipeline = None
+         self.clf_pipeline_params = None
+
+     def set_imputer(self, imputer_type):
+         """
+         ---------
+         Arguments
+         ---------
+         imputer_type : str
+             a string indicating the type of imputer to be used in the ML pipeline
+         """
+
+         # setup parameter search space for different imputers
+         imputer, imputer_params = None, None
+         if imputer_type == "simple":
+             imputer = SimpleImputer()
+             imputer_params = {
+                 "imputer__strategy": ["mean", "median", "most_frequent"],
+             }
+         elif imputer_type == "knn":
+             imputer = KNNImputer()
+             imputer_params = {
+                 "imputer__n_neighbors": [5, 7],
+                 "imputer__weights": ["uniform", "distance"],
+             }
+         elif imputer_type == "iterative":
+             imputer = IterativeImputer()
+             imputer_params = {
+                 "imputer__initial_strategy": ["mean", "median", "most_frequent"],
+                 "imputer__imputation_order": ["ascending", "descending"],
+             }
+         else:
+             print(f"unidentified option for arg, imputer_type: {imputer_type}")
+             sys.exit(0)
+
+         self._imputer = imputer
+         self._imputer_params = imputer_params
+         return
+
+     def set_preprocessor(self, preprocessor_type):
+         """
+         ---------
+         Arguments
+         ---------
+         preprocessor_type : str
+             a string indicating the type of preprocessor to be used in the ML pipeline
+         """
+         if preprocessor_type == "std":
+             preprocessor = StandardScaler()
+             preprocessor_params = None
+         elif preprocessor_type == "min_max":
+             # range must be positive for box-cox transformation,
+             # so use the min max scaler with a proper range;
+             # use the same range even without that transformation
+             preprocessor = MinMaxScaler(feature_range=(1, 2), clip=True)
+             preprocessor_params = None
+         elif preprocessor_type == "norm":
+             preprocessor = Normalizer()
+             preprocessor_params = {
+                 "preprocessor__norm": ["l1", "l2", "max"],
+             }
+         elif preprocessor_type == "poly":
+             preprocessor = PolynomialFeatures()
+             preprocessor_params = {
+                 "preprocessor__degree": [2],
+                 "preprocessor__interaction_only": [True, False],
+                 "preprocessor__include_bias": [True, False],
+             }
+         elif preprocessor_type == "robust":
+             preprocessor = RobustScaler()
+             preprocessor_params = None
+         else:
+             print(
+                 f"unidentified option for arg, preprocessor_type: {preprocessor_type}"
+             )
+             sys.exit(0)
+
+         self._preprocessor = preprocessor
+         self._preprocessor_params = preprocessor_params
+         return
+
+     def set_transformer(self, transformer_type):
+         """
+         ---------
+         Arguments
+         ---------
+         transformer_type : str
+             a string indicating the transformer type to be used in the ML pipeline
+         """
+         if transformer_type == "power_box_cox":
+             self._transformer = PowerTransformer(method="box-cox")
+         elif transformer_type == "power_yeo_johnson":
+             self._transformer = PowerTransformer(method="yeo-johnson")
+         else:
+             print(f"unidentified option for arg, transformer_type: {transformer_type}")
+             sys.exit(0)
+         return
+
+     def set_pca(self, max_num_feats):
+         """
+         ---------
+         Arguments
+         ---------
+         max_num_feats : int
+             an integer indicating the maximum number of features in the dataset
+         """
+         self._pca = PCA()
+         self._pca_params = {
+             "pca__n_components": np.arange(2, max_num_feats + 1),
+         }
+         return
+
+     def set_classifier(self, classifier_type):
+         """
+         ---------
+         Arguments
+         ---------
+         classifier_type : str
+             a string indicating the type of classifier to be used in the ML pipeline
+         """
+
+         # setup parameter search space for different classifiers
+         classifier, classifier_params = None, None
+         if classifier_type == "ada_boost":
+             classifier = AdaBoostClassifier()
+             classifier_params = {
+                 "classifier__learning_rate": [0.5, 1, 1.5, 2, 2.5, 3],
+                 "classifier__n_estimators": [100, 200, 500],
+             }
+         elif classifier_type == "log_reg":
+             classifier = LogisticRegression(max_iter=200, solver="saga")
+             classifier_params = {
+                 "classifier__penalty": ["l1", "l2", "elasticnet"],
+                 "classifier__class_weight": [None, "balanced"],
+                 "classifier__C": [0.1, 0.5, 1, 2],
+                 "classifier__l1_ratio": np.arange(0.1, 1, 0.1),
+             }
+         elif classifier_type == "random_forest":
+             classifier = RandomForestClassifier()
+             classifier_params = {
+                 "classifier__n_estimators": [100, 250],
+                 "classifier__criterion": ["gini", "entropy"],
+                 "classifier__max_depth": [None, 10, 25, 50, 75],
+                 "classifier__min_samples_leaf": [1, 5, 10, 20],
+                 "classifier__min_samples_split": [2, 3, 4, 5],
+             }
+         elif classifier_type == "svc":
+             classifier = SVC()
+             classifier_params = {
+                 "classifier__C": [0.5, 1, 1.5, 2, 2.5],
+                 "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
+                 "classifier__degree": [2, 3, 4],
+             }
+         elif classifier_type == "light_gbm":
+             classifier = lgbm.LGBMClassifier(
+                 boosting_type="gbdt", objective="binary", metric="auc", verbosity=-1
+             )
+             classifier_params = {
+                 "classifier__num_leaves": [15, 31, 63, 127, 255],
+                 "classifier__learning_rate": [0.1, 0.5, 1, 2],
+                 "classifier__n_estimators": [100, 500, 1000],
+                 "classifier__reg_lambda": [0.1, 0.5, 1],
+                 "classifier__min_data_in_leaf": [10, 20, 30, 50],
+             }
+         else:
+             print(f"unidentified option for arg, classifier_type: {classifier_type}")
+             sys.exit(0)
+
+         self._classifier = classifier
+         self._classifier_params = classifier_params
+         return
+
+     def build_pipeline(self):
+         if self._pca == None:
+             if self._preprocessor == None:
+                 self.clf_pipeline = Pipeline(
+                     [("imputer", self._imputer), ("classifier", self._classifier)]
+                 )
+                 list_pipeline_params = [self._imputer_params, self._classifier_params]
+             else:
+                 if self._transformer == None:
+                     self.clf_pipeline = Pipeline(
+                         [
+                             ("imputer", self._imputer),
+                             ("preprocessor", self._preprocessor),
+                             ("classifier", self._classifier),
+                         ]
+                     )
+                 else:
+                     self.clf_pipeline = Pipeline(
+                         [
+                             ("imputer", self._imputer),
+                             ("preprocessor", self._preprocessor),
+                             ("transformer", self._transformer),
+                             ("classifier", self._classifier),
+                         ]
+                     )
+                 if self._preprocessor_params is not None:
+                     list_pipeline_params = [
+                         self._imputer_params,
+                         self._preprocessor_params,
+                         self._classifier_params,
+                     ]
+                 else:
+                     list_pipeline_params = [
+                         self._imputer_params,
+                         self._classifier_params,
+                     ]
+         else:
+             # preprocessing is a must for applying PCA
+             self.clf_pipeline = Pipeline(
+                 [
+                     ("imputer", self._imputer),
+                     ("preprocessor", self._preprocessor),
+                     ("pca", self._pca),
+                     ("classifier", self._classifier),
+                 ]
+             )
+             if self._preprocessor_params is not None:
+                 list_pipeline_params = [
+                     self._imputer_params,
+                     self._preprocessor_params,
+                     self._pca_params,
+                     self._classifier_params,
+                 ]
+             else:
+                 list_pipeline_params = [
+                     self._imputer_params,
+                     self._pca_params,
+                     self._classifier_params,
+                 ]
+         self._set_pipeline_params(list_pipeline_params)
+         return
+
+     def _set_pipeline_params(self, list_pipeline_params):
+         """
+         ---------
+         Arguments
+         ---------
+         list_pipeline_params : list
+             a list of dictionaries of pipeline params
+         """
+         final_pipeline_params = {}
+         for _index in range(len(list_pipeline_params)):
+             temp_pipeline_params = {
+                 **final_pipeline_params,
+                 **list_pipeline_params[_index],
+             }
+             final_pipeline_params = temp_pipeline_params
+         self.clf_pipeline_params = final_pipeline_params
+         return
+
+
+ def train_model(
+     water_pot_dataset,
+     imputer_type,
+     preprocessor_type,
+     transformer_type,
+     classifier_type,
+     is_pca=False,
+ ):
+     """
+     ---------
+     Arguments
+     ---------
+     water_pot_dataset : object
+         an object of type WaterPotabilityDataLoader class
+     imputer_type : str
+         a string indicating the imputer type to be used in the ML pipeline
+     preprocessor_type : str
+         a string indicating the preprocessor type to be used in the ML pipeline
+     transformer_type : str
+         a string indicating the additional transformer type to be used in the ML pipeline
+     classifier_type : str
+         a string indicating the classifier type to be used in the ML pipeline
+     is_pca : bool
+         a boolean indicating whether to use PCA or not in the ML pipeline
+     """
      # get data arrays from the data frame for train and test sets
-     X_train, Y_train = get_data_from_data_frame(df_train)
-     X_test, Y_test = get_data_from_data_frame(df_test)
-
-     # get imputer and its params
-     imputer, imputer_params = get_imputer(imputer_type)
-
-     # get classifier and its params
-     classifier, classifier_params = get_classifier(classifier_type)
-
-     # get the pipeline params
-     pipeline_params = get_pipeline_params(imputer_params, classifier_params)
-
-     print("\n" + "-"*100)
+     X_train, Y_train = water_pot_dataset.get_data_from_data_frame(which_set="train")
+     X_test, Y_test = water_pot_dataset.get_data_from_data_frame(which_set="test")
+
+     pca_str = "no_pca"
+     preprocessor_str = "no_preproc"
+     transformer_str = "no_transform"
+
+     clf_pipeline = ClassificationPipeline()
+
+     # set imputer and its params
+     clf_pipeline.set_imputer(imputer_type)
+
+     # set preprocessor and its params
+     if preprocessor_type != "none":
+         clf_pipeline.set_preprocessor(preprocessor_type)
+         preprocessor_str = preprocessor_type
+
+     # set the additional transformer if needed
+     if transformer_type != "none":
+         transformer_str = transformer_type
+         if transformer_type == "power_box_cox":
+             # range must be positive for box-cox transformation,
+             # so use min max scaler and make sure the range is proper
+             clf_pipeline.set_preprocessor("min_max")
+             clf_pipeline.set_transformer(transformer_type)
+             preprocessor_str = "min_max"
+         else:
+             # std scaler for yeo-johnson transformation yields better results
+             clf_pipeline.set_preprocessor("std")
+             clf_pipeline.set_transformer(transformer_type)
+             preprocessor_str = "std"
+
+     # to use PCA or not
+     if is_pca == True:
+         clf_pipeline.set_pca(X_train.shape[1])
+         pca_str = "pca"
+
+     # set classifier and its params
+     clf_pipeline.set_classifier(classifier_type)
+
+     print("\n" + "-" * 100)
      # build the model pipeline
-     if classifier_type == "svc" or classifier_type == "log_reg":
-         scaler = get_scaler()
-         pca, pca_params = get_pca(X_train.shape[1])
-         print(f"Started training the model with the imputer: {imputer_type}, preprocessing: std_scaler + pca, classifier: {classifier_type}")
-
-         pipeline = Pipeline([("imputer", imputer), ("scaler", scaler), ("pca", pca), ("classifier", classifier)])
-         pipeline_params = get_pipeline_params(pipeline_params, pca_params)
-     else:
-         print(f"Started training the model with the imputer: {imputer_type}, classifier: {classifier_type}")
-         pipeline = Pipeline([("imputer", imputer), ("classifier", classifier)])
+     clf_pipeline.build_pipeline()
+     print(clf_pipeline.clf_pipeline)
+
+     print("\n" + "-" * 100)
      print("Model pipeline params space: ")
-     print(pipeline_params)
-     print("-"*100)
+     print(clf_pipeline.clf_pipeline_params)
+     print("-" * 100)

      # setup grid search with k-fold cross validation
      k_fold_cv = KFold(n_splits=5, shuffle=True, random_state=4)
-     grid_cv = GridSearchCV(pipeline, pipeline_params, scoring="f1", cv=k_fold_cv)
+     grid_cv = GridSearchCV(
+         clf_pipeline.clf_pipeline,
+         clf_pipeline.clf_pipeline_params,
+         scoring="f1",
+         cv=k_fold_cv,
+     )
      grid_cv.fit(X_train, Y_train)

      # get the cross validation score and the params for the best estimator
@@ -167,15 +418,15 @@ def train_model(df_train, df_test, imputer_type, classifier_type):
      test_f1 = f1_score(Y_test, Y_test_pred)
      test_acc = accuracy_score(Y_test, Y_test_pred)

-     print("\n" + "-"*50)
+     print("\n" + "-" * 50)
      # begin mlflow logging for the best estimator
      mlflow.set_experiment("water_potability")
      experiment = mlflow.get_experiment_by_name("water_potability")
      print(f"Started mlflow logging for the best estimator")
+     model_log_str = f"{imputer_type}_{preprocessor_str}_{transformer_str}_{pca_str}_{classifier_type}"
      with mlflow.start_run(experiment_id=experiment.experiment_id):
          # log the model and the metrics
-         mlflow.sklearn.log_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
-         mlflow.sklearn.save_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
+         mlflow.sklearn.log_model(cv_best_estimator, model_log_str)
          mlflow.log_params(cv_best_params)
          mlflow.log_metric("cv_f1_score", cv_best_f1)
          mlflow.log_metric("train_f1_score", train_f1)
@@ -185,54 +436,99 @@ def train_model(df_train, df_test, imputer_type, classifier_type):
      # end mlflow logging
      mlflow.end_run()
      print(f"Completed mlflow logging for the best estimator")
-     print("-"*50)
+     print("-" * 50)
      return

+
  def init_and_train_model(ARGS):
-     df_csv = read_csv_file(ARGS.file_csv)
-     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
+     water_pot_dataset = WaterPotabilityDataLoader(ARGS.file_csv)
+     water_pot_dataset.read_csv_file()
+     water_pot_dataset.split_data()

-     num_samples_train = df_train.shape[0]
-     num_samples_test = df_test.shape[0]
+     num_samples_train = water_pot_dataset.df_train.shape[0]
+     num_samples_test = water_pot_dataset.df_test.shape[0]

-     print("\n" + "-"*40)
+     print("\n" + "-" * 40)
      print("Num samples after splitting the dataset")
-     print("-"*40)
+     print("-" * 40)
      print(f"train: {num_samples_train}, test: {num_samples_test}")

-     print("\n" + "-"*40)
+     print("\n" + "-" * 40)
      print("A few samples from train data")
-     print("-"*40)
-     print(df_train.head())
+     print("-" * 40)
+     print(water_pot_dataset.df_train.head())

      if ARGS.is_train:
-         train_model(df_train, df_test, ARGS.imputer_type, ARGS.classifier_type)
+         train_model(
+             water_pot_dataset,
+             ARGS.imputer_type,
+             ARGS.preprocessor_type,
+             ARGS.transformer_type,
+             ARGS.classifier_type,
+             bool(ARGS.is_pca),
+         )
      return

+
  def main():
      file_csv = "dataset/water_potability.csv"
      classifier_type = "ada_boost"
      imputer_type = "knn"
+     preprocessor_type = "none"
+     transformer_type = "none"
      is_train = 1
+     is_pca = 0

      parser = argparse.ArgumentParser(
          formatter_class=argparse.ArgumentDefaultsHelpFormatter
      )

-     parser.add_argument("--file_csv", default=file_csv,
-         type=str, help="full path to dataset csv file")
-     parser.add_argument("--is_train", default=is_train,
-         type=int, choices=[0, 1], help="to train or not")
-     parser.add_argument("--classifier_type", default=classifier_type,
-         type=str, choices=["ada_boost", "log_reg", "random_forest", "svc", "light_gbm"],
-         help="classifier to be used in the training model pipeline")
-     parser.add_argument("--imputer_type", default=imputer_type,
-         type=str, choices=["simple", "knn", "iterative"],
-         help="imputer to be used in the training model pipeline")
+     parser.add_argument(
+         "--file_csv", default=file_csv, type=str, help="full path to dataset csv file"
+     )
+     parser.add_argument(
+         "--is_train", default=is_train, type=int, choices=[0, 1], help="to train or not"
+     )
+     parser.add_argument(
+         "--classifier_type",
+         default=classifier_type,
+         type=str,
+         choices=["ada_boost", "log_reg", "random_forest", "svc", "light_gbm"],
+         help="classifier to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--imputer_type",
+         default=imputer_type,
+         type=str,
+         choices=["simple", "knn", "iterative"],
+         help="imputer to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--preprocessor_type",
+         default=preprocessor_type,
+         type=str,
+         choices=["none", "std", "min_max", "norm", "poly", "robust"],
+         help="preprocessor to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--transformer_type",
+         default=transformer_type,
+         type=str,
+         choices=["none", "power_box_cox", "power_yeo_johnson"],
+         help="additional transformer to be used in the ML model pipeline",
+     )
+     parser.add_argument(
+         "--is_pca",
+         default=is_pca,
+         type=int,
+         choices=[0, 1],
+         help="indicates if pca should be used in the ML model pipeline (0: False, 1: True)",
+     )

      ARGS, unparsed = parser.parse_known_args()
      init_and_train_model(ARGS)
      return

+
  if __name__ == "__main__":
      main()
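
With the new arguments, a grid-search run can be configured entirely from the command line; a sketch, assuming execution from the repository root:

    python modeling/ml_model_dev.py --imputer_type knn --classifier_type light_gbm --transformer_type power_yeo_johnson --is_pca 0

Note that train_model overrides --preprocessor_type whenever a power transformer is requested: min_max for box-cox and std for yeo-johnson, matching the comments in the code above.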
modeling/ml_model_test.py CHANGED
@@ -4,35 +4,44 @@ import mlflow
  import numpy as np
  from sklearn.metrics import classification_report

- from ml_model_dev import load_ml_model, train_test_split, read_csv_file
+ from data_utils import WaterPotabilityDataLoader
+ from ml_model_dev import load_mlflow_model
+

  def test_ml_pipeline(ARGS):
-     df_csv = read_csv_file(ARGS.file_csv)
-     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
-     arr_test = df_test.to_numpy()
-     X_test, Y_test = arr_test[:, :-1], arr_test[:, -1:].reshape(-1)
+     water_pot_dataset = WaterPotabilityDataLoader(ARGS.file_csv)
+     water_pot_dataset.read_csv_file()
+     water_pot_dataset.split_data()
+     X_test, Y_test = water_pot_dataset.get_data_from_data_frame(which_set="test")

-     model_pipeline = load_ml_model(ARGS.pkl_file_name)
+     model_pipeline = load_mlflow_model(ARGS.dir_mlflow_model)
      Y_pred_test = model_pipeline.predict(X_test)
      print(classification_report(Y_test, Y_pred_test))
      return

+
  def main():
      file_csv = "dataset/water_potability.csv"
-     pkl_file_name = "trained_models/knn_ada_boost"
+     dir_mlflow_model = "model_for_production"

      parser = argparse.ArgumentParser(
          formatter_class=argparse.ArgumentDefaultsHelpFormatter
      )

-     parser.add_argument("--file_csv", default=file_csv,
-         type=str, help="full path to dataset csv file")
-     parser.add_argument("--pkl_file_name", default=pkl_file_name,
-         type=str, help="full path to ml model pkl file")
+     parser.add_argument(
+         "--file_csv", default=file_csv, type=str, help="full path to dataset csv file"
+     )
+     parser.add_argument(
+         "--dir_mlflow_model",
+         default=dir_mlflow_model,
+         type=str,
+         help="full path to directory containing mlflow model",
+     )

      ARGS, unparsed = parser.parse_known_args()
      test_ml_pipeline(ARGS)
      return

+
  if __name__ == "__main__":
      main()
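
A sketch of testing a saved pipeline, assuming an mlflow model was previously saved under the default directory used here (model_for_production):

    python modeling/ml_model_test.py --file_csv dataset/water_potability.csv --dir_mlflow_model model_for_production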