abhishekrs4 committed
Commit
5fbf3c7
1 Parent(s): 5f32fa0

added modeling module

modeling/__init__.py ADDED
@@ -0,0 +1 @@
+ import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
modeling/data_utils.py ADDED
@@ -0,0 +1,16 @@
+ import numpy as np
+ import pandas as pd
+
+ def read_csv_file(file_csv):
+     df_csv = pd.read_csv(file_csv)
+     return df_csv
+
+ def get_dict_nan_counts_per_col(data_frame):
+     dict_nan_counts_per_col = data_frame.isna().sum().to_dict()
+     dict_nan_counts_per_col = dict(sorted(dict_nan_counts_per_col.items(), key=lambda kv: kv[1], reverse=True))
+     return dict_nan_counts_per_col
+
+ def get_data_from_data_frame(data_frame):
+     arr = data_frame.to_numpy()
+     X_arr, Y_arr = arr[:, :-1], arr[:, -1:].reshape(-1)
+     return X_arr, Y_arr
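
A minimal usage sketch for these helpers, assuming the repository root as the working directory and the dataset at dataset/water_potability.csv (the target is the last column):

import modeling  # __init__.py puts modeling/ on sys.path, so the sibling modules import by bare name
from data_utils import read_csv_file, get_dict_nan_counts_per_col, get_data_from_data_frame

df = read_csv_file("dataset/water_potability.csv")
print(get_dict_nan_counts_per_col(df))   # NaN counts per column, sorted in descending order
X, Y = get_data_from_data_frame(df)      # features = all columns except the last, target = last column
print(X.shape, Y.shape)
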
modeling/eda.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import sys
+ import argparse
+
+ import numpy as np
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+ from data_utils import read_csv_file, get_data_from_data_frame
+
+ def do_eda(ARGS):
+     data_frame = read_csv_file(ARGS.file_csv)
+     label_counts = dict(data_frame[ARGS.target_column].value_counts())
+     # print(label_counts)
+
+     # plot a histogram
+     plt.figure(figsize=(12, 12))
+     plt.bar([str(l) for l in label_counts.keys()], label_counts.values(), width=0.5)
+     plt.xlabel(f"{ARGS.target_column}", fontsize=20)
+     plt.ylabel("Number of samples", fontsize=20)
+     plt.title("Distribution of samples in the dataset", fontsize=20)
+     plt.grid()
+     plt.xticks(fontsize=20)
+     plt.yticks(fontsize=20)
+     plt.show()
+
+     """
+     feat_cols = data_frame.columns[:-1]
+     num_feat_cols = len(feat_cols)
+
+     fig, axs = plt.subplots(num_feat_cols)
+     fig.suptitle("Distribution of features")
+     #axs.set_xlabel(ARGS.target_column)
+
+     for col_index in range(num_feat_cols):
+         column = feat_cols[col_index]
+         not_nan_indices = list(data_frame[column].notna())
+         lbl_with_not_nans = data_frame[ARGS.target_column][not_nan_indices]
+         col_with_not_nans = data_frame[column][not_nan_indices]
+         print(column, len(lbl_with_not_nans), len(col_with_not_nans))
+
+         axs[col_index].scatter(lbl_with_not_nans, col_with_not_nans)
+         axs[col_index].set(ylabel=column)
+     plt.show()
+     """
+
+     plt.figure()
+     corr_mat = data_frame.corr()
+     sns.heatmap(corr_mat)
+     plt.title("Feature correlation matrix", fontsize=20)
+     plt.xticks(fontsize=20)
+     plt.yticks(fontsize=20)
+     plt.show()
+
+     return
+
+ def main():
+     file_csv = "dataset/water_potability.csv"
+     target_column = "Potability"
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+     parser.add_argument("--file_csv", default=file_csv,
+         type=str, help="full path to dataset csv file")
+     parser.add_argument("--target_column", default=target_column,
+         type=str, help="target label for which the EDA needs to be done")
+     ARGS, unparsed = parser.parse_known_args()
+     do_eda(ARGS)
+     return
+
+ if __name__ == "__main__":
+     main()
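
The script above is intended to be run as python modeling/eda.py; it can also be driven programmatically by passing an argparse.Namespace. A sketch under the same default paths (the plots open in matplotlib windows via plt.show()):

import argparse
from modeling import eda  # importing the package first puts modeling/ on sys.path for the sibling imports

args = argparse.Namespace(
    file_csv="dataset/water_potability.csv",
    target_column="Potability",
)
eda.do_eda(args)  # class-balance bar chart, then the feature-correlation heatmap
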
modeling/ml_model_dev.py ADDED
@@ -0,0 +1,238 @@
+ import os
+ import sys
+ import joblib
+ import argparse
+ import collections
+
+ import mlflow
+ import numpy as np
+
+ import lightgbm as lgbm
+
+ from sklearn.svm import SVC
+ from sklearn.decomposition import PCA
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.experimental import enable_iterative_imputer
+ from sklearn.metrics import accuracy_score, f1_score, make_scorer
+ from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
+ from sklearn.model_selection import cross_validate, train_test_split, GridSearchCV, KFold
+
+ from data_utils import read_csv_file, get_data_from_data_frame
+
+
+ def load_ml_model(pkl_file_name):
+     model_pipeline = mlflow.sklearn.load_model(pkl_file_name)
+     return model_pipeline
+
+ def get_imputer(imputer_type):
+     # setup parameter search space for different imputers
+     imputer, imputer_params = None, None
+     if imputer_type == "simple":
+         imputer = SimpleImputer()
+         imputer_params = {
+             "imputer__strategy": ["mean", "median", "most_frequent"],
+         }
+     elif imputer_type == "knn":
+         imputer = KNNImputer()
+         imputer_params = {
+             "imputer__n_neighbors": [5, 7],
+             "imputer__weights": ["uniform", "distance"],
+         }
+     elif imputer_type == "iterative":
+         imputer = IterativeImputer()
+         imputer_params = {
+             "imputer__initial_strategy": ["mean", "median", "most_frequent"],
+             "imputer__imputation_order": ["ascending", "descending"],
+         }
+     else:
+         print(f"unidentified option for arg, imputer_type: {imputer_type}")
+         sys.exit(0)
+     return imputer, imputer_params
+
+ def get_scaler():
+     scaler = StandardScaler()
+     return scaler
+
+ def get_pca(max_num_feats):
+     pca = PCA()
+     pca_params = {
+         "pca__n_components": np.arange(2, max_num_feats+1),
+     }
+     return pca, pca_params
+
+ def get_classifier(classifier_type):
+     # setup parameter search space for different classifiers
+
+     classifier, classifier_params = None, None
+     if classifier_type == "ada_boost":
+         classifier = AdaBoostClassifier()
+         classifier_params = {
+             "classifier__learning_rate": [0.5, 1, 1.5, 2, 2.5, 3],
+             "classifier__n_estimators": [100, 200, 500],
+         }
+     elif classifier_type == "log_reg":
+         classifier = LogisticRegression(max_iter=200, solver="saga")
+         classifier_params = {
+             "classifier__penalty": ["l1", "l2", "elasticnet"],
+             "classifier__class_weight": [None, "balanced"],
+             "classifier__C": [0.1, 0.5, 1, 2],
+             "classifier__l1_ratio": np.arange(0.1, 1, 0.1),
+         }
+     elif classifier_type == "random_forest":
+         classifier = RandomForestClassifier()
+         classifier_params = {
+             "classifier__n_estimators": [100, 250],
+             "classifier__criterion": ["gini", "entropy"],
+             "classifier__max_depth": [None, 10, 25, 50, 75],
+             "classifier__min_samples_leaf": [1, 5, 10, 20],
+             "classifier__min_samples_split": [2, 3, 4, 5],
+         }
+     elif classifier_type == "svc":
+         classifier = SVC()
+         classifier_params = {
+             "classifier__C": [0.5, 1, 1.5, 2, 2.5],
+             "classifier__kernel": ["linear", "poly", "rbf", "sigmoid"],
+             "classifier__degree": [2, 3, 4],
+         }
+     elif classifier_type == "light_gbm":
+         classifier = lgbm.LGBMClassifier(
+             boosting_type="gbdt", objective="binary", metric="auc", verbosity=-1)
+         classifier_params = {
+             "classifier__num_leaves": [15, 31, 63, 127, 255],
+             "classifier__learning_rate": [0.1, 0.5, 1, 2],
+             "classifier__n_estimators": [100, 500, 1000],
+             "classifier__reg_lambda": [0.1, 0.5, 1],
+             "classifier__min_data_in_leaf": [10, 20, 30, 50],
+         }
+     else:
+         print(f"unidentified option for arg, classifier_type: {classifier_type}")
+         sys.exit(0)
+
+     return classifier, classifier_params
+
+ def get_pipeline_params(imputer_params, classifier_params):
+     pipeline_params = {**imputer_params, **classifier_params}
+     return pipeline_params
+
+ def train_model(df_train, df_test, imputer_type, classifier_type):
+     # get data arrays from the data frame for train and test sets
+     X_train, Y_train = get_data_from_data_frame(df_train)
+     X_test, Y_test = get_data_from_data_frame(df_test)
+
+     # get imputer and its params
+     imputer, imputer_params = get_imputer(imputer_type)
+
+     # get classifier and its params
+     classifier, classifier_params = get_classifier(classifier_type)
+
+     # get the pipeline params
+     pipeline_params = get_pipeline_params(imputer_params, classifier_params)
+
+     print("\n" + "-"*100)
+     # build the model pipeline
+     if classifier_type == "svc" or classifier_type == "log_reg":
+         scaler = get_scaler()
+         pca, pca_params = get_pca(X_train.shape[1])
+         print(f"Started training the model with the imputer: {imputer_type}, preprocessing: std_scaler + pca, classifier: {classifier_type}")
+
+         pipeline = Pipeline([("imputer", imputer), ("scaler", scaler), ("pca", pca), ("classifier", classifier)])
+         pipeline_params = get_pipeline_params(pipeline_params, pca_params)
+     else:
+         print(f"Started training the model with the imputer: {imputer_type}, classifier: {classifier_type}")
+         pipeline = Pipeline([("imputer", imputer), ("classifier", classifier)])
+     print("Model pipeline params space: ")
+     print(pipeline_params)
+     print("-"*100)
+
+     # setup grid search with k-fold cross validation
+     k_fold_cv = KFold(n_splits=5, shuffle=True, random_state=4)
+     grid_cv = GridSearchCV(pipeline, pipeline_params, scoring="f1", cv=k_fold_cv)
+     grid_cv.fit(X_train, Y_train)
+
+     # get the cross validation score and the params for the best estimator
+     cv_best_estimator = grid_cv.best_estimator_
+     cv_best_f1 = grid_cv.best_score_
+     cv_best_params = grid_cv.best_params_
+
+     # predict and compute train set metrics
+     Y_train_pred = cv_best_estimator.predict(X_train)
+     train_f1 = f1_score(Y_train, Y_train_pred)
+     train_acc = accuracy_score(Y_train, Y_train_pred)
+
+     # predict and compute test set metrics
+     Y_test_pred = cv_best_estimator.predict(X_test)
+     test_f1 = f1_score(Y_test, Y_test_pred)
+     test_acc = accuracy_score(Y_test, Y_test_pred)
+
+     print("\n" + "-"*50)
+     # begin mlflow logging for the best estimator
+     mlflow.set_experiment("water_potability")
+     experiment = mlflow.get_experiment_by_name("water_potability")
+     print(f"Started mlflow logging for the best estimator")
+     with mlflow.start_run(experiment_id=experiment.experiment_id):
+         # log the model and the metrics
+         mlflow.sklearn.log_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
+         mlflow.sklearn.save_model(cv_best_estimator, f"{imputer_type}_{classifier_type}")
+         mlflow.log_params(cv_best_params)
+         mlflow.log_metric("cv_f1_score", cv_best_f1)
+         mlflow.log_metric("train_f1_score", train_f1)
+         mlflow.log_metric("train_acc_score", train_acc)
+         mlflow.log_metric("test_f1_score", test_f1)
+         mlflow.log_metric("test_acc_score", test_acc)
+         # end mlflow logging
+         mlflow.end_run()
+     print(f"Completed mlflow logging for the best estimator")
+     print("-"*50)
+     return
+
+ def init_and_train_model(ARGS):
+     df_csv = read_csv_file(ARGS.file_csv)
+     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
+
+     num_samples_train = df_train.shape[0]
+     num_samples_test = df_test.shape[0]
+
+     print("\n" + "-"*40)
+     print("Num samples after splitting the dataset")
+     print("-"*40)
+     print(f"train: {num_samples_train}, test: {num_samples_test}")
+
+     print("\n" + "-"*40)
+     print("A few samples from train data")
+     print("-"*40)
+     print(df_train.head())
+
+     if ARGS.is_train:
+         train_model(df_train, df_test, ARGS.imputer_type, ARGS.classifier_type)
+     return
+
+ def main():
+     file_csv = "dataset/water_potability.csv"
+     classifier_type = "ada_boost"
+     imputer_type = "knn"
+     is_train = 1
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument("--file_csv", default=file_csv,
+         type=str, help="full path to dataset csv file")
+     parser.add_argument("--is_train", default=is_train,
+         type=int, choices=[0, 1], help="to train or not")
+     parser.add_argument("--classifier_type", default=classifier_type,
+         type=str, choices=["ada_boost", "log_reg", "random_forest", "svc", "light_gbm"],
+         help="classifier to be used in the training model pipeline")
+     parser.add_argument("--imputer_type", default=imputer_type,
+         type=str, choices=["simple", "knn", "iterative"],
+         help="imputer to be used in the training model pipeline")
+
+     ARGS, unparsed = parser.parse_known_args()
+     init_and_train_model(ARGS)
+     return
+
+ if __name__ == "__main__":
+     main()
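
The parameter grids above follow scikit-learn's step__param naming convention: each key prefix must match a named step in the Pipeline so that GridSearchCV can route the candidate values to the right estimator. A self-contained sketch of the same pattern on synthetic data (illustrative only; the real script tunes the grids defined in get_imputer and get_classifier):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, KFold

# synthetic binary-classification data with roughly 10% missing values
rng = np.random.default_rng(4)
X = rng.normal(size=(200, 5))
X[rng.random(X.shape) < 0.1] = np.nan
Y = (rng.random(200) > 0.5).astype(int)

# step names ("imputer", "classifier") must match the param-grid key prefixes
pipeline = Pipeline([("imputer", KNNImputer()), ("classifier", AdaBoostClassifier())])
param_grid = {
    "imputer__n_neighbors": [5, 7],
    "classifier__n_estimators": [50, 100],
}
grid_cv = GridSearchCV(pipeline, param_grid, scoring="f1",
                       cv=KFold(n_splits=5, shuffle=True, random_state=4))
grid_cv.fit(X, Y)
print(grid_cv.best_params_, grid_cv.best_score_)
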
modeling/ml_model_test.py ADDED
@@ -0,0 +1,38 @@
+ import argparse
+
+ import mlflow
+ import numpy as np
+ from sklearn.metrics import classification_report
+
+ from ml_model_dev import load_ml_model, train_test_split, read_csv_file
+
+ def test_ml_pipeline(ARGS):
+     df_csv = read_csv_file(ARGS.file_csv)
+     df_train, df_test = train_test_split(df_csv, test_size=0.1, random_state=4)
+     arr_test = df_test.to_numpy()
+     X_test, Y_test = arr_test[:, :-1], arr_test[:, -1:].reshape(-1)
+
+     model_pipeline = load_ml_model(ARGS.pkl_file_name)
+     Y_pred_test = model_pipeline.predict(X_test)
+     print(classification_report(Y_test, Y_pred_test))
+     return
+
+ def main():
+     file_csv = "dataset/water_potability.csv"
+     pkl_file_name = "trained_models/knn_ada_boost"
+
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument("--file_csv", default=file_csv,
+         type=str, help="full path to dataset csv file")
+     parser.add_argument("--pkl_file_name", default=pkl_file_name,
+         type=str, help="full path to ml model pkl file")
+
+     ARGS, unparsed = parser.parse_known_args()
+     test_ml_pipeline(ARGS)
+     return
+
+ if __name__ == "__main__":
+     main()
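
Note that train_model saves the best estimator with mlflow.sklearn.save_model to a local directory named {imputer_type}_{classifier_type} (e.g. knn_ada_boost), while the default --pkl_file_name here is trained_models/knn_ada_boost; this assumes the saved model directory has been moved under trained_models/. A minimal load-and-predict sketch under that assumption (the feature values below are made up; NaNs would be handled by the pipeline's imputer step):

import numpy as np
import mlflow.sklearn

model_pipeline = mlflow.sklearn.load_model("trained_models/knn_ada_boost")  # hypothetical path

# one row with the nine feature columns of water_potability.csv, in dataset order
sample = np.array([[7.0, 200.0, 20000.0, 7.0, 330.0, 420.0, 14.0, 66.0, 4.0]])
print(model_pipeline.predict(sample))
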