Spaces:

pkiage
/

credit_risk_modeling_demo

Build error

App Files Files Community

pkiage committed on Feb 1, 2022

Commit

232e5e5

0 Parent(s):

Initial commit

Browse files

Files changed (19) hide show

.gitignore +7 -0
README.md +30 -0
app.py +52 -0
common/__init__.py +0 -0
common/data.py +94 -0
common/util.py +391 -0
common/views.py +361 -0
data/processed/cr_loan_w2.csv +0 -0
data_setup.py +180 -0
poetry.lock +0 -0
pyproject.toml +30 -0
views/__init__.py +0 -0
views/decision_tree.py +70 -0
views/evaluation.py +410 -0
views/logistic.py +119 -0
views/model_comparison.py +81 -0
views/strategy_table.py +96 -0
views/threshold.py +272 -0
views/typing.py +15 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+*.png
+*.pyc
+.env
+.envrc
+*.ipynb
+__pycache__
+.vs

README.md ADDED Viewed

	@@ -0,0 +1,30 @@

+# Credit Risk Modelling
+# About
+An interactive tool demonstrating credit risk modelling.
+## Built With
+- [Streamlit](https://streamlit.io/)
+# References
+## Inspiration:
+[Credit Risk Modeling in Python by Datacamp](https://www.datacamp.com/courses/credit-risk-modeling-in-python)
+- General Methodology
+- Data
+[A Gentle Introduction to Threshold-Moving for Imbalanced Classification](https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/)
+- Selecting optimal threshold using Youden's J statistic
+## Political, Economic, Social, Technological, Legal and Environmental (PESTLE):
+[Europe fit for the Digital Age: Commission proposes new rules and actions for excellence and trust in Artificial Intelligence](https://ec.europa.eu/commission/presscorner/detail/en/ip_21_1682)
+[LAYING DOWN HARMONISED RULES ON ARTIFICIAL INTELLIGENCE (ARTIFICIAL INTELLIGENCE ACT) AND AMENDING CERTAIN UNION LEGISLATIVE ACTS](https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:52021PC0206&from=EN)
+"(37) Another area in which the use of AI systems deserves special consideration is the access to and enjoyment of certain essential private and public services and benefits necessary for people to fully participate in society or to improve one’s standard of living. In particular, AI systems used to evaluate the credit score or creditworthiness of natural persons should be classified as high-risk AI systems, since they determine those persons’ access to financial resources or essential services such as housing, electricity, and telecommunication services. AI systems used for this purpose may lead to discrimination of persons or groups and perpetuate historical patterns of discrimination, for example based on racial or ethnic origins, disabilities, age, sexual orientation, or create new forms of discriminatory impacts. Considering the very limited scale of the impact and the available alternatives on the market, it is appropriate to exempt AI systems for the purpose of creditworthiness assessment and credit scoring when put into service by small-scale providers for their own use. Natural persons applying for or receiving public assistance benefits and services from public authorities are typically dependent on those benefits and services and in a vulnerable position in relation to the responsible authorities. If AI systems are used for determining whether such benefits and services should be denied, reduced, revoked or reclaimed by authorities, they may have a significant impact on persons’ livelihood and may infringe their fundamental rights, such as the right to social protection, non-discrimination, human dignity or an effective remedy. Those systems should therefore be classified as high-risk. 
Nonetheless, this Regulation should not hamper the development and use of innovative approaches in the public administration, which would stand to benefit from a wider use of compliant and safe AI systems, provided that those systems do not entail a high risk to legal and natural persons."

app.py ADDED Viewed

	@@ -0,0 +1,52 @@

+from typing import OrderedDict
+import streamlit as st
+from data_setup import initialise_data
+from views.decision_tree import decisiontree_view
+from views.logistic import logistic_view
+from views.model_comparison import model_comparison_view
+from views.strategy_table import strategy_table_view
+def main():
+    currency_options = ["USD", "KES", "GBP"]
+    currency = st.sidebar.selectbox(
+        label="What currency will you be using?", options=currency_options
+    )
+    st.title("GUI for Credit Risk Modelling")
+    st.title("Data")
+    (_dataset, split_dataset) = initialise_data()
+    st.title("Modelling")
+    model_options = ["Logistic Regression", "Decision Trees"]
+    # Returns list
+    models_selected_list = st.sidebar.multiselect(
+        label="Select model", options=model_options, default=model_options
+    )
+    models_selected_set = set(models_selected_list)
+    model_views = OrderedDict()
+    if "Logistic Regression" in models_selected_set:
+        logistic_model_view = logistic_view(split_dataset, currency)
+        model_views["Logistic Regression"] = logistic_model_view
+    if "Decision Trees" in models_selected_set:
+        decision_tree_model_view = decisiontree_view(split_dataset, currency)
+        model_views["Decision Trees"] = decision_tree_model_view
+    if models_selected_list:
+        model_comparison_view(
+            split_dataset,
+            model_views,
+        )
+        strategy_table_view(currency, model_views)
+if __name__ == "__main__":
+    main()

common/__init__.py ADDED Viewed

File without changes

common/data.py ADDED Viewed

	@@ -0,0 +1,94 @@

+from typing import List, Union, cast
+from dataclasses import dataclass
+from sklearn.model_selection import train_test_split
+import pandas as pd
+from common.util import drop_columns
+@dataclass
+class SplitDataset:
+    X_test: pd.DataFrame
+    X_train: pd.DataFrame
+    y_test: pd.Series
+    y_train: pd.Series
+    @property
+    def X_y_test(self) -> pd.DataFrame:
+        return pd.concat(
+            cast(
+                List[Union[pd.DataFrame, pd.Series]],
+                [
+                    self.X_test.reset_index(drop=True),
+                    self.y_test.reset_index(drop=True),
+                ],
+            ),
+            axis=1,
+        )
+    @property
+    def X_y_train(self) -> pd.DataFrame:
+        return pd.concat(
+            cast(
+                List[Union[pd.DataFrame, pd.Series]],
+                [
+                    self.X_train.reset_index(drop=True),
+                    self.y_train.reset_index(drop=True),
+                ],
+            ),
+            axis=1,
+        )
+@dataclass
+class Dataset:
+    df: pd.DataFrame
+    random_state: int
+    test_size: int
+    @property
+    def y_value(self) -> pd.DataFrame:
+        return self.df["loan_status"]
+    @property
+    def x_values(self) -> pd.DataFrame:
+        return cast(
+            pd.DataFrame,
+            drop_columns(
+                self.df,
+                [
+                    "loan_status",
+                    "loan_grade_A",
+                    "loan_grade_B",
+                    "loan_grade_C",
+                    "loan_grade_D",
+                    "loan_grade_E",
+                    "loan_grade_F",
+                    "loan_grade_G",
+                ],
+            ),
+        )
+    @property
+    def x_values_column_names(self):
+        return self.x_values.columns.tolist()
+    def x_values_filtered_columns(self, columns: List[str]) -> pd.DataFrame:
+        return self.df.filter(columns)
+    def train_test_split(
+        self, selected_x_values: pd.DataFrame
+    ) -> SplitDataset:
+        X_train, X_test, y_train, y_test = train_test_split(
+            selected_x_values,
+            self.y_value,
+            test_size=self.test_size / 100,  # since up was given as pct
+            random_state=self.random_state,
+        )
+        return SplitDataset(
+            X_train=cast(pd.DataFrame, X_train),
+            X_test=cast(pd.DataFrame, X_test),
+            y_train=cast(pd.Series, y_train),
+            y_test=cast(pd.Series, y_test),
+        )

common/util.py ADDED Viewed

	@@ -0,0 +1,391 @@

+# DATA MANIPULATION & ANALYSIS
+import pickle
+import streamlit as st
+# Arrays
+import numpy as np
+# DataFrames and Series
+import pandas as pd
+# Returns the indices of the maximum values along an axis
+from numpy import argmax
+# MODELLING
+# Logistic regression
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import StratifiedKFold
+# XGBoosted Decision Trees
+import xgboost as xgb
+# REPORTING, EVALUATION, AND INTERPRETATION
+# Classification report
+from sklearn.metrics import classification_report
+# Receiver Operating Characteristic (ROC) curve
+from sklearn.metrics import roc_curve
+# Evaluate a score by cross-validation
+from sklearn.model_selection import cross_val_score
+# # Functions
+def drop_columns(df, columns):
+    return df.drop(columns, axis=1)
+def remove_less_than_0_columns(df, column):
+    df[column].dropna()
+    return df.loc[(df[column] != 0).any(1)]
+def boolean_int_condition_label(df, label_column_name, condition):
+    df[label_column_name] = condition
+    y = df[label_column_name].astype(int)
+    df = drop_columns(df, label_column_name)
+    return y, df
+@st.cache(suppress_st_warning=True)
+def undersample_training_data(
+    df: pd.DataFrame, column_name: str, split_dataset
+):
+    count_nondefault, count_default = split_dataset.X_y_train[
+        column_name
+    ].value_counts()
+    nondefaults = df[df[column_name] == 0]  # 0
+    defaults = df[df[column_name] == 1]
+    under_sample = min(count_nondefault, count_default)
+    nondefaults_under = nondefaults.sample(under_sample)
+    defaults_under = defaults.sample(under_sample)
+    X_y_train_under = pd.concat(
+        [
+            nondefaults_under.reset_index(drop=True),
+            defaults_under.reset_index(drop=True),
+        ],
+        axis=0,
+    )
+    X_train_under = X_y_train_under.drop([column_name], axis=1)  # remove label
+    y_train_under = X_y_train_under[column_name]  # label only
+    class_balance_default = X_y_train_under[column_name].value_counts()
+    return [
+        X_train_under,
+        y_train_under,
+        X_y_train_under,
+        class_balance_default,
+    ]
+def create_coeffient_feature_dictionary_logistic_model(
+    logistic_model, training_data
+):
+    return {
+        feat: coef
+        for coef, feat in zip(
+            logistic_model.coef_[0, :], training_data.columns
+        )
+    }
+@st.cache(suppress_st_warning=True)
+def test_variables_logistic(X_train, y_train):
+    # Create and fit the logistic regression model
+    return LogisticRegression(solver="lbfgs").fit(X_train, np.ravel(y_train))
+@st.cache(suppress_st_warning=True)
+def print_coeff_logistic(clf_logistic_model, split_dataset):
+    # Dictionary of features and their coefficients
+    return create_coeffient_feature_dictionary_logistic_model(
+        clf_logistic_model, split_dataset.X_train
+    )
+@st.cache(suppress_st_warning=True, hash_funcs={
+    xgb.XGBClassifier: pickle.dumps
+})
+def test_variables_gbt(X_train, y_train):
+    # Using hyperparameters learning_rate and max_depth
+    return xgb.XGBClassifier(
+        learning_rate=0.1,
+        max_depth=7,
+        use_label_encoder=False,
+        eval_metric="logloss",
+    ).fit(X_train, np.ravel(y_train), eval_metric="logloss")
+# In[398]:
+def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
+    model, X, y, threshold, loan_amount_col_name
+):
+    true_status = y.to_frame()
+    loan_amount = X[loan_amount_col_name]
+    clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
+    clf_prediction_prob_df = pd.DataFrame(
+        clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
+    )
+    clf_thresh_predicted_default_status = (
+        clf_prediction_prob_df["PROB_DEFAULT"]
+        .apply(lambda x: 1 if x > threshold else 0)
+        .rename("PREDICT_DEFAULT_STATUS")
+    )
+    return pd.concat(
+        [
+            true_status.reset_index(drop=True),
+            clf_prediction_prob_df.reset_index(drop=True),
+            clf_thresh_predicted_default_status.reset_index(drop=True),
+            loan_amount.reset_index(drop=True),
+        ],
+        axis=1,
+    )
+def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
+    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
+    # get the best threshold
+    # Youden’s J statistic tpr-fpr
+    # Argmax to get the index in
+    # thresholds
+    return thresholds[argmax(tpr - fpr)]
+# In[399]:
+# Function that makes dataframe with probability of default, predicted default status based on threshold
+# and actual default status
+def model_probability_values_df(model, X):
+    return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
+def apply_threshold_to_probability_values(probability_values, threshold):
+    return (
+        probability_values["PROB_DEFAULT"]
+        .apply(lambda x: 1 if x > threshold else 0)
+        .rename("PREDICT_DEFAULT_STATUS")
+    )
+@st.cache(suppress_st_warning=True)
+def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
+    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
+    # get the best threshold
+    J = tpr - fpr  # Youden’s J statistic
+    ix = argmax(J)
+    return thresholds[ix]
+# In[401]:
+def create_cross_validation_df(
+    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
+):
+    # Test data x and y
+    DTrain = xgb.DMatrix(X, label=y)
+    # auc or logloss
+    params = {
+        "eval_metric": eval_metric,
+        "objective": "binary:logistic",  # logistic say 0 or 1 for loan status
+        "seed": seed,
+    }
+    # Create the data frame of cross validations
+    cv_df = xgb.cv(
+        params,
+        DTrain,
+        num_boost_round=trees,
+        nfold=n_folds,
+        early_stopping_rounds=early_stopping_rounds,
+        shuffle=True,
+    )
+    return [DTrain, cv_df]
+# In[450]:
+def cross_validation_scores(model, X, y, nfold, score, seed):
+    # return cv scores of metric
+    return cross_val_score(
+        model,
+        np.ascontiguousarray(X),
+        np.ravel(np.ascontiguousarray(y)),
+        cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
+        scoring=score,
+    )
+def default_status_per_threshold(threshold_list, prob_default):
+    threshold_default_status_list = []
+    for threshold in threshold_list:
+        threshold_default_status = prob_default.apply(
+            lambda x: 1 if x > threshold else 0
+        )
+        threshold_default_status_list.append(threshold_default_status)
+    return threshold_default_status_list
+def classification_report_per_threshold(
+    threshold_list, threshold_default_status_list, y_test
+):
+    target_names = ["Non-Default", "Default"]
+    classification_report_list = []
+    for threshold_default_status in threshold_default_status_list:
+        thresh_classification_report = classification_report(
+            y_test,
+            threshold_default_status,
+            target_names=target_names,
+            output_dict=True,
+            zero_division=0,
+        )
+        classification_report_list.append(thresh_classification_report)
+    # Return threshold classification report dict
+    return dict(zip(threshold_list, classification_report_list))
+def thresh_classification_report_recall_accuracy(
+    thresh_classification_report_dict,
+):
+    thresh_def_recalls_list = []
+    thresh_nondef_recalls_list = []
+    thresh_accs_list = []
+    for x in [*thresh_classification_report_dict]:
+        thresh_def_recall = thresh_classification_report_dict[x]["Default"][
+            "recall"
+        ]
+        thresh_def_recalls_list.append(thresh_def_recall)
+        thresh_nondef_recall = thresh_classification_report_dict[x][
+            "Non-Default"
+        ]["recall"]
+        thresh_nondef_recalls_list.append(thresh_nondef_recall)
+        thresh_accs = thresh_classification_report_dict[x]["accuracy"]
+        thresh_accs_list.append(thresh_accs)
+    return [
+        thresh_def_recalls_list,
+        thresh_nondef_recalls_list,
+        thresh_accs_list,
+    ]
+def create_accept_rate_list(start, end, samples):
+    return np.linspace(start, end, samples, endpoint=True)
+def create_strategyTable_df(
+    start, end, samples, actual_probability_predicted_acc_rate, true, currency
+):
+    accept_rates = create_accept_rate_list(start, end, samples)
+    thresholds_strat = []
+    bad_rates_start = []
+    Avg_Loan_Amnt = actual_probability_predicted_acc_rate[true].mean()
+    num_accepted_loans_start = []
+    for rate in accept_rates:
+        # Calculate the threshold for the acceptance rate
+        thresh = np.quantile(
+            actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
+        ).round(3)
+        # Add the threshold value to the list of thresholds
+        thresholds_strat.append(
+            np.quantile(
+                actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
+            ).round(3)
+        )
+        # Reassign the loan_status value using the threshold
+        actual_probability_predicted_acc_rate[
+            "PREDICT_DEFAULT_STATUS"
+        ] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
+            lambda x: 1 if x > thresh else 0
+        )
+        # Create a set of accepted loans using this acceptance rate
+        accepted_loans = actual_probability_predicted_acc_rate[
+            actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
+            == 0
+        ]
+        # Calculate and append the bad rate using the acceptance rate
+        bad_rates_start.append(
+            np.sum((accepted_loans[true]) / len(accepted_loans[true])).round(3)
+        )
+        # Accepted loans
+        num_accepted_loans_start.append(len(accepted_loans))
+    # Calculate estimated value
+    money_accepted_loans = [
+        accepted_loans * Avg_Loan_Amnt
+        for accepted_loans in num_accepted_loans_start
+    ]
+    money_bad_accepted_loans = [
+        2 * money_accepted_loan * bad_rate
+        for money_accepted_loan, bad_rate in zip(
+            money_accepted_loans, bad_rates_start
+        )
+    ]
+    zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
+    estimated_value = [
+        money_accepted_loan - money_bad_accepted_loan
+        for money_accepted_loan, money_bad_accepted_loan in zip_object
+    ]
+    accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
+    thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
+    bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
+    estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
+    return (
+        pd.DataFrame(
+            zip(
+                accept_rates,
+                thresholds_strat,
+                bad_rates_start,
+                num_accepted_loans_start,
+                estimated_value,
+            ),
+            columns=[
+                "Acceptance Rate",
+                "Threshold",
+                "Bad Rate",
+                "Num Accepted Loans",
+                f"Estimated Value ({currency})",
+            ],
+        )
+        .sort_values(by="Acceptance Rate", axis=0, ascending=False)
+        .reset_index(drop=True)
+    )

common/views.py ADDED Viewed

	@@ -0,0 +1,361 @@

+from typing import OrderedDict
+import streamlit as st  # works on command prompt
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.metrics import (
+    roc_curve,
+)
+from sklearn.calibration import calibration_curve
+from xgboost import plot_tree
+from views.typing import ModelView
+def plot_logistic_coeff_barh(coef_dict, x, y):
+    fig = plt.figure(figsize=(x, y))
+    coef_dict_sorted = dict(
+        sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
+    )
+    plt.barh(*zip(*coef_dict_sorted.items()))
+    return fig
+def print_negative_coefficients_logistic_model(coef_dict):
+    # Equal to or less than 0
+    NegativeCoefficients = dict(
+        filter(lambda x: x[1] <= 0.0, coef_dict.items())
+    )
+    NegativeCoefficientsSorted = sorted(
+        NegativeCoefficients.items(), key=lambda x: x[1], reverse=False
+    )
+    text = (
+        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
+        "\n{negative_features}:"
+    )
+    st.markdown(text.format(negative_features=NegativeCoefficientsSorted))
+    st.markdown(type(NegativeCoefficientsSorted))
+    st.markdown(NegativeCoefficients.items())
+def print_positive_coefficients_logistic_model(coef_dict):
+    # Equal to or greater than 0
+    PositiveCoefficients = dict(
+        filter(lambda x: x[1] >= 0.0, coef_dict.items())
+    )
+    PositiveCoefficientsSorted = sorted(
+        PositiveCoefficients.items(), key=lambda x: x[1], reverse=True
+    )
+    text = (
+        "\n\nFeatures the model found to be positively correlated with probability of default are:"
+        "\n{positive_features}:"
+    )
+    st.markdown(text.format(positive_features=PositiveCoefficientsSorted))
+def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
+    axobject1 = xgb.plot_importance(clf_gbt_model, importance_type="weight")
+    fig1 = axobject1.figure
+    st.write("Feature Importance Plot (Gradient Boosted Tree)")
+    fig1.set_size_inches(barxsize, barysize)
+    return fig1
+def download_importance_gbt(fig1, barxsize, barysize):
+    if st.button(
+        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
+    ):
+        dpisize = max(barxsize, barysize)
+        plt.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
+        fig1.set_size_inches(barxsize, barysize)
+def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
+    plot_tree(clf_gbt_model)
+    fig2 = plt.gcf()
+    fig2.set_size_inches(treexsize, treeysize)
+    return fig2
+def download_tree_gbt(treexsize, treeysize):
+    if st.button("Download Decision Tree Plot as png (Gradient Boosted Tree)"):
+        dpisize = max(treexsize, treeysize)
+        plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")
+def cross_validation_graph(cv, eval_metric, trees):
+    # Plot the test AUC scores for each iteration
+    fig = plt.figure()
+    plt.plot(cv[cv.columns[2]])
+    plt.title(
+        "Test {eval_metric} Score Over {it_numbr} Iterations".format(
+            eval_metric=eval_metric, it_numbr=trees
+        )
+    )
+    plt.xlabel("Iteration Number")
+    plt.ylabel("Test {eval_metric} Score".format(eval_metric=eval_metric))
+    return fig
+def recall_accuracy_threshold_tradeoff_fig(
+    widthsize,
+    heightsize,
+    threshold_list,
+    thresh_def_recalls_list,
+    thresh_nondef_recalls_list,
+    thresh_accs_list,
+):
+    fig = plt.figure(figsize=(widthsize, heightsize))
+    plt.plot(threshold_list, thresh_def_recalls_list, label="Default Recall")
+    plt.plot(
+        threshold_list, thresh_nondef_recalls_list, label="Non-Default Recall"
+    )
+    plt.plot(threshold_list, thresh_accs_list, label="Model Accuracy")
+    plt.xlabel("Probability Threshold")
+    plt.ylabel("Score")
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    plt.legend()
+    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
+    plt.grid(False)
+    return fig
+def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
+    colors = ["blue", "green"]
+    fig = plt.figure()
+    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
+        fpr, tpr, _thresholds = roc_curve(
+            y, model_view.prediction_probability_df
+        )
+        plt.plot(fpr, tpr, color=colors[color_idx], label=f"{model_name}")
+    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")
+    model_names = list(model_views.keys())
+    if not model_names:
+        model_name_str = "None"
+    elif len(model_names) == 1:
+        model_name_str = model_names[0]
+    else:
+        model_name_str = " and ".join(
+            [", ".join(model_names[:-1]), model_names[-1]]
+        )
+    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
+    plt.xlabel("False Positive Rate (FP Rate)")
+    plt.ylabel("True Positive Rate (TP Rate)")
+    plt.legend()
+    plt.grid(False)
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    return fig
+def calibration_curve_report_commented_n(
+    y, model_views: OrderedDict[str, ModelView], bins: int
+):
+    fig = plt.figure()
+    for model_name, model_view in model_views.items():
+        frac_of_pos, mean_pred_val = calibration_curve(
+            y,
+            model_view.prediction_probability_df,
+            n_bins=bins,
+            normalize=True,
+        )
+        plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")
+    # Create the calibration curve plot with the guideline
+    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
+    plt.ylabel("Fraction of positives")
+    plt.xlabel("Average Predicted Probability")
+    plt.title("Calibration Curve")
+    plt.legend()
+    plt.grid(False)
+    plt.xlim(0, 1)
+    plt.ylim(0, 1)
+    return fig
+def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
+    # Probability distribution
+    probability_stat_distribution = probability_default.describe()
+    # Acceptance rate threshold
+    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
+    fig = plt.figure()
+    plt.hist(
+        probability_default,
+        color="blue",
+        bins=bins,
+        histtype="bar",
+        ec="white",
+    )
+    # Add a reference line to the plot for the threshold
+    plt.axvline(x=acc_rate_thresh, color="red")
+    plt.title("Acceptance Rate Thershold")
+    return (
+        fig,
+        probability_stat_distribution,
+        acc_rate_thresh,
+    )
+def streamlit_2columns_metrics_pct_df(
+    column1name_label: str,
+    column2name_label: str,
+    df: pd.DataFrame,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value="{:.0%}".format(df.value_counts().get(1) / df.shape[0]),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value="{:.0%}".format(df.value_counts().get(0) / df.shape[0]),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_df(
+    column1name_label: str,
+    column2name_label: str,
+    df: pd.DataFrame,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value=df.value_counts().get(1),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value=df.value_counts().get(0),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label="Rows",
+            value=df.shape[0],
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label="Columns",
+            value=df.shape[1],
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_pct_series(
+    column1name_label: str,
+    column2name_label: str,
+    series: pd.Series,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value="{:.0%}".format(series.get(1) / series.sum()),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value="{:.0%}".format(series.get(0) / series.sum()),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_2columns_metrics_series(
+    column1name_label: str,
+    column2name_label: str,
+    series: pd.Series,
+):
+    (
+        column1name,
+        column2name,
+    ) = st.columns(2)
+    with column1name:
+        st.metric(
+            label=column1name_label,
+            value=series.get(1),
+            delta=None,
+            delta_color="normal",
+        )
+    with column2name:
+        st.metric(
+            label=column2name_label,
+            value=series.get(0),
+            delta=None,
+            delta_color="normal",
+        )
+def streamlit_chart_setting_height_width(
+    title: str,
+    default_widthvalue: int,
+    default_heightvalue: int,
+    widthkey: str,
+    heightkey: str,
+):
+    with st.expander(title):
+        lbarx_col, lbary_col = st.columns(2)
+        with lbarx_col:
+            width_size = st.number_input(
+                label="Width in inches:",
+                value=default_widthvalue,
+                key=widthkey,
+            )
+        with lbary_col:
+            height_size = st.number_input(
+                label="Height in inches:",
+                value=default_heightvalue,
+                key=heightkey,
+            )
+    return width_size, height_size

data/processed/cr_loan_w2.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data_setup.py ADDED Viewed

	@@ -0,0 +1,180 @@

+from typing import Tuple, cast
+import pandas as pd
+import streamlit as st
+from common.data import Dataset, SplitDataset
+from common.util import (
+    undersample_training_data,
+)
+from common.views import (
+    streamlit_2columns_metrics_df_shape,
+    streamlit_2columns_metrics_series,
+    streamlit_2columns_metrics_pct_series,
+    streamlit_2columns_metrics_df,
+    streamlit_2columns_metrics_pct_df,
+)
+# Initialize dataframe session state
+def initialise_data() -> Tuple[Dataset, SplitDataset]:
+    if "input_data_frame" not in st.session_state:
+        st.session_state.input_data_frame = pd.read_csv(
+            r"./data/processed/cr_loan_w2.csv"
+        )
+    if "dataset" not in st.session_state:
+        df = cast(pd.DataFrame, st.session_state.input_data_frame)
+        dataset = Dataset(
+            df=df,
+            random_state=123235,
+            test_size=40,
+        )
+        st.session_state.dataset = dataset
+    else:
+        dataset = st.session_state.dataset
+    st.write(
+        "Assuming data is already cleaned and relevant features (predictors) added."
+    )
+    with st.expander("Input Dataframe (X and y)"):
+        st.dataframe(dataset.df)
+        streamlit_2columns_metrics_df_shape(dataset.df)
+    st.header("Predictors")
+    possible_columns = dataset.x_values_column_names
+    selected_columns = st.sidebar.multiselect(
+        label="Select Predictors",
+        options=possible_columns,
+        default=possible_columns,
+    )
+    selected_x_values = dataset.x_values_filtered_columns(selected_columns)
+    st.sidebar.metric(
+        label="# of Predictors Selected",
+        value=selected_x_values.shape[1],
+        delta=None,
+        delta_color="normal",
+    )
+    with st.expander("Predictors Dataframe (X)"):
+        st.dataframe(selected_x_values)
+        streamlit_2columns_metrics_df_shape(selected_x_values)
+    # 40% of data used for training
+    # 14321 as random seed for reproducability
+    st.header("Split Testing and Training Data")
+    test_size_slider_col, seed_col = st.columns(2)
+    with test_size_slider_col:
+        # Initialize test size
+        dataset.test_size = st.slider(
+            label="Test Size Percentage of Input Dataframe:",
+            min_value=0,
+            max_value=100,
+            value=dataset.test_size,
+            key="init_test_size",
+            format="%f%%",
+        )
+    with seed_col:
+        dataset.random_state = int(
+            st.number_input(label="Random State:", value=dataset.random_state)
+        )
+    split_dataset = dataset.train_test_split(selected_x_values)
+    # Series
+    true_status = split_dataset.y_test.to_frame().value_counts()
+    st.sidebar.metric(
+        label="Testing Data # of Actual Default (=1)",
+        value=true_status.get(1),
+    )
+    st.sidebar.metric(
+        label="Testing Data % of Actual Default",
+        value="{:.0%}".format(true_status.get(1) / true_status.sum()),
+    )
+    st.sidebar.metric(
+        label="Testing Data # of Actual Non-Default (=0)",
+        value=true_status.get(0),
+    )
+    st.sidebar.metric(
+        label="Testing Data % of Actual Non-Default",
+        value="{:.0%}".format(true_status.get(0) / true_status.sum()),
+    )
+    # Concat the testing sets
+    X_y_test = split_dataset.X_y_test
+    X_y_train = split_dataset.X_y_train
+    with st.expander("Testing Dataframe (X and y)"):
+        st.dataframe(X_y_test)
+        streamlit_2columns_metrics_df_shape(X_y_test)
+    streamlit_2columns_metrics_series(
+        "# Defaults(=1) (Testing Data)",
+        "# Non-Defaults(=0) (Testing Data)",
+        true_status,
+    )
+    streamlit_2columns_metrics_pct_series(
+        "% Defaults (Testing Data)",
+        "% Non-Defaults (Testing Data)",
+        true_status,
+    )
+    st.header("Training Data")
+    with st.expander("Training Dataframe (X and y)"):
+        st.dataframe(X_y_train)
+        streamlit_2columns_metrics_df_shape(X_y_train)
+    st.subheader("Class Count")
+    streamlit_2columns_metrics_df(
+        "# Defaults (Training Data Class Balance Check)",
+        "# Non-Defaults (Training Data Class Balance Check)",
+        split_dataset.y_train,
+    )
+    streamlit_2columns_metrics_pct_df(
+        "% Defaults (Training Data Class Balance Check)",
+        "% Non-Defaults (Training Data Class Balance Check)",
+        split_dataset.y_train,
+    )
+    balance_the_classes = st.radio(
+        label="Balance the Classes:", options=("Yes", "No")
+    )
+    if balance_the_classes == "Yes":
+        st.subheader("Balanced Classes (by Undersampling)")
+        (
+            split_dataset.X_train,
+            split_dataset.y_train,
+            _X_y_train,
+            class_balance_default,
+        ) = undersample_training_data(X_y_train, "loan_status", split_dataset)
+        streamlit_2columns_metrics_series(
+            "# Defaults (Training Data with Class Balance)",
+            "# Non-Defaults (Training Data with Class Balance)",
+            class_balance_default,
+        )
+        streamlit_2columns_metrics_pct_series(
+            "% of Defaults (Training Data with Class Balance)",
+            "% of Non-Defaults (Training Data with Class Balance)",
+            class_balance_default,
+        )
+    return dataset, split_dataset

poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,30 @@

+[tool.poetry]
+name = "credit_risk_modelling"
+version = "0.1.0"
+description = ""
+authors = ["Your Name <you@example.com>"]
+[tool.poetry.dependencies]
+python = ">=3.8,<3.11"
+pandas = "^1.4.0"
+numpy = "^1.22.1"
+matplotlib = "^3.5.1"
+seaborn = "^0.11.2"
+notebook = "^6.4.7"
+scikit-learn = "^1.0.2"
+xgboost = "^1.5.2"
+streamlit = "^1.4.0"
+plotly = "^5.5.0"
+graphviz = "^0.19.1"
+[tool.poetry.dev-dependencies]
+pytest = "^6.2.5"
+black = "^21.12b0"
+flake8 = "^4.0.1"
+[tool.black]
+line-length = 79
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"

views/__init__.py ADDED Viewed

File without changes

views/decision_tree.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from common.data import SplitDataset
+import streamlit as st
+from common.util import (
+    test_variables_gbt,
+)
+from common.views import (
+    streamlit_chart_setting_height_width,
+    plot_importance_gbt,
+    plot_tree_gbt,
+    download_importance_gbt,
+    download_tree_gbt,
+)
+from views.typing import ModelView
+from views.threshold import decision_tree_threshold_view
+from views.evaluation import decision_tree_evaluation_view
+def decisiontree_view(split_dataset: SplitDataset, currency: str):
+    st.header("Decision Trees")
+    clf_gbt_model = test_variables_gbt(
+        split_dataset.X_train, split_dataset.y_train
+    )
+    st.subheader("Decision Tree Feature Importance")
+    (barxsize, barysize,) = streamlit_chart_setting_height_width(
+        "Chart Settings", 10, 15, "barxsize", "barysize"
+    )
+    fig1 = plot_importance_gbt(clf_gbt_model, barxsize, barysize)
+    st.pyplot(fig1)
+    download_importance_gbt(fig1, barxsize, barysize)
+    st.subheader("Decision Tree Structure")
+    (treexsize, treeysize,) = streamlit_chart_setting_height_width(
+        "Chart Settings", 15, 10, "treexsize", "treeysize"
+    )
+    fig2 = plot_tree_gbt(treexsize, treeysize, clf_gbt_model)
+    st.pyplot(fig2)
+    download_tree_gbt(treexsize, treeysize)
+    st.markdown(
+        "Note: The downloaded decision tree plot chart in png has higher resolution than that displayed here."
+    )
+    threshold = decision_tree_threshold_view(clf_gbt_model, split_dataset)
+    df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+        decision_tree_evaluation_view(
+            clf_gbt_model,
+            split_dataset,
+            currency,
+            threshold.probability_threshold_selected,
+            threshold.predicted_default_status,
+        )
+    )
+    return ModelView(
+        model=clf_gbt_model,
+        trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+        probability_threshold_selected=threshold.probability_threshold_selected,
+        predicted_default_status=threshold.predicted_default_status,
+        prediction_probability_df=threshold.prediction_probability_df,
+    )

views/evaluation.py ADDED Viewed

	@@ -0,0 +1,410 @@

+from typing import Union
+import pandas as pd
+import streamlit as st
+import numpy as np
+from sklearn.metrics import (
+    classification_report,
+    confusion_matrix,
+)
+from sklearn.linear_model import LogisticRegression
+from xgboost.sklearn import XGBClassifier
+from common.data import SplitDataset
+from common.util import (
+    create_cross_validation_df,
+    cross_validation_scores,
+    get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+)
+from common.views import (
+    cross_validation_graph,
+)
+def make_evaluation_view(
+    model_name_short: str,
+    model_name_generic: str,
+):
+    def view(
+        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
+        split_dataset: SplitDataset,
+        currency: str,
+        prob_thresh_selected,
+        predicted_default_status,
+    ):
+        st.header(f"Model Evaluation - {model_name_generic}")
+        st.subheader("Cross Validation")
+        st.write("Shows how our model will perform as new loans come in.")
+        st.write(
+            "If evaluation metric for test and train set improve as models \
+            train on each fold suggests performance will be stable."
+        )
+        st.write(f"XGBoost cross validation test:")
+        stcol_seed, stcol_eval_metric = st.columns(2)
+        with stcol_seed:
+            cv_seed = int(
+                st.number_input(
+                    label="Random State Seed for Cross Validation:",
+                    value=123235,
+                    key=f"cv_seed_{model_name_short}",
+                )
+            )
+        with stcol_eval_metric:
+            eval_metric = st.selectbox(
+                label="Select evaluation metric",
+                options=[
+                    "auc",
+                    "aucpr",
+                    "rmse",
+                    "mae",
+                    "logloss",
+                    "error",
+                    "merror",
+                    "mlogloss",
+                ],
+                key=f"eval_metric_{model_name_short}",
+            )
+        stcol_trees, stcol_eval_nfold, stcol_earlystoppingrounds = st.columns(
+            3
+        )
+        with stcol_trees:
+            trees = int(
+                st.number_input(
+                    label="Number of trees",
+                    value=5,
+                    key=f"trees_{model_name_short}",
+                )
+            )
+        with stcol_eval_nfold:
+            nfolds = int(
+                st.number_input(
+                    label="Number of folds",
+                    value=5,
+                    key=f"nfolds_{model_name_short}",
+                )
+            )
+        with stcol_earlystoppingrounds:
+            early_stopping_rounds = int(
+                st.number_input(
+                    label="Early stopping rounds",
+                    value=10,
+                    key=f"early_stopping_rounds_{model_name_short}",
+                )
+            )
+        DTrain, cv_df = create_cross_validation_df(
+            split_dataset.X_test,
+            split_dataset.y_test,
+            eval_metric,
+            cv_seed,
+            trees,
+            nfolds,
+            early_stopping_rounds,
+        )
+        st.write(cv_df)
+        scoring_options = [
+            "roc_auc",
+            "accuracy",
+            "precision",
+            "recall",
+            "f1",
+            "jaccard",
+        ]
+        overfit_test = st.radio(
+            label="Overfit test:",
+            options=("No", "Yes"),
+            key=f"overfit_test_{model_name_short}",
+        )
+        if overfit_test == "Yes":
+            st.write("Overfit test:")
+            iterations = int(
+                st.number_input(
+                    label="Number of folds (iterations)",
+                    value=500,
+                    key=f"iterations_{model_name_short}",
+                )
+            )
+            DTrain, cv_df_it = create_cross_validation_df(
+                split_dataset.X_test,
+                split_dataset.y_test,
+                eval_metric,
+                cv_seed,
+                iterations,
+                nfolds,
+                iterations,
+            )
+            fig_it = cross_validation_graph(cv_df_it, eval_metric, iterations)
+            st.pyplot(fig_it)
+        st.write("Sklearn cross validation test:")
+        stcol_scoringmetric, st_nfold = st.columns(2)
+        with stcol_scoringmetric:
+            score_metric = st.selectbox(
+                label="Select score",
+                options=scoring_options,
+                key=f"stcol_scoringmetric_{model_name_short}",
+            )
+        with st_nfold:
+            nfolds_score = int(
+                st.number_input(
+                    label="Number of folds",
+                    value=5,
+                    key=f"st_nfold_{model_name_short}",
+                )
+            )
+        cv_scores = cross_validation_scores(
+            clf_gbt_model,
+            split_dataset.X_test,
+            split_dataset.y_test,
+            nfolds_score,
+            score_metric,
+            cv_seed,
+        )
+        stcol_vals, stcol_mean, st_std = st.columns(3)
+        with stcol_vals:
+            st.markdown(f"{score_metric} scores:")
+            st.write(
+                pd.DataFrame(
+                    cv_scores,
+                    columns=[score_metric],
+                )
+            )
+        with stcol_mean:
+            st.metric(
+                label=f"Average {score_metric} score ",
+                value="{:.4f}".format(cv_scores.mean()),
+                delta=None,
+                delta_color="normal",
+            )
+        with st_std:
+            st.metric(
+                label=f"{score_metric} standard deviation (+/-)",
+                value="{:.4f}".format(cv_scores.std()),
+                delta=None,
+                delta_color="normal",
+            )
+        st.subheader("Classification Report")
+        target_names = ["Non-Default", "Default"]
+        classification_report_dict = classification_report(
+            split_dataset.y_test,
+            predicted_default_status,
+            target_names=target_names,
+            output_dict=True,
+        )
+        (
+            stcol_defaultpres,
+            stcol_defaultrecall,
+            stcol_defaultf1score,
+            stcol_f1score,
+        ) = st.columns(4)
+        with stcol_defaultpres:
+            st.metric(
+                label="Default Precision",
+                value="{:.0%}".format(
+                    classification_report_dict["Default"]["precision"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with stcol_defaultrecall:
+            st.metric(
+                label="Default Recall",
+                value="{:.0%}".format(
+                    classification_report_dict["Default"]["recall"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with stcol_defaultf1score:
+            st.metric(
+                label="Default F1 Score",
+                value="{:.2f}".format(
+                    classification_report_dict["Default"]["f1-score"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with stcol_f1score:
+            st.metric(
+                label="Macro avg F1 Score (Model F1 Score):",
+                value="{:.2f}".format(
+                    classification_report_dict["macro avg"]["f1-score"]
+                ),
+                delta=None,
+                delta_color="normal",
+            )
+        with st.expander("Classification Report Dictionary:"):
+            st.write(classification_report_dict)
+        st.markdown(
+            f'Default precision: {"{:.0%}".format(classification_report_dict["Default"]["precision"])} of loans predicted as default were actually default.'
+        )
+        st.markdown(
+            f'Default recall: {"{:.0%}".format(classification_report_dict["Default"]["recall"])} of true defaults predicted correctly.'
+        )
+        f1_gap = 1 - classification_report_dict["Default"]["f1-score"]
+        st.markdown(
+            f'Default F1 score: {"{:.2f}".format(classification_report_dict["Default"]["f1-score"])}\
+                is {"{:.2f}".format(f1_gap)} away from perfect precision and recall (no false positive rate).'
+        )
+        st.markdown(
+            f'macro avg F1 score: {"{:.2f}".format(classification_report_dict["macro avg"]["f1-score"])} is the models F1 score.'
+        )
+        st.subheader("Confusion Matrix")
+        confuctiomatrix_dict = confusion_matrix(
+            split_dataset.y_test, predicted_default_status
+        )
+        tn, fp, fn, tp = confusion_matrix(
+            split_dataset.y_test, predicted_default_status
+        ).ravel()
+        with st.expander(
+            "Confusion matrix (column name = classification model prediction, row name = true status, values = number of loans"
+        ):
+            st.write(confuctiomatrix_dict)
+        st.markdown(
+            f'{tp} ,\
+            {"{:.0%}".format(tp / len(predicted_default_status))} \
+                true positives (defaults correctly predicted as defaults).'
+        )
+        st.markdown(
+            f'{fp} ,\
+            {"{:.0%}".format(fp / len(predicted_default_status))} \
+                false positives (non-defaults incorrectly predicted as defaults).'
+        )
+        st.markdown(
+            f'{fn} ,\
+            {"{:.0%}".format(fn / len(predicted_default_status))} \
+                false negatives (defaults incorrectly predicted as non-defaults).'
+        )
+        st.markdown(
+            f'{tn} ,\
+            {"{:.0%}".format(tn / len(predicted_default_status))} \
+                true negatives (non-defaults correctly predicted as non-defaults).'
+        )
+        st.subheader("Bad Rate")
+        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+            get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
+                clf_gbt_model,
+                split_dataset.X_test,
+                split_dataset.y_test,
+                prob_thresh_selected,
+                "loan_amnt",
+            )
+        )
+        with st.expander(
+            "Loan Status, Probability of Default, & Loan Amount DataFrame"
+        ):
+            st.write(df_trueStatus_probabilityDefault_threshStatus_loanAmount)
+        accepted_loans = (
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                    "PREDICT_DEFAULT_STATUS"
+                ]
+                == 0
+            ]
+        )
+        bad_rate = (
+            np.sum(accepted_loans["loan_status"])
+            / accepted_loans["loan_status"].count()
+        )
+        with st.expander("Loan Amount Summary Statistics"):
+            st.write(
+                df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                    "loan_amnt"
+                ].describe()
+            )
+        avg_loan = np.mean(
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                "loan_amnt"
+            ]
+        )
+        crosstab_df = pd.crosstab(
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                "loan_status"
+            ],  # row label
+            df_trueStatus_probabilityDefault_threshStatus_loanAmount[
+                "PREDICT_DEFAULT_STATUS"
+            ],
+        ).apply(
+            lambda x: x * avg_loan, axis=0
+        )  # column label
+        with st.expander(
+            "Cross tabulation (column name = classification model prediction, row name = true status, values = number of loans * average loan value"
+        ):
+            st.write(crosstab_df)
+        st.write(
+            f'Bad rate: {"{:.2%}".format(bad_rate)} of all the loans the model accepted (classified as non-default) from the test set were actually defaults.'
+        )
+        st.write(
+            f'Estimated value of the bad rate is {currency} {"{:,.2f}".format(crosstab_df[0][1])}.'
+        )
+        st.write(
+            f'Total estimated value of actual non-default loans is {currency} {"{:,.2f}".format(crosstab_df[0][0]+crosstab_df[0][1])}'
+        )
+        st.write(
+            f'Estimated value of loans incorrectly predicted as default is {currency} {"{:,.2f}".format(crosstab_df[1][0])}'
+        )
+        st.write(
+            f'Estimated value of loans correctly predicted as defaults is {currency} {"{:,.2f}".format(crosstab_df[1][1])}'
+        )
+        return df_trueStatus_probabilityDefault_threshStatus_loanAmount
+    return view
+# Concrete evaluation views, one per model. The short name keeps each view's
+# Streamlit widget keys distinct; the generic name is shown in the header.
+decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
+logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")

views/logistic.py ADDED Viewed

	@@ -0,0 +1,119 @@

+from common.data import SplitDataset
+import streamlit as st
+import pandas as pd
+import plotly.express as px
+from views.threshold import logistic_threshold_view
+from views.evaluation import logistic_evaluation_view
+from common.util import (
+    test_variables_logistic,
+    print_coeff_logistic,
+    model_probability_values_df,
+    apply_threshold_to_probability_values,
+)
+from common.views import (
+    streamlit_2columns_metrics_df,
+    streamlit_2columns_metrics_pct_df,
+)
+from views.typing import ModelView
+def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
+    # ### Test and create variables logically
+    st.header("Logistic Regression")
+    clf_logistic_model = test_variables_logistic(
+        split_dataset.X_train, split_dataset.y_train
+    )
+    st.metric(
+        label="# of Coefficients in Logistic Regression",
+        value=clf_logistic_model.n_features_in_,
+        delta=None,
+        delta_color="normal",
+    )
+    coef_dict = print_coeff_logistic(clf_logistic_model, split_dataset)
+    st.subheader("Logistic Regression Coefficient Values")
+    coef_dict_sorted = dict(
+        sorted(coef_dict.items(), key=lambda item: item[1], reverse=False)
+    )
+    data_items = coef_dict_sorted.items()
+    data_list = list(data_items)
+    df = pd.DataFrame(data_list, columns=["Coefficient", "Value"])
+    fig1 = px.bar(data_frame=df, x="Value", y="Coefficient", orientation="h")
+    fig1.update_layout(
+        title="Logistic Regression Coefficients",
+        xaxis_title="Value",
+        yaxis_title="Coefficient",
+    )
+    st.plotly_chart(fig1)
+    st.subheader("Classification Probability Threshold")
+    st.write(
+        """
+        The logistic regression model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
+        Probabilities of defaulting of the loans are compared to a probability threshold.\n
+        A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
+        """
+    )
+    threshold = st.slider(
+        label="Default Probability Threshold:",
+        min_value=0.0,
+        max_value=1.0,
+        value=0.7,
+        key="key_threshold",
+    )
+    clf_prediction_prob_df_log = model_probability_values_df(
+        clf_logistic_model,
+        split_dataset.X_test,
+    )
+    clf_thresh_predicted_default_status_user = (
+        apply_threshold_to_probability_values(
+            clf_prediction_prob_df_log,
+            threshold,
+        )
+    )
+    streamlit_2columns_metrics_df(
+        "# of Predicted Defaults",
+        "# of Predicted Non-Default",
+        clf_thresh_predicted_default_status_user,
+    )
+    streamlit_2columns_metrics_pct_df(
+        "% of Loans Predicted to Default",
+        "% of Loans Predicted not to Default",
+        clf_thresh_predicted_default_status_user,
+    )
+    threshold = logistic_threshold_view(clf_logistic_model, split_dataset)
+    df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+        logistic_evaluation_view(
+            clf_logistic_model,
+            split_dataset,
+            currency,
+            threshold.probability_threshold_selected,
+            threshold.predicted_default_status,
+        )
+    )
+    return ModelView(
+        model=clf_logistic_model,
+        trueStatus_probabilityDefault_threshStatus_loanAmount_df=df_trueStatus_probabilityDefault_threshStatus_loanAmount,
+        probability_threshold_selected=threshold.probability_threshold_selected,
+        predicted_default_status=threshold.predicted_default_status,
+        prediction_probability_df=threshold.prediction_probability_df,
+    )

views/model_comparison.py ADDED Viewed

	@@ -0,0 +1,81 @@

+from typing import OrderedDict
+import streamlit as st
+from sklearn.metrics import roc_auc_score
+from common.data import SplitDataset
+from common.views import (
+    roc_auc_compare_n_models,
+    streamlit_chart_setting_height_width,
+    calibration_curve_report_commented_n,
+)
+from views.typing import ModelView
+def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
+    roc_auc_model = roc_auc_score(
+        split_dataset.y_test, model_view.predicted_default_status
+    )
+    if roc_auc_model > 0.9:
+        roc_auc_lvl = f'Very good {"{:.2f}".format(roc_auc_model)} > 0.9)'
+    elif 0.8 < roc_auc_model < 0.9:
+        roc_auc_lvl = f'Good (0.8 < {"{:.2f}".format(roc_auc_model)} <0.9)'
+    elif 0.7 < roc_auc_model < 0.8:
+        roc_auc_lvl = f'Fair (0.7 <  {"{:.2f}".format(roc_auc_model)} < 0.8)'
+    elif 0.6 < roc_auc_model < 0.7:
+        roc_auc_lvl = f'Poor (0.6 <  {"{:.2f}".format(roc_auc_model)} < 0.7)'
+    else:
+        roc_auc_lvl = f'Fail ( {"{:.2f}".format(roc_auc_model)} < 0.6)'
+    return roc_auc_model, roc_auc_lvl
+def model_comparison_view(
+    split_dataset: SplitDataset,
+    model_views: OrderedDict[str, ModelView],
+):
+    st.header("Model Comparison")
+    for model_name, model_view in model_views.items():
+        roc_auc_model, roc_auc_lvl = roc_auc_for_model(
+            split_dataset, model_view
+        )
+        st.subheader(
+            f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
+        )
+        st.markdown(
+            f'Area Under the Receiver Operating Characteristic Curve from prediction scores from "{model_name}" model is {roc_auc_model}.\n'
+        )
+        st.markdown(
+            f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
+        )
+    fig1 = roc_auc_compare_n_models(
+        split_dataset.y_test,
+        model_views,
+    )
+    fig1 = fig1.figure
+    (xsize_roc, ysize_roc) = streamlit_chart_setting_height_width(
+        "Chart Settings", 7, 7, "xsize_roc", "ysize_roc"
+    )
+    fig1.set_size_inches(xsize_roc, ysize_roc)
+    st.pyplot(fig1)
+    st.subheader("Models Calibration Curve")
+    fig2 = calibration_curve_report_commented_n(
+        split_dataset.y_test,
+        model_views,
+        10,
+    )
+    fig2 = fig2.figure
+    (xsize_cal, ysize_cal) = streamlit_chart_setting_height_width(
+        "Chart Settings", 7, 7, "xsize_cal", "ysize_cal"
+    )
+    fig2.set_size_inches(xsize_cal, ysize_cal)
+    st.pyplot(fig2.figure)

views/strategy_table.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from typing import OrderedDict
+import plotly.express as px
+import numpy as np
+import streamlit as st
+from common.util import create_strategyTable_df
+from views.typing import ModelView
+def strategy_table_view(
+    currency: str, model_views: OrderedDict[str, ModelView]
+):
+    st.header("Strategy Table")
+    for (model_name, model_view) in model_views.items():
+        st.subheader(model_name)
+        strat_df = create_strategyTable_df(
+            0.05,
+            1,
+            20,
+            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df,
+            "loan_status",
+            currency,
+        )
+        columns = strat_df.columns
+        with st.expander("Strategy Table:"):
+            st.write(strat_df)
+        for i in columns:
+            strat_df[i] = strat_df[i].astype(np.float64)
+        strat_df_boxPlot_data = strat_df.iloc[:, 0:3]
+        plot = px.box(data_frame=strat_df_boxPlot_data)
+        st.plotly_chart(plot)
+        # Plot the strategy curve
+        fig1 = px.line(
+            strat_df_boxPlot_data,
+            x="Acceptance Rate",
+            y="Bad Rate",
+            title="Acceptance and Bad Rates",
+        )
+        st.plotly_chart(fig1)
+        fig2 = px.line(
+            strat_df,
+            x="Acceptance Rate",
+            y=f"Estimated Value ({currency})",
+            title=f"Estimated Value ({currency}) by Acceptance Rate",
+        )
+        st.plotly_chart(fig2)
+        st.write("Row with the greatest estimated value:")
+        max_estimated_value = np.max(
+            strat_df[f"Estimated Value ({currency})"].astype(np.float64)
+        )
+        columns = strat_df.columns
+        max_estimated_value = np.max(strat_df[f"Estimated Value ({currency})"])
+        st.write(
+            strat_df.loc[
+                strat_df[f"Estimated Value ({currency})"]
+                == max_estimated_value
+            ]
+        )
+        loss_given_default = 1
+        df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
+            model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df[
+                "PROB_DEFAULT"
+            ]
+            * loss_given_default
+            * model_view.trueStatus_probabilityDefault_threshStatus_loanAmount_df[
+                "loan_amnt"
+            ]
+        )
+        tot_exp_loss = round(
+            np.sum(df_trueStatus_probabilityDefault_threshStatus_loanAmount),
+            2,
+        )
+        st.metric(
+            label=f"Total expected loss:",
+            value=f"{currency} {tot_exp_loss:,.2f}",
+            delta=None,
+            delta_color="normal",
+        )

views/threshold.py ADDED Viewed

	@@ -0,0 +1,272 @@

+from dataclasses import dataclass
+from typing import Union, cast
+import numpy as np
+import streamlit as st
+import plotly.express as px
+import pandas as pd
+from xgboost.sklearn import XGBClassifier
+from sklearn.linear_model import LogisticRegression
+from common.data import SplitDataset
+from common.util import (
+    model_probability_values_df,
+    apply_threshold_to_probability_values,
+    find_best_threshold_J_statistic,
+    default_status_per_threshold,
+    classification_report_per_threshold,
+    thresh_classification_report_recall_accuracy,
+)
+from common.views import (
+    streamlit_2columns_metrics_df,
+    streamlit_2columns_metrics_pct_df,
+)
+@dataclass(frozen=True)
+class Threshold:
+    """Immutable result returned by a threshold view."""
+    # Probability threshold handed on to the evaluation views.
+    probability_threshold_selected: float
+    # Predicted default labels; presumably produced by applying the selected
+    # threshold to the predicted probabilities - TODO confirm.
+    predicted_default_status: pd.Series
+    # Predicted probabilities frame, stored on the resulting ModelView.
+    prediction_probability_df: pd.DataFrame
+def make_threshold_view(
+    model_name_short: str,
+    model_name: str,
+):
+    def view(
+        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
+        split_dataset: SplitDataset,
+    ) -> Threshold:
+        st.subheader("Classification Probability Threshold - User Defined")
+        st.write(
+            f"""
+            The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
+            Probabilities of defaulting of the loans are compared to a probability threshold.\n
+            A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
+            """
+        )
+        threshold_gbt_default = st.slider(
+            label="Default Probability Threshold:",
+            min_value=0.0,
+            max_value=1.0,
+            value=0.8,
+            key=f"threshold_{model_name_short}_default",
+        )
+        clf_prediction_prob_df_gbt = model_probability_values_df(
+            clf_gbt_model,
+            split_dataset.X_test,
+        )
+        clf_thresh_predicted_default_status_user_gbt = (
+            apply_threshold_to_probability_values(
+                clf_prediction_prob_df_gbt,
+                threshold_gbt_default,
+            )
+        )
+        streamlit_2columns_metrics_df(
+            "# of Predicted Defaults",
+            "# of Predicted Non-Default",
+            clf_thresh_predicted_default_status_user_gbt,
+        )
+        streamlit_2columns_metrics_pct_df(
+            "% of Loans Predicted to Default",
+            "% of Loans Predicted not to Default",
+            clf_thresh_predicted_default_status_user_gbt,
+        )
+        st.subheader("J Statistic Driven Classification Probability Threshold")
+        J_statistic_best_threshold = find_best_threshold_J_statistic(
+            split_dataset.y_test, clf_prediction_prob_df_gbt
+        )
+        st.metric(
+            label="Youden's J statistic calculated best threshold",
+            value=J_statistic_best_threshold,
+        )
+        clf_thresh_predicted_default_status_Jstatistic_gbt = (
+            apply_threshold_to_probability_values(
+                clf_prediction_prob_df_gbt,
+                J_statistic_best_threshold,
+            )
+        )
+        streamlit_2columns_metrics_df(
+            "# of Predicted Defaults",
+            "# of Predicted Non-Default",
+            clf_thresh_predicted_default_status_Jstatistic_gbt,
+        )
+        streamlit_2columns_metrics_pct_df(
+            "% of Loans Predicted to Default",
+            "% of Loans Predicted not to Default",
+            clf_thresh_predicted_default_status_Jstatistic_gbt,
+        )
+        st.subheader(
+            "Recall and Accuracy Tradeoff with given Probability Threshold"
+        )
+        # Steps
+        # Get list of thresholds
+        # Get default status per threshold
+        # Get classification report per threshold
+        # Get recall, nondef recall, and accuracy per threshold
+        threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()
+        threshold_default_status_list = default_status_per_threshold(
+            threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
+        )
+        thresh_classification_report_dict = (
+            classification_report_per_threshold(
+                threshold_list,
+                threshold_default_status_list,
+                split_dataset.y_test,
+            )
+        )
+        (
+            thresh_def_recalls_list,
+            thresh_nondef_recalls_list,
+            thresh_accs_list,
+        ) = thresh_classification_report_recall_accuracy(
+            thresh_classification_report_dict
+        )
+        namelist = [
+            "Default Recall",
+            "Non Default Recall",
+            "Accuracy",
+            "Threshold",
+        ]
+        df = pd.DataFrame(
+            [
+                thresh_def_recalls_list,
+                thresh_nondef_recalls_list,
+                thresh_accs_list,
+                threshold_list,
+            ],
+            index=namelist,
+        )
+        df = df.T
+        fig2 = px.line(
+            data_frame=df,
+            y=["Default Recall", "Non Default Recall", "Accuracy"],
+            x="Threshold",
+        )
+        fig2.update_layout(
+            title="Recall and Accuracy score Trade-off with Probability Threshold",
+            xaxis_title="Probability Threshold",
+            yaxis_title="Score",
+        )
+        fig2.update_yaxes(range=[0.0, 1.0])
+        st.plotly_chart(fig2)
+        st.subheader("Acceptance Rate Driven Probability Threshold")
+        # Steps
+        # Set acceptance rate
+        # Get default status per threshold
+        # Get classification report per threshold
+        # Get recall, nondef recall, and accuracy per threshold
+        acceptance_rate = (
+            st.slider(
+                label="% of loans accepted (acceptance rate):",
+                min_value=0,
+                max_value=100,
+                value=85,
+                key=f"acceptance_rate_{model_name_short}",
+                format="%f%%",
+            )
+            / 100
+        )
+        acc_rate_thresh_gbt = np.quantile(
+            clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
+        )
+        st.write(
+            f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
+        )
+        figa = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])
+        figa.update_layout(
+            title="Acceptance Rate Threshold vs. Loans Accepted",
+            xaxis_title="Acceptance Rate Threshold",
+            yaxis_title="Loans Accepted",
+        )
+        figa.update_traces(marker_line_width=1, marker_line_color="white")
+        figa.add_vline(
+            x=acc_rate_thresh_gbt,
+            line_width=3,
+            line_dash="solid",
+            line_color="red",
+        )
+        st.plotly_chart(figa)
+        clf_thresh_predicted_default_status_acceptance_gbt = (
+            apply_threshold_to_probability_values(
+                clf_prediction_prob_df_gbt,
+                acc_rate_thresh_gbt,
+            )
+        )
+        st.write()
+        st.subheader("Selected Probability Threshold")
+        options = [
+            "User Defined",
+            "J Statistic Driven",
+            "Acceptance Rate Driven",
+        ]
+        prob_thresh_option = st.radio(
+            label="Selected Probability Threshold",
+            options=options,
+            key=f"{model_name_short}_radio_thresh",
+        )
+        if prob_thresh_option == "User Defined":
+            prob_thresh_selected_gbt = threshold_gbt_default
+            predicted_default_status_gbt = (
+                clf_thresh_predicted_default_status_user_gbt
+            )
+        elif prob_thresh_option == "J Statistic Driven":
+            prob_thresh_selected_gbt = J_statistic_best_threshold
+            predicted_default_status_gbt = (
+                clf_thresh_predicted_default_status_Jstatistic_gbt
+            )
+        else:
+            prob_thresh_selected_gbt = acc_rate_thresh_gbt
+            predicted_default_status_gbt = (
+                clf_thresh_predicted_default_status_acceptance_gbt
+            )
+        st.write(
+            f"Selected probability threshold is {prob_thresh_selected_gbt}"
+        )
+        return Threshold(
+            probability_threshold_selected=cast(
+                float, prob_thresh_selected_gbt
+            ),
+            predicted_default_status=predicted_default_status_gbt,
+            prediction_probability_df=clf_prediction_prob_df_gbt,
+        )
+    return view
+decision_tree_threshold_view = make_threshold_view("gbt", "decision tree")  # module-level view built by the factory; "gbt" is presumably the short model name used in widget keys — confirm against make_threshold_view
+logistic_threshold_view = make_threshold_view("lg", "logistic")  # same factory with different arguments, presumably for the logistic regression model — confirm

views/typing.py ADDED Viewed

	@@ -0,0 +1,15 @@

+from dataclasses import dataclass
+from typing import Union
+import pandas as pd
+from xgboost.sklearn import XGBClassifier
+from sklearn.linear_model import LogisticRegression
+@dataclass(frozen=True)  # frozen: fields cannot be reassigned after construction
+class ModelView:  # groups a model object with its threshold-related results (roles below are partly inferred from names)
+    model: Union[XGBClassifier, LogisticRegression]  # either an XGBoost classifier or a scikit-learn logistic regression
+    probability_threshold_selected: float  # presumably the default-probability cut-off chosen in the threshold view — confirm
+    predicted_default_status: pd.Series  # presumably the predicted default status obtained at that threshold — confirm
+    trueStatus_probabilityDefault_threshStatus_loanAmount_df: pd.DataFrame  # NOTE(review): name suggests true status, default probability, thresholded status and loan amount — confirm columns
+    prediction_probability_df: pd.DataFrame  # presumably the model's predicted probabilities (the threshold view reads a "PROB_DEFAULT" column from a frame of this name) — confirm