Spaces:
Build error
Build error
Paul Kiage
committed on
Commit
•
7d861ad
1
Parent(s):
9af3e2b
Hugging Face Deployment Setup (#11)
Browse files* refactor for hugging face space deployment
* docs: HF space branch
- .github/workflows/check_file_size.yml +16 -0
- .github/workflows/sync_to_hf_hub.yml +20 -0
- Dockerfile +20 -0
- Procfile +0 -1
- README.md +22 -58
- src/app.py → app.py +27 -29
- {src → common}/__init__.py +0 -0
- src/features/util_build_features.py → common/data.py +2 -93
- common/util.py +391 -0
- common/views.py +361 -0
- src/features/build_features.py → data_setup.py +42 -15
- requirements.txt +0 -0
- setup.py +0 -10
- setup.sh +0 -13
- src/__main__.py +0 -0
- src/models/__init__.py +0 -0
- src/models/logistic_model.py +0 -33
- src/models/logistic_predict_model.py +0 -4
- src/models/logistic_test_model.py +0 -4
- src/models/logistic_train_model.py +0 -69
- src/models/util_predict_model.py +0 -87
- src/models/util_predict_model_threshold.py +0 -310
- src/models/xgboost_model.py +0 -33
- src/models/xgboost_predict_model.py +0 -4
- src/models/xgboost_test_model.py +0 -4
- src/models/xgboost_train_model.py +0 -68
- src/visualization/__init__.py +0 -0
- src/visualization/graphs_decision_tree.py +0 -23
- src/visualization/graphs_download.py +0 -17
- src/visualization/graphs_logistic.py +0 -12
- src/visualization/graphs_settings.py +0 -28
- src/visualization/graphs_test.py +0 -78
- src/visualization/graphs_threshold.py +0 -80
- src/visualization/metrics.py +0 -132
- {src/features → views}/__init__.py +0 -0
- views/decision_tree.py +70 -0
- src/models/util_test.py → views/evaluation.py +11 -169
- views/logistic.py +119 -0
- src/models/util_model_comparison.py → views/model_comparison.py +9 -14
- src/models/util_strategy_table.py → views/strategy_table.py +4 -4
- views/threshold.py +272 -0
- src/models/util_model_class.py → views/typing.py +1 -1
.github/workflows/check_file_size.yml
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Check file size
|
2 |
+
on: # or directly `on: [push]` to run the action on every push on any branch
|
3 |
+
pull_request:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- name: Check large files
|
14 |
+
uses: ActionsDesk/lfs-warning@v2.0
|
15 |
+
with:
|
16 |
+
filesizelimit: 10485760 # this is 10MB so we can sync to HF Spaces
|
.github/workflows/sync_to_hf_hub.yml
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: Sync to Hugging Face hub
|
2 |
+
on:
|
3 |
+
push:
|
4 |
+
branches: [main]
|
5 |
+
|
6 |
+
# to run this workflow manually from the Actions tab
|
7 |
+
workflow_dispatch:
|
8 |
+
|
9 |
+
jobs:
|
10 |
+
sync-to-hub:
|
11 |
+
runs-on: ubuntu-latest
|
12 |
+
steps:
|
13 |
+
- uses: actions/checkout@v3
|
14 |
+
with:
|
15 |
+
fetch-depth: 0
|
16 |
+
lfs: true
|
17 |
+
- name: Push to hub
|
18 |
+
env:
|
19 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
20 |
+
run: git push https://pkiage:$HF_TOKEN@huggingface.co/spaces/pkiage/credit_risk_modeling_demo main
|
Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
2 |
+
# you will also find guides on how best to write your Dockerfile
|
3 |
+
|
4 |
+
FROM python:3.9
|
5 |
+
|
6 |
+
RUN apt update
|
7 |
+
|
8 |
+
RUN apt install -y graphviz
|
9 |
+
|
10 |
+
WORKDIR /code
|
11 |
+
|
12 |
+
COPY ./requirements.txt /code/requirements.txt
|
13 |
+
|
14 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
15 |
+
|
16 |
+
COPY . .
|
17 |
+
|
18 |
+
CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0"]
|
19 |
+
|
20 |
+
# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
Procfile
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
web: sh setup.sh && streamlit run src/app.py
|
|
|
|
README.md
CHANGED
@@ -1,3 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# Credit Risk Modelling
|
2 |
|
3 |
# About
|
@@ -72,68 +83,29 @@ pip install -r requirements.txt
|
|
72 |
|
73 |
https://graphviz.org/download/
|
74 |
|
75 |
-
## Build and install local package
|
76 |
-
|
77 |
-
```shell
|
78 |
-
python setup.py build
|
79 |
-
```
|
80 |
-
|
81 |
-
```shell
|
82 |
-
python setup.py install
|
83 |
-
```
|
84 |
|
85 |
### Run the streamlit app (app.py) by running the following in terminal (from repository root folder):
|
86 |
|
87 |
```shell
|
88 |
-
streamlit
|
89 |
```
|
90 |
|
91 |
## Deployed setup details
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
⚠️⚠️⚠️
|
96 |
-
|
97 |
-
***UPDATE: In [Heroku’s Next Chapter](https://blog.heroku.com/next-chapter) free dynos will be removed starting [November 28, 2022](https://help.heroku.com/RSBRUH58/removal-of-heroku-free-product-plans-faq)***
|
98 |
-
|
99 |
-
*[Hosting Streamlit app would require](https://discuss.streamlit.io/t/can-i-host-streamlit-on-now-sh-vercel/3189) a Platform as a service (PaaS) since [Streamlit apps aren't static thus can't run on static web host](https://discuss.streamlit.io/t/hosting-streamlit-on-github-pages/356/2).*
|
100 |
-
|
101 |
-
*Viable alternatives include paid services such as AWS, Azure, GCP, DigitalOcean, Heroku, [Replit](https://replit.com/heroku) paid version (due to Repl Resources used) etc.*
|
102 |
-
|
103 |
-
*Platforms such as GitHub Pages, Netlify, & Vercel currently mostly require the app to [output a static website](https://answers.netlify.com/t/how-to-run-streamlit-hello-on-netlify/11899/2) since most of those services will not run Python ([or any server process](https://answers.netlify.com/t/support-guide-can-i-run-a-web-server-http-listener-and-or-database-at-netlify/3078)) at browse time. Netlify, for instance, is designed for the [Jamstack](https://jamstack.org/) that doesn't depend on a "web server". Vercel on the other hand requires either a [`handler` that inherits from the `BaseHTTPRequestHandler` class or an app that exposes a WSGI or ASGI Application](https://vercel.com/docs/runtimes#advanced-usage/advanced-python-usage) - [Tornado](https://www.tornadoweb.org/en/stable/index.html?highlight=wsgi#threads-and-wsgi) a [dependency of Streamlit](https://openbase.com/python/streamlit/dependencies) is [currently not compatible with WSGI](https://www.reddit.com/r/learnpython/comments/grmjfo/comment/fs4elmx/).*
|
104 |
-
|
105 |
-
Currently hosted on [Streamlit Community Cloud](https://blog.streamlit.io/host-your-streamlit-app-for-free/)
|
106 |
-
|
107 |
-
⚠️⚠️⚠️
|
108 |
-
|
109 |
-
[Free Heroku dyno type](https://devcenter.heroku.com/articles/dyno-types) was used to deploy the app
|
110 |
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
Compute: 1x-4x
|
116 |
-
|
117 |
-
Dedicated: no
|
118 |
-
|
119 |
-
Sleeps: yes
|
120 |
-
|
121 |
-
[Enabled Autodeploy from Github](https://devcenter.heroku.com/articles/github-integration) if want to [manually deploy to Heroku](https://devcenter.heroku.com/articles/git#deploy-your-code) the steps are as follows:
|
122 |
-
|
123 |
-
From main branch:
|
124 |
-
```shell
|
125 |
-
heroku login
|
126 |
-
|
127 |
-
git push heroku main
|
128 |
-
```
|
129 |
-
|
130 |
-
From branch beside main:
|
131 |
|
132 |
```shell
|
133 |
-
|
134 |
|
135 |
-
git push
|
136 |
```
|
|
|
|
|
137 |
|
138 |
# Roadmap
|
139 |
|
@@ -222,12 +194,4 @@ code2flow src/models/util_model_comparison.py -o docs/call-graph/util_model_comp
|
|
222 |
|
223 |
[A Gentle Introduction to Threshold-Moving for Imbalanced Classification](https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/)
|
224 |
|
225 |
-
- Selecting optimal threshold using Youden's J statistic
|
226 |
-
|
227 |
-
[Cookiecutter Data Science](https://drivendata.github.io/cookiecutter-data-science/)
|
228 |
-
|
229 |
-
- Project structure
|
230 |
-
|
231 |
-
[GraphViz Buildpack](https://github.com/weibeld/heroku-buildpack-graphviz)
|
232 |
-
|
233 |
-
- Buildpack used for Heroku deployment
|
|
|
1 |
+
---
|
2 |
+
title: Credit Risk Modeling
|
3 |
+
emoji: π
|
4 |
+
colorFrom: indigo
|
5 |
+
colorTo: blue
|
6 |
+
sdk: docker
|
7 |
+
app_port: 8501
|
8 |
+
pinned: false
|
9 |
+
license: openrail
|
10 |
+
---
|
11 |
+
|
12 |
# Credit Risk Modelling
|
13 |
|
14 |
# About
|
|
|
83 |
|
84 |
https://graphviz.org/download/
|
85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
### Run the streamlit app (app.py) by running the following in terminal (from repository root folder):
|
88 |
|
89 |
```shell
|
90 |
+
streamlit run app.py
|
91 |
```
|
92 |
|
93 |
## Deployed setup details
|
94 |
|
95 |
+
**Hugging Face Space Deployment Tips**
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
+
Initial Setup
|
98 |
+
- [When creating the Spaces Configuration Reference](https://huggingface.co/docs/hub/spaces-config-reference) check logs to specify the [Docker Space](https://huggingface.co/docs/hub/spaces-sdks-docker) app_port based on build
|
99 |
+
- In the Dockerfile, bind Streamlit to a network address, e.g. `--server.address 0.0.0.0`
|
100 |
+
- [Install Graphviz on Debian](https://installati.one/debian/11/graphviz/) in a Docker Space, rather than using a Streamlit Space, to solve the ```failed to execute posixpath('dot'), make sure the graphviz executables are on your systems' path``` error, given that a Streamlit Space doesn't provide access to a terminal
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
```shell
|
103 |
+
git remote add space https://huggingface.co/spaces/pkiage/credit_risk_modeling_demo
|
104 |
|
105 |
+
git push --force space main
|
106 |
```
|
107 |
+
- [When syncing with Hugging Face via Github Actions](https://huggingface.co/docs/hub/spaces-github-actions) the [User Access Token](https://huggingface.co/docs/hub/security-tokens) created on Hugging Face (HF) should have write access
|
108 |
+
- Run the Space from the main branch, since running from [other branches currently isn't supported](https://discuss.huggingface.co/t/is-it-possible-to-run-apps-off-of-non-main-branches-in-a-space/18086)
|
109 |
|
110 |
# Roadmap
|
111 |
|
|
|
194 |
|
195 |
[A Gentle Introduction to Threshold-Moving for Imbalanced Classification](https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/)
|
196 |
|
197 |
+
- Selecting optimal threshold using Youden's J statistic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/app.py → app.py
RENAMED
@@ -1,24 +1,17 @@
|
|
1 |
-
import streamlit as st
|
2 |
from typing import OrderedDict
|
3 |
-
|
4 |
-
|
5 |
-
from
|
6 |
-
|
7 |
-
from
|
8 |
-
from
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
from models.util_strategy_table import strategy_table_view
|
13 |
|
14 |
|
15 |
def main():
|
16 |
-
|
17 |
-
st.write("Source code: https://github.com/pkiage/tool-credit-risk-modelling")
|
18 |
currency_options = ["USD", "KES", "GBP"]
|
19 |
|
20 |
-
model_options = ["XGBoost", "Logistic"]
|
21 |
-
|
22 |
currency = st.sidebar.selectbox(
|
23 |
label="What currency will you be using?", options=currency_options
|
24 |
)
|
@@ -31,25 +24,30 @@ def main():
|
|
31 |
|
32 |
st.title("Modelling")
|
33 |
|
|
|
|
|
|
|
34 |
models_selected_list = st.sidebar.multiselect(
|
35 |
label="Select model", options=model_options, default=model_options
|
36 |
)
|
37 |
|
38 |
models_selected_set = set(models_selected_list)
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
|
54 |
|
55 |
if __name__ == "__main__":
|
|
|
|
|
1 |
from typing import OrderedDict
|
2 |
+
import streamlit as st
|
3 |
+
from data_setup import initialise_data
|
4 |
+
from views.decision_tree import decisiontree_view
|
5 |
+
from views.logistic import logistic_view
|
6 |
+
from views.model_comparison import model_comparison_view
|
7 |
+
from views.strategy_table import strategy_table_view
|
8 |
+
import os
|
9 |
+
os.environ["PATH"] += os.pathsep + 'C:\Program Files (x86)\Graphviz0.19.1/bin/'
|
|
|
|
|
10 |
|
11 |
|
12 |
def main():
|
|
|
|
|
13 |
currency_options = ["USD", "KES", "GBP"]
|
14 |
|
|
|
|
|
15 |
currency = st.sidebar.selectbox(
|
16 |
label="What currency will you be using?", options=currency_options
|
17 |
)
|
|
|
24 |
|
25 |
st.title("Modelling")
|
26 |
|
27 |
+
model_options = ["Logistic Regression", "Decision Trees"]
|
28 |
+
|
29 |
+
# Returns list
|
30 |
models_selected_list = st.sidebar.multiselect(
|
31 |
label="Select model", options=model_options, default=model_options
|
32 |
)
|
33 |
|
34 |
models_selected_set = set(models_selected_list)
|
35 |
+
model_views = OrderedDict()
|
36 |
+
|
37 |
+
if "Logistic Regression" in models_selected_set:
|
38 |
+
logistic_model_view = logistic_view(split_dataset, currency)
|
39 |
+
model_views["Logistic Regression"] = logistic_model_view
|
40 |
+
|
41 |
+
if "Decision Trees" in models_selected_set:
|
42 |
+
decision_tree_model_view = decisiontree_view(split_dataset, currency)
|
43 |
+
model_views["Decision Trees"] = decision_tree_model_view
|
44 |
+
|
45 |
+
if models_selected_list:
|
46 |
+
model_comparison_view(
|
47 |
+
split_dataset,
|
48 |
+
model_views,
|
49 |
+
)
|
50 |
+
strategy_table_view(currency, model_views)
|
51 |
|
52 |
|
53 |
if __name__ == "__main__":
|
{src → common}/__init__.py
RENAMED
File without changes
|
src/features/util_build_features.py → common/data.py
RENAMED
@@ -1,13 +1,10 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
|
3 |
from typing import List, Union, cast
|
4 |
-
|
5 |
from dataclasses import dataclass
|
6 |
-
|
7 |
from sklearn.model_selection import train_test_split
|
8 |
-
|
9 |
import pandas as pd
|
10 |
|
|
|
|
|
11 |
|
12 |
@dataclass
|
13 |
class SplitDataset:
|
@@ -95,91 +92,3 @@ class Dataset:
|
|
95 |
y_train=cast(pd.Series, y_train),
|
96 |
y_test=cast(pd.Series, y_test),
|
97 |
)
|
98 |
-
|
99 |
-
|
100 |
-
def drop_columns(df, columns):
|
101 |
-
return df.drop(columns, axis=1)
|
102 |
-
|
103 |
-
|
104 |
-
def remove_less_than_0_columns(df, column):
|
105 |
-
df[column].dropna()
|
106 |
-
return df.loc[(df[column] != 0).any(1)]
|
107 |
-
|
108 |
-
|
109 |
-
def boolean_int_condition_label(df, label_column_name, condition):
|
110 |
-
df[label_column_name] = condition
|
111 |
-
y = df[label_column_name].astype(int)
|
112 |
-
df = drop_columns(df, label_column_name)
|
113 |
-
return y, df
|
114 |
-
|
115 |
-
|
116 |
-
@st.cache(suppress_st_warning=True)
|
117 |
-
def undersample_training_data(
|
118 |
-
df: pd.DataFrame, column_name: str, split_dataset
|
119 |
-
):
|
120 |
-
count_nondefault, count_default = split_dataset.X_y_train[
|
121 |
-
column_name
|
122 |
-
].value_counts()
|
123 |
-
|
124 |
-
nondefaults = df[df[column_name] == 0] # 0
|
125 |
-
|
126 |
-
defaults = df[df[column_name] == 1]
|
127 |
-
|
128 |
-
under_sample = min(count_nondefault, count_default)
|
129 |
-
|
130 |
-
nondefaults_under = nondefaults.sample(under_sample)
|
131 |
-
|
132 |
-
defaults_under = defaults.sample(under_sample)
|
133 |
-
|
134 |
-
X_y_train_under = pd.concat(
|
135 |
-
[
|
136 |
-
nondefaults_under.reset_index(drop=True),
|
137 |
-
defaults_under.reset_index(drop=True),
|
138 |
-
],
|
139 |
-
axis=0,
|
140 |
-
)
|
141 |
-
|
142 |
-
X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label
|
143 |
-
|
144 |
-
y_train_under = X_y_train_under[column_name] # label only
|
145 |
-
|
146 |
-
class_balance_default = X_y_train_under[column_name].value_counts()
|
147 |
-
|
148 |
-
return [
|
149 |
-
X_train_under,
|
150 |
-
y_train_under,
|
151 |
-
X_y_train_under,
|
152 |
-
class_balance_default,
|
153 |
-
]
|
154 |
-
|
155 |
-
|
156 |
-
def select_predictors(dataset):
|
157 |
-
st.header("Predictors")
|
158 |
-
|
159 |
-
possible_columns = dataset.x_values_column_names
|
160 |
-
|
161 |
-
selected_columns = st.sidebar.multiselect(
|
162 |
-
label="Select Predictors",
|
163 |
-
options=possible_columns,
|
164 |
-
default=possible_columns,
|
165 |
-
)
|
166 |
-
return dataset.x_values_filtered_columns(selected_columns)
|
167 |
-
|
168 |
-
|
169 |
-
def import_data():
|
170 |
-
if "input_data_frame" not in st.session_state:
|
171 |
-
st.session_state.input_data_frame = pd.read_csv(
|
172 |
-
r"./data/processed/cr_loan_w2.csv"
|
173 |
-
)
|
174 |
-
if "dataset" not in st.session_state:
|
175 |
-
df = cast(pd.DataFrame, st.session_state.input_data_frame)
|
176 |
-
dataset = Dataset(
|
177 |
-
df=df,
|
178 |
-
random_state=123235,
|
179 |
-
test_size=40,
|
180 |
-
)
|
181 |
-
st.session_state.dataset = dataset
|
182 |
-
else:
|
183 |
-
dataset = st.session_state.dataset
|
184 |
-
|
185 |
-
return dataset
|
|
|
|
|
|
|
1 |
from typing import List, Union, cast
|
|
|
2 |
from dataclasses import dataclass
|
|
|
3 |
from sklearn.model_selection import train_test_split
|
|
|
4 |
import pandas as pd
|
5 |
|
6 |
+
from common.util import drop_columns
|
7 |
+
|
8 |
|
9 |
@dataclass
|
10 |
class SplitDataset:
|
|
|
92 |
y_train=cast(pd.Series, y_train),
|
93 |
y_test=cast(pd.Series, y_test),
|
94 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
common/util.py
ADDED
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# DATA MANIPULATION & ANALYSIS
|
2 |
+
|
3 |
+
import pickle
|
4 |
+
import streamlit as st
|
5 |
+
|
6 |
+
# Arrays
|
7 |
+
import numpy as np
|
8 |
+
|
9 |
+
# DataFrames and Series
|
10 |
+
import pandas as pd
|
11 |
+
|
12 |
+
# Returns the indices of the maximum values along an axis
|
13 |
+
from numpy import argmax
|
14 |
+
|
15 |
+
# MODELLING
|
16 |
+
|
17 |
+
# Logistic regression
|
18 |
+
from sklearn.linear_model import LogisticRegression
|
19 |
+
|
20 |
+
from sklearn.model_selection import StratifiedKFold
|
21 |
+
|
22 |
+
# XGBoosted Decision Trees
|
23 |
+
import xgboost as xgb
|
24 |
+
|
25 |
+
|
26 |
+
# REPORTING, EVALUATION, AND INTERPRETATION
|
27 |
+
|
28 |
+
# Classification report
|
29 |
+
from sklearn.metrics import classification_report
|
30 |
+
|
31 |
+
# Receiver Operating Characteristic (ROC) curve
|
32 |
+
from sklearn.metrics import roc_curve
|
33 |
+
|
34 |
+
|
35 |
+
# Evaluate a score by cross-validation
|
36 |
+
from sklearn.model_selection import cross_val_score
|
37 |
+
|
38 |
+
|
39 |
+
# # Functions
|
40 |
+
|
41 |
+
|
42 |
+
def drop_columns(df, columns):
|
43 |
+
return df.drop(columns, axis=1)
|
44 |
+
|
45 |
+
|
46 |
+
def remove_less_than_0_columns(df, column):
|
47 |
+
df[column].dropna()
|
48 |
+
return df.loc[(df[column] != 0).any(1)]
|
49 |
+
|
50 |
+
|
51 |
+
def boolean_int_condition_label(df, label_column_name, condition):
|
52 |
+
df[label_column_name] = condition
|
53 |
+
y = df[label_column_name].astype(int)
|
54 |
+
df = drop_columns(df, label_column_name)
|
55 |
+
return y, df
|
56 |
+
|
57 |
+
|
58 |
+
@st.cache(suppress_st_warning=True)
|
59 |
+
def undersample_training_data(
|
60 |
+
df: pd.DataFrame, column_name: str, split_dataset
|
61 |
+
):
|
62 |
+
count_nondefault, count_default = split_dataset.X_y_train[
|
63 |
+
column_name
|
64 |
+
].value_counts()
|
65 |
+
|
66 |
+
nondefaults = df[df[column_name] == 0] # 0
|
67 |
+
|
68 |
+
defaults = df[df[column_name] == 1]
|
69 |
+
|
70 |
+
under_sample = min(count_nondefault, count_default)
|
71 |
+
|
72 |
+
nondefaults_under = nondefaults.sample(under_sample)
|
73 |
+
|
74 |
+
defaults_under = defaults.sample(under_sample)
|
75 |
+
|
76 |
+
X_y_train_under = pd.concat(
|
77 |
+
[
|
78 |
+
nondefaults_under.reset_index(drop=True),
|
79 |
+
defaults_under.reset_index(drop=True),
|
80 |
+
],
|
81 |
+
axis=0,
|
82 |
+
)
|
83 |
+
|
84 |
+
X_train_under = X_y_train_under.drop([column_name], axis=1) # remove label
|
85 |
+
|
86 |
+
y_train_under = X_y_train_under[column_name] # label only
|
87 |
+
|
88 |
+
class_balance_default = X_y_train_under[column_name].value_counts()
|
89 |
+
|
90 |
+
return [
|
91 |
+
X_train_under,
|
92 |
+
y_train_under,
|
93 |
+
X_y_train_under,
|
94 |
+
class_balance_default,
|
95 |
+
]
|
96 |
+
|
97 |
+
|
98 |
+
def create_coeffient_feature_dictionary_logistic_model(
|
99 |
+
logistic_model, training_data
|
100 |
+
):
|
101 |
+
return {
|
102 |
+
feat: coef
|
103 |
+
for coef, feat in zip(
|
104 |
+
logistic_model.coef_[0, :], training_data.columns
|
105 |
+
)
|
106 |
+
}
|
107 |
+
|
108 |
+
|
109 |
+
@st.cache(suppress_st_warning=True)
|
110 |
+
def test_variables_logistic(X_train, y_train):
|
111 |
+
# Create and fit the logistic regression model
|
112 |
+
return LogisticRegression(solver="lbfgs").fit(X_train, np.ravel(y_train))
|
113 |
+
|
114 |
+
|
115 |
+
@st.cache(suppress_st_warning=True)
|
116 |
+
def print_coeff_logistic(clf_logistic_model, split_dataset):
|
117 |
+
# Dictionary of features and their coefficients
|
118 |
+
return create_coeffient_feature_dictionary_logistic_model(
|
119 |
+
clf_logistic_model, split_dataset.X_train
|
120 |
+
)
|
121 |
+
|
122 |
+
|
123 |
+
@st.cache(suppress_st_warning=True, hash_funcs={
|
124 |
+
xgb.XGBClassifier: pickle.dumps
|
125 |
+
})
|
126 |
+
def test_variables_gbt(X_train, y_train):
|
127 |
+
# Using hyperparameters learning_rate and max_depth
|
128 |
+
return xgb.XGBClassifier(
|
129 |
+
learning_rate=0.1,
|
130 |
+
max_depth=7,
|
131 |
+
use_label_encoder=False,
|
132 |
+
eval_metric="logloss",
|
133 |
+
).fit(X_train, np.ravel(y_train), eval_metric="logloss")
|
134 |
+
|
135 |
+
|
136 |
+
# In[398]:
|
137 |
+
|
138 |
+
|
139 |
+
def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
|
140 |
+
model, X, y, threshold, loan_amount_col_name
|
141 |
+
):
|
142 |
+
true_status = y.to_frame()
|
143 |
+
|
144 |
+
loan_amount = X[loan_amount_col_name]
|
145 |
+
|
146 |
+
clf_prediction_prob = model.predict_proba(np.ascontiguousarray(X))
|
147 |
+
|
148 |
+
clf_prediction_prob_df = pd.DataFrame(
|
149 |
+
clf_prediction_prob[:, 1], columns=["PROB_DEFAULT"]
|
150 |
+
)
|
151 |
+
|
152 |
+
clf_thresh_predicted_default_status = (
|
153 |
+
clf_prediction_prob_df["PROB_DEFAULT"]
|
154 |
+
.apply(lambda x: 1 if x > threshold else 0)
|
155 |
+
.rename("PREDICT_DEFAULT_STATUS")
|
156 |
+
)
|
157 |
+
|
158 |
+
return pd.concat(
|
159 |
+
[
|
160 |
+
true_status.reset_index(drop=True),
|
161 |
+
clf_prediction_prob_df.reset_index(drop=True),
|
162 |
+
clf_thresh_predicted_default_status.reset_index(drop=True),
|
163 |
+
loan_amount.reset_index(drop=True),
|
164 |
+
],
|
165 |
+
axis=1,
|
166 |
+
)
|
167 |
+
|
168 |
+
|
169 |
+
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
|
170 |
+
fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
|
171 |
+
# get the best threshold
|
172 |
+
# Youden's J statistic: tpr - fpr
|
173 |
+
# Argmax to get the index in
|
174 |
+
# thresholds
|
175 |
+
return thresholds[argmax(tpr - fpr)]
|
176 |
+
|
177 |
+
|
178 |
+
# In[399]:
|
179 |
+
|
180 |
+
|
181 |
+
# Function that makes dataframe with probability of default, predicted default status based on threshold
|
182 |
+
# and actual default status
|
183 |
+
|
184 |
+
|
185 |
+
def model_probability_values_df(model, X):
|
186 |
+
return pd.DataFrame(model.predict_proba(X)[:, 1], columns=["PROB_DEFAULT"])
|
187 |
+
|
188 |
+
|
189 |
+
def apply_threshold_to_probability_values(probability_values, threshold):
|
190 |
+
return (
|
191 |
+
probability_values["PROB_DEFAULT"]
|
192 |
+
.apply(lambda x: 1 if x > threshold else 0)
|
193 |
+
.rename("PREDICT_DEFAULT_STATUS")
|
194 |
+
)
|
195 |
+
|
196 |
+
|
197 |
+
@st.cache(suppress_st_warning=True)
|
198 |
+
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
|
199 |
+
fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
|
200 |
+
# get the best threshold
|
201 |
+
J = tpr - fpr # Youdenβs J statistic
|
202 |
+
ix = argmax(J)
|
203 |
+
return thresholds[ix]
|
204 |
+
|
205 |
+
|
206 |
+
# In[401]:
|
207 |
+
|
208 |
+
|
209 |
+
def create_cross_validation_df(
|
210 |
+
X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
|
211 |
+
):
|
212 |
+
# Test data x and y
|
213 |
+
DTrain = xgb.DMatrix(X, label=y)
|
214 |
+
|
215 |
+
# auc or logloss
|
216 |
+
params = {
|
217 |
+
"eval_metric": eval_metric,
|
218 |
+
"objective": "binary:logistic", # logistic say 0 or 1 for loan status
|
219 |
+
"seed": seed,
|
220 |
+
}
|
221 |
+
|
222 |
+
# Create the data frame of cross validations
|
223 |
+
cv_df = xgb.cv(
|
224 |
+
params,
|
225 |
+
DTrain,
|
226 |
+
num_boost_round=trees,
|
227 |
+
nfold=n_folds,
|
228 |
+
early_stopping_rounds=early_stopping_rounds,
|
229 |
+
shuffle=True,
|
230 |
+
)
|
231 |
+
|
232 |
+
return [DTrain, cv_df]
|
233 |
+
|
234 |
+
|
235 |
+
# In[450]:
|
236 |
+
|
237 |
+
|
238 |
+
def cross_validation_scores(model, X, y, nfold, score, seed):
|
239 |
+
# return cv scores of metric
|
240 |
+
return cross_val_score(
|
241 |
+
model,
|
242 |
+
np.ascontiguousarray(X),
|
243 |
+
np.ravel(np.ascontiguousarray(y)),
|
244 |
+
cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
|
245 |
+
scoring=score,
|
246 |
+
)
|
247 |
+
|
248 |
+
|
249 |
+
def default_status_per_threshold(threshold_list, prob_default):
|
250 |
+
threshold_default_status_list = []
|
251 |
+
for threshold in threshold_list:
|
252 |
+
threshold_default_status = prob_default.apply(
|
253 |
+
lambda x: 1 if x > threshold else 0
|
254 |
+
)
|
255 |
+
threshold_default_status_list.append(threshold_default_status)
|
256 |
+
return threshold_default_status_list
|
257 |
+
|
258 |
+
|
259 |
+
def classification_report_per_threshold(
|
260 |
+
threshold_list, threshold_default_status_list, y_test
|
261 |
+
):
|
262 |
+
target_names = ["Non-Default", "Default"]
|
263 |
+
classification_report_list = []
|
264 |
+
for threshold_default_status in threshold_default_status_list:
|
265 |
+
thresh_classification_report = classification_report(
|
266 |
+
y_test,
|
267 |
+
threshold_default_status,
|
268 |
+
target_names=target_names,
|
269 |
+
output_dict=True,
|
270 |
+
zero_division=0,
|
271 |
+
)
|
272 |
+
classification_report_list.append(thresh_classification_report)
|
273 |
+
# Return threshold classification report dict
|
274 |
+
return dict(zip(threshold_list, classification_report_list))
|
275 |
+
|
276 |
+
|
277 |
+
def thresh_classification_report_recall_accuracy(
|
278 |
+
thresh_classification_report_dict,
|
279 |
+
):
|
280 |
+
thresh_def_recalls_list = []
|
281 |
+
thresh_nondef_recalls_list = []
|
282 |
+
thresh_accs_list = []
|
283 |
+
for x in [*thresh_classification_report_dict]:
|
284 |
+
thresh_def_recall = thresh_classification_report_dict[x]["Default"][
|
285 |
+
"recall"
|
286 |
+
]
|
287 |
+
thresh_def_recalls_list.append(thresh_def_recall)
|
288 |
+
thresh_nondef_recall = thresh_classification_report_dict[x][
|
289 |
+
"Non-Default"
|
290 |
+
]["recall"]
|
291 |
+
thresh_nondef_recalls_list.append(thresh_nondef_recall)
|
292 |
+
thresh_accs = thresh_classification_report_dict[x]["accuracy"]
|
293 |
+
thresh_accs_list.append(thresh_accs)
|
294 |
+
return [
|
295 |
+
thresh_def_recalls_list,
|
296 |
+
thresh_nondef_recalls_list,
|
297 |
+
thresh_accs_list,
|
298 |
+
]
|
299 |
+
|
300 |
+
|
301 |
+
def create_accept_rate_list(start, end, samples):
|
302 |
+
return np.linspace(start, end, samples, endpoint=True)
|
303 |
+
|
304 |
+
|
305 |
+
def create_strategyTable_df(
|
306 |
+
start, end, samples, actual_probability_predicted_acc_rate, true, currency
|
307 |
+
):
|
308 |
+
accept_rates = create_accept_rate_list(start, end, samples)
|
309 |
+
thresholds_strat = []
|
310 |
+
bad_rates_start = []
|
311 |
+
Avg_Loan_Amnt = actual_probability_predicted_acc_rate[true].mean()
|
312 |
+
num_accepted_loans_start = []
|
313 |
+
|
314 |
+
for rate in accept_rates:
|
315 |
+
# Calculate the threshold for the acceptance rate
|
316 |
+
thresh = np.quantile(
|
317 |
+
actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
|
318 |
+
).round(3)
|
319 |
+
# Add the threshold value to the list of thresholds
|
320 |
+
thresholds_strat.append(
|
321 |
+
np.quantile(
|
322 |
+
actual_probability_predicted_acc_rate["PROB_DEFAULT"], rate
|
323 |
+
).round(3)
|
324 |
+
)
|
325 |
+
|
326 |
+
# Reassign the loan_status value using the threshold
|
327 |
+
actual_probability_predicted_acc_rate[
|
328 |
+
"PREDICT_DEFAULT_STATUS"
|
329 |
+
] = actual_probability_predicted_acc_rate["PROB_DEFAULT"].apply(
|
330 |
+
lambda x: 1 if x > thresh else 0
|
331 |
+
)
|
332 |
+
|
333 |
+
# Create a set of accepted loans using this acceptance rate
|
334 |
+
accepted_loans = actual_probability_predicted_acc_rate[
|
335 |
+
actual_probability_predicted_acc_rate["PREDICT_DEFAULT_STATUS"]
|
336 |
+
== 0
|
337 |
+
]
|
338 |
+
# Calculate and append the bad rate using the acceptance rate
|
339 |
+
bad_rates_start.append(
|
340 |
+
np.sum((accepted_loans[true]) / len(accepted_loans[true])).round(3)
|
341 |
+
)
|
342 |
+
# Accepted loans
|
343 |
+
num_accepted_loans_start.append(len(accepted_loans))
|
344 |
+
|
345 |
+
# Calculate estimated value
|
346 |
+
money_accepted_loans = [
|
347 |
+
accepted_loans * Avg_Loan_Amnt
|
348 |
+
for accepted_loans in num_accepted_loans_start
|
349 |
+
]
|
350 |
+
|
351 |
+
money_bad_accepted_loans = [
|
352 |
+
2 * money_accepted_loan * bad_rate
|
353 |
+
for money_accepted_loan, bad_rate in zip(
|
354 |
+
money_accepted_loans, bad_rates_start
|
355 |
+
)
|
356 |
+
]
|
357 |
+
|
358 |
+
zip_object = zip(money_accepted_loans, money_bad_accepted_loans)
|
359 |
+
estimated_value = [
|
360 |
+
money_accepted_loan - money_bad_accepted_loan
|
361 |
+
for money_accepted_loan, money_bad_accepted_loan in zip_object
|
362 |
+
]
|
363 |
+
|
364 |
+
accept_rates = ["{:.2f}".format(elem) for elem in accept_rates]
|
365 |
+
|
366 |
+
thresholds_strat = ["{:.2f}".format(elem) for elem in thresholds_strat]
|
367 |
+
|
368 |
+
bad_rates_start = ["{:.2f}".format(elem) for elem in bad_rates_start]
|
369 |
+
|
370 |
+
estimated_value = ["{:.2f}".format(elem) for elem in estimated_value]
|
371 |
+
|
372 |
+
return (
|
373 |
+
pd.DataFrame(
|
374 |
+
zip(
|
375 |
+
accept_rates,
|
376 |
+
thresholds_strat,
|
377 |
+
bad_rates_start,
|
378 |
+
num_accepted_loans_start,
|
379 |
+
estimated_value,
|
380 |
+
),
|
381 |
+
columns=[
|
382 |
+
"Acceptance Rate",
|
383 |
+
"Threshold",
|
384 |
+
"Bad Rate",
|
385 |
+
"Num Accepted Loans",
|
386 |
+
f"Estimated Value ({currency})",
|
387 |
+
],
|
388 |
+
)
|
389 |
+
.sort_values(by="Acceptance Rate", axis=0, ascending=False)
|
390 |
+
.reset_index(drop=True)
|
391 |
+
)
|
common/views.py
ADDED
@@ -0,0 +1,361 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import OrderedDict
|
2 |
+
import streamlit as st # works on command prompt
|
3 |
+
import matplotlib.pyplot as plt
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import xgboost as xgb
|
7 |
+
from sklearn.metrics import (
|
8 |
+
roc_curve,
|
9 |
+
)
|
10 |
+
from sklearn.calibration import calibration_curve
|
11 |
+
from xgboost import plot_tree
|
12 |
+
from views.typing import ModelView
|
13 |
+
|
14 |
+
|
15 |
+
def plot_logistic_coeff_barh(coef_dict, x, y):
    """Draw a horizontal bar chart of coefficient values.

    Args:
        coef_dict: Mapping of bar label to numeric value.
        x: Figure width in inches.
        y: Figure height in inches.

    Returns:
        The matplotlib figure, with bars ordered by ascending value.
    """
    figure = plt.figure(figsize=(x, y))
    ascending_pairs = sorted(coef_dict.items(), key=lambda pair: pair[1])
    # zip(*pairs) splits the (label, value) pairs into barh's two arguments.
    plt.barh(*zip(*ascending_pairs))
    return figure
|
22 |
+
|
23 |
+
|
24 |
+
def print_negative_coefficients_logistic_model(coef_dict):
    """Write the features with non-positive coefficients to the page.

    Args:
        coef_dict: Mapping of feature name to coefficient value.

    The features are listed as (name, value) pairs, most negative first.
    Two leftover debug calls that also printed the list's type and the raw
    ``dict_items`` view have been removed.
    """
    # Equal to or less than 0
    negative_features = sorted(
        ((name, value) for name, value in coef_dict.items() if value <= 0.0),
        key=lambda pair: pair[1],
    )
    text = (
        "\n\nFeatures the model found to be negatively correlated with probability of default are:"
        "\n{negative_features}:"
    )
    st.markdown(text.format(negative_features=negative_features))
|
40 |
+
|
41 |
+
|
42 |
+
def print_positive_coefficients_logistic_model(coef_dict):
    """Write the features with non-negative coefficients to the page.

    Args:
        coef_dict: Mapping of feature name to coefficient value.

    The features are listed as (name, value) pairs, largest value first.
    """
    # Equal to or greater than 0
    non_negative = {
        name: value for name, value in coef_dict.items() if value >= 0.0
    }
    largest_first = sorted(
        non_negative.items(), key=lambda pair: pair[1], reverse=True
    )
    template = (
        "\n\nFeatures the model found to be positively correlated with probability of default are:"
        "\n{positive_features}:"
    )
    st.markdown(template.format(positive_features=largest_first))
|
56 |
+
|
57 |
+
|
58 |
+
def plot_importance_gbt(clf_gbt_model, barxsize, barysize):
    """Plot weight-based feature importance for a boosted-tree model.

    Args:
        clf_gbt_model: Model passed to ``xgb.plot_importance``.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    Returns:
        The figure that owns the importance axes, resized as requested.
    """
    importance_axes = xgb.plot_importance(
        clf_gbt_model, importance_type="weight"
    )
    importance_fig = importance_axes.figure
    st.write("Feature Importance Plot (Gradient Boosted Tree)")
    importance_fig.set_size_inches(barxsize, barysize)
    return importance_fig
|
64 |
+
|
65 |
+
|
66 |
+
def download_importance_gbt(fig1, barxsize, barysize):
    """Save the feature-importance figure to ``bar.png`` on button click.

    Args:
        fig1: Figure to save.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    The figure is resized before saving, and ``fig1`` itself is saved.
    Previously ``plt.savefig`` wrote whatever figure pyplot considered
    current, and the resize only ran after the file was already written.
    """
    if st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        dpisize = max(barxsize, barysize)
        fig1.set_size_inches(barxsize, barysize)
        fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
|
73 |
+
|
74 |
+
|
75 |
+
def plot_tree_gbt(treexsize, treeysize, clf_gbt_model):
    """Plot a tree of the boosted model and size the resulting figure.

    Args:
        treexsize: Figure width in inches.
        treeysize: Figure height in inches.
        clf_gbt_model: Model passed to ``plot_tree``.

    Returns:
        The figure pyplot reports as current after the ``plot_tree`` call.
    """
    plot_tree(clf_gbt_model)
    tree_fig = plt.gcf()
    tree_fig.set_size_inches(treexsize, treeysize)
    return tree_fig
|
80 |
+
|
81 |
+
|
82 |
+
def download_tree_gbt(treexsize, treeysize):
    """Save pyplot's current figure to ``tree.png`` on button click.

    Args:
        treexsize: Tree plot width in inches.
        treeysize: Tree plot height in inches.

    The save DPI is 96 times the larger of the two sizes.
    """
    clicked = st.button(
        "Download Decision Tree Plot as png (Gradient Boosted Tree)"
    )
    if not clicked:
        return
    larger_side = max(treexsize, treeysize)
    plt.savefig("tree.png", dpi=larger_side * 96, bbox_inches="tight")
|
86 |
+
|
87 |
+
|
88 |
+
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the third column of a cross-validation frame as a line chart.

    Args:
        cv: Frame of cross-validation results; the column at position 2 is
            the one plotted as the test score.
        eval_metric: Metric name used in the title and y-axis label.
        trees: Iteration count shown in the title.

    Returns:
        The matplotlib figure holding the chart.
    """
    figure = plt.figure()
    # Plot the test scores for each iteration
    test_score_column = cv.columns[2]
    plt.plot(cv[test_score_column])

    title = "Test {eval_metric} Score Over {it_numbr} Iterations".format(
        eval_metric=eval_metric, it_numbr=trees
    )
    y_label = "Test {eval_metric} Score".format(eval_metric=eval_metric)
    plt.title(title)
    plt.xlabel("Iteration Number")
    plt.ylabel(y_label)
    return figure
|
101 |
+
|
102 |
+
|
103 |
+
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot default recall, non-default recall and accuracy per threshold.

    Args:
        widthsize: Figure width in inches.
        heightsize: Figure height in inches.
        threshold_list: X values (probability thresholds).
        thresh_def_recalls_list: Default-class recall per threshold.
        thresh_nondef_recalls_list: Non-default-class recall per threshold.
        thresh_accs_list: Accuracy per threshold.

    Returns:
        The matplotlib figure, with both axes limited to [0, 1].
    """
    figure = plt.figure(figsize=(widthsize, heightsize))
    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for scores, legend_label in curves:
        plt.plot(threshold_list, scores, label=legend_label)
    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return figure
|
125 |
+
|
126 |
+
|
127 |
+
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelView]):
    """Plot the ROC curves of several models on one chart.

    Args:
        y: True labels passed to ``roc_curve``.
        model_views: Model name -> view whose ``prediction_probability_df``
            holds the scores passed to ``roc_curve``.

    Returns:
        The matplotlib figure, including a dashed diagonal reference line.
    """
    colors = ["blue", "green"]
    fig = plt.figure()
    for color_idx, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        # Wrap around the palette: indexing it directly raised IndexError as
        # soon as more models than colours were compared.
        color = colors[color_idx % len(colors)]
        plt.plot(fpr, tpr, color=color, label=f"{model_name}")
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Build "A", "A and B" or "A, B and C" for the title.
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )
    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
|
153 |
+
|
154 |
+
|
155 |
+
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelView], bins: int
):
    """Plot one calibration curve per model against the ideal diagonal.

    Args:
        y: True labels passed to ``calibration_curve``.
        model_views: Model name -> view whose ``prediction_probability_df``
            holds the predicted probabilities.
        bins: Number of bins passed as ``n_bins``.

    Returns:
        The matplotlib figure, with both axes limited to [0, 1].
    """
    figure = plt.figure()
    for model_name, model_view in model_views.items():
        # NOTE(review): `normalize` is believed to be deprecated and later
        # removed in newer scikit-learn releases - confirm against the
        # pinned version.
        curve = calibration_curve(
            y,
            model_view.prediction_probability_df,
            n_bins=bins,
            normalize=True,
        )
        frac_of_pos, mean_pred_val = curve
        plt.plot(mean_pred_val, frac_of_pos, "s-", label=f"{model_name}")

    # Guideline for a perfectly calibrated model
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return figure
|
179 |
+
|
180 |
+
|
181 |
+
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Plot a histogram of default probabilities with the acceptance cut-off.

    Args:
        probability_default: Predicted default probabilities; must support
            ``.describe()``.
        acceptancerate: Quantile of the probabilities used as the threshold.
        bins: Histogram bin specification passed to ``plt.hist``.

    Returns:
        A tuple ``(figure, describe() summary, threshold)``.
    """
    # Probability distribution
    probability_stat_distribution = probability_default.describe()

    # Acceptance rate threshold
    acc_rate_thresh = np.quantile(probability_default, acceptancerate)
    fig = plt.figure()

    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Add a reference line to the plot for the threshold
    plt.axvline(x=acc_rate_thresh, color="red")
    # Title previously misspelled as "Thershold".
    plt.title("Acceptance Rate Threshold")

    return (
        fig,
        probability_stat_distribution,
        acc_rate_thresh,
    )
|
206 |
+
|
207 |
+
|
208 |
+
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of 1s and of 0s in ``df`` as two side-by-side metrics.

    Args:
        column1name_label: Label of the left metric (share of value 1).
        column2name_label: Label of the right metric (share of value 0).
        df: Object whose ``value_counts()`` is looked up by 1 and 0 and
            whose ``shape[0]`` is used as the denominator.

    A value that never occurs is shown as 0%; previously ``.get()`` returned
    None and the division raised TypeError. An empty input shows 0% for both
    metrics instead of dividing by zero.
    """
    counts = df.value_counts()
    total = df.shape[0]

    def _pct(label):
        """Format the share of ``label`` as a whole-number percentage."""
        share = counts.get(label, 0) / total if total else 0
        return "{:.0%}".format(share)

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value=_pct(1),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=_pct(0),
            delta=None,
            delta_color="normal",
        )
|
233 |
+
|
234 |
+
|
235 |
+
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the count of 1s and of 0s in ``df`` as two side-by-side metrics.

    Args:
        column1name_label: Label of the left metric (count of value 1).
        column2name_label: Label of the right metric (count of value 0).
        df: Object whose ``value_counts()`` is looked up by 1 and 0.

    A value that never occurs is reported as a count of 0 rather than the
    None that ``.get()`` returned before.
    """
    counts = df.value_counts()

    (
        column1name,
        column2name,
    ) = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value=counts.get(1, 0),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=counts.get(0, 0),
            delta=None,
            delta_color="normal",
        )
|
260 |
+
|
261 |
+
|
262 |
+
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show the row and column counts of ``df`` as two side-by-side metrics.

    Args:
        df: Frame whose ``shape`` supplies the two values.
    """
    rows_column, cols_column = st.columns(2)
    layout = (
        (rows_column, "Rows", 0),
        (cols_column, "Columns", 1),
    )
    for container, metric_label, axis in layout:
        with container:
            st.metric(
                label=metric_label,
                value=df.shape[axis],
                delta=None,
                delta_color="normal",
            )
|
283 |
+
|
284 |
+
|
285 |
+
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show entries 1 and 0 of ``series`` as shares of its sum.

    Args:
        column1name_label: Label of the left metric (share of entry 1).
        column2name_label: Label of the right metric (share of entry 0).
        series: Series looked up by 1 and 0 whose ``sum()`` is the
            denominator.

    A missing entry is shown as 0%; previously ``.get()`` returned None and
    the division raised TypeError. A zero total shows 0% for both metrics
    instead of dividing by zero.
    """
    total = series.sum()

    def _pct(label):
        """Format the share of entry ``label`` as a whole-number percentage."""
        share = series.get(label, 0) / total if total else 0
        return "{:.0%}".format(share)

    (
        column1name,
        column2name,
    ) = st.columns(2)
    with column1name:
        st.metric(
            label=column1name_label,
            value=_pct(1),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=_pct(0),
            delta=None,
            delta_color="normal",
        )
|
309 |
+
|
310 |
+
|
311 |
+
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show entries 1 and 0 of ``series`` as two side-by-side metrics.

    Args:
        column1name_label: Label of the left metric (entry 1).
        column2name_label: Label of the right metric (entry 0).
        series: Series looked up by 1 and 0.

    A missing entry is reported as 0 rather than the None that ``.get()``
    returned before.
    """
    (
        column1name,
        column2name,
    ) = st.columns(2)
    with column1name:
        st.metric(
            label=column1name_label,
            value=series.get(1, 0),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=series.get(0, 0),
            delta=None,
            delta_color="normal",
        )
|
335 |
+
|
336 |
+
|
337 |
+
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Render an expander with width and height number inputs.

    Args:
        title: Expander title.
        default_widthvalue: Initial value of the width input.
        default_heightvalue: Initial value of the height input.
        widthkey: Streamlit widget key of the width input.
        heightkey: Streamlit widget key of the height input.

    Returns:
        A tuple ``(width, height)`` of the values currently entered.
    """
    with st.expander(title):
        width_slot, height_slot = st.columns(2)

        with width_slot:
            chosen_width = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_slot:
            chosen_height = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )
    return chosen_width, chosen_height
|
src/features/build_features.py β data_setup.py
RENAMED
@@ -1,19 +1,13 @@
|
|
1 |
-
from typing import
|
2 |
-
from dataclasses import dataclass
|
3 |
-
from sklearn.model_selection import train_test_split
|
4 |
-
import pandas as pd
|
5 |
|
|
|
6 |
import streamlit as st
|
7 |
|
8 |
-
|
9 |
-
from
|
10 |
-
Dataset,
|
11 |
-
SplitDataset,
|
12 |
undersample_training_data,
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
from visualization.metrics import (
|
17 |
streamlit_2columns_metrics_df_shape,
|
18 |
streamlit_2columns_metrics_series,
|
19 |
streamlit_2columns_metrics_pct_series,
|
@@ -22,9 +16,22 @@ from visualization.metrics import (
|
|
22 |
)
|
23 |
|
24 |
|
|
|
25 |
def initialise_data() -> Tuple[Dataset, SplitDataset]:
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
st.write(
|
30 |
"Assuming data is already cleaned and relevant features (predictors) added."
|
@@ -34,12 +41,31 @@ def initialise_data() -> Tuple[Dataset, SplitDataset]:
|
|
34 |
st.dataframe(dataset.df)
|
35 |
streamlit_2columns_metrics_df_shape(dataset.df)
|
36 |
|
37 |
-
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
with st.expander("Predictors Dataframe (X)"):
|
40 |
st.dataframe(selected_x_values)
|
41 |
streamlit_2columns_metrics_df_shape(selected_x_values)
|
42 |
|
|
|
|
|
|
|
43 |
st.header("Split Testing and Training Data")
|
44 |
|
45 |
test_size_slider_col, seed_col = st.columns(2)
|
@@ -62,6 +88,7 @@ def initialise_data() -> Tuple[Dataset, SplitDataset]:
|
|
62 |
|
63 |
split_dataset = dataset.train_test_split(selected_x_values)
|
64 |
|
|
|
65 |
true_status = split_dataset.y_test.to_frame().value_counts()
|
66 |
|
67 |
st.sidebar.metric(
|
|
|
1 |
+
from typing import Tuple, cast
|
|
|
|
|
|
|
2 |
|
3 |
+
import pandas as pd
|
4 |
import streamlit as st
|
5 |
|
6 |
+
from common.data import Dataset, SplitDataset
|
7 |
+
from common.util import (
|
|
|
|
|
8 |
undersample_training_data,
|
9 |
+
)
|
10 |
+
from common.views import (
|
|
|
|
|
11 |
streamlit_2columns_metrics_df_shape,
|
12 |
streamlit_2columns_metrics_series,
|
13 |
streamlit_2columns_metrics_pct_series,
|
|
|
16 |
)
|
17 |
|
18 |
|
19 |
+
# Initialize dataframe session state
|
20 |
def initialise_data() -> Tuple[Dataset, SplitDataset]:
|
21 |
+
if "input_data_frame" not in st.session_state:
|
22 |
+
st.session_state.input_data_frame = pd.read_csv(
|
23 |
+
r"./data/processed/cr_loan_w2.csv"
|
24 |
+
)
|
25 |
+
if "dataset" not in st.session_state:
|
26 |
+
df = cast(pd.DataFrame, st.session_state.input_data_frame)
|
27 |
+
dataset = Dataset(
|
28 |
+
df=df,
|
29 |
+
random_state=123235,
|
30 |
+
test_size=40,
|
31 |
+
)
|
32 |
+
st.session_state.dataset = dataset
|
33 |
+
else:
|
34 |
+
dataset = st.session_state.dataset
|
35 |
|
36 |
st.write(
|
37 |
"Assuming data is already cleaned and relevant features (predictors) added."
|
|
|
41 |
st.dataframe(dataset.df)
|
42 |
streamlit_2columns_metrics_df_shape(dataset.df)
|
43 |
|
44 |
+
st.header("Predictors")
|
45 |
|
46 |
+
possible_columns = dataset.x_values_column_names
|
47 |
+
|
48 |
+
selected_columns = st.sidebar.multiselect(
|
49 |
+
label="Select Predictors",
|
50 |
+
options=possible_columns,
|
51 |
+
default=possible_columns,
|
52 |
+
)
|
53 |
+
|
54 |
+
selected_x_values = dataset.x_values_filtered_columns(selected_columns)
|
55 |
+
|
56 |
+
st.sidebar.metric(
|
57 |
+
label="# of Predictors Selected",
|
58 |
+
value=selected_x_values.shape[1],
|
59 |
+
delta=None,
|
60 |
+
delta_color="normal",
|
61 |
+
)
|
62 |
with st.expander("Predictors Dataframe (X)"):
|
63 |
st.dataframe(selected_x_values)
|
64 |
streamlit_2columns_metrics_df_shape(selected_x_values)
|
65 |
|
66 |
+
# 40% of data used for training
|
67 |
+
# 14321 as random seed for reproducibility
|
68 |
+
|
69 |
st.header("Split Testing and Training Data")
|
70 |
|
71 |
test_size_slider_col, seed_col = st.columns(2)
|
|
|
88 |
|
89 |
split_dataset = dataset.train_test_split(selected_x_values)
|
90 |
|
91 |
+
# Series
|
92 |
true_status = split_dataset.y_test.to_frame().value_counts()
|
93 |
|
94 |
st.sidebar.metric(
|
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
|
|
setup.py
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
from setuptools import find_packages, setup
|
2 |
-
|
3 |
-
setup(
|
4 |
-
name='src',
|
5 |
-
packages=find_packages(),
|
6 |
-
version='0.1.0',
|
7 |
-
description='Tool for credit risk modelling',
|
8 |
-
author='Author',
|
9 |
-
license='MIT',
|
10 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup.sh
DELETED
@@ -1,13 +0,0 @@
|
|
1 |
-
mkdir -p ~/.streamlit/
|
2 |
-
|
3 |
-
cat << EOF > ~/.streamlit/credentials.toml
|
4 |
-
[general]
|
5 |
-
email = "paul.r.kiage@gmail.com"
|
6 |
-
EOF
|
7 |
-
|
8 |
-
cat << EOF > ~/.streamlit/config.toml
|
9 |
-
[server]
|
10 |
-
headless = true
|
11 |
-
enableCORS = true
|
12 |
-
port = $PORT
|
13 |
-
EOF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/__main__.py
DELETED
File without changes
|
src/models/__init__.py
DELETED
File without changes
|
src/models/logistic_model.py
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
from features.build_features import SplitDataset
|
2 |
-
|
3 |
-
from models.logistic_train_model import logistic_train_model
|
4 |
-
from models.logistic_predict_model import logistic_predict_model
|
5 |
-
from models.logistic_test_model import logistic_test_model
|
6 |
-
|
7 |
-
from models.util_model_class import ModelClass
|
8 |
-
|
9 |
-
|
10 |
-
def logistic_class(split_dataset: SplitDataset, currency: str) -> ModelClass:
    """Train, apply and evaluate the logistic model, bundled as a ModelClass.

    Args:
        split_dataset: Train/test split handed to every stage.
        currency: Currency label forwarded to the evaluation stage.

    Returns:
        A ModelClass holding the fitted model, the evaluation frame and the
        selected threshold with its predictions and probabilities.
    """
    # Train Model
    model = logistic_train_model(split_dataset)

    # Predict using Trained Model
    predictions = logistic_predict_model(model, split_dataset)
    selected_threshold = predictions.probability_threshold_selected
    predicted_status = predictions.predicted_default_status

    # Test and Evaluate Model
    evaluation_df = logistic_test_model(
        model,
        split_dataset,
        currency,
        selected_threshold,
        predicted_status,
    )

    return ModelClass(
        model=model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=selected_threshold,
        predicted_default_status=predicted_status,
        prediction_probability_df=predictions.prediction_probability_df,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/logistic_predict_model.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from models.util_predict_model import make_prediction_view
|
2 |
-
|
3 |
-
logistic_predict_model = make_prediction_view(
|
4 |
-
"Logistic", "Logisitic Model")
|
|
|
|
|
|
|
|
|
|
src/models/logistic_test_model.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from models.util_test import make_tests_view
|
2 |
-
|
3 |
-
logistic_test_model = make_tests_view(
|
4 |
-
"Logistic", "Logistic Model")
|
|
|
|
|
|
|
|
|
|
src/models/logistic_train_model.py
DELETED
@@ -1,69 +0,0 @@
|
|
1 |
-
|
2 |
-
import numpy as np
|
3 |
-
from sklearn.linear_model import LogisticRegression
|
4 |
-
from features.build_features import SplitDataset
|
5 |
-
import streamlit as st
|
6 |
-
import pandas as pd
|
7 |
-
|
8 |
-
from visualization.graphs_logistic import plot_logistic_coeff_barh
|
9 |
-
|
10 |
-
|
11 |
-
@st.cache(suppress_st_warning=True)
def create_clf_logistic_model(X_train, y_train):
    """Create and fit a logistic regression model (lbfgs solver).

    Args:
        X_train: Training predictors.
        y_train: Training target; flattened with ``np.ravel`` before fitting.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression(solver="lbfgs")
    flat_target = np.ravel(y_train)
    # fit() returns the estimator itself, so the fitted model is handed back.
    return classifier.fit(X_train, flat_target)
|
15 |
-
|
16 |
-
|
17 |
-
@st.cache(suppress_st_warning=True)
def create_coeff_dict_logistic_model(
    logistic_model, training_data
):
    """Pair each training column name with the model's coefficient.

    Args:
        logistic_model: Fitted model exposing ``coef_``; its first row is
            used.
        training_data: Frame whose ``columns`` supply the names.

    Returns:
        Dict of column name -> coefficient, paired positionally (pairing
        stops at the shorter of the two sequences).
    """
    coefficients = logistic_model.coef_[0, :]
    feature_names = training_data.columns
    return dict(zip(feature_names, coefficients))
|
27 |
-
|
28 |
-
|
29 |
-
def coeff_dict_to_sorted_df(coef_dict):
    """Convert a coefficient mapping to a frame sorted by ascending value.

    Args:
        coef_dict: Mapping of coefficient name to numeric value.

    Returns:
        DataFrame with columns "Coefficient" (the name) and "Value".
    """
    ascending_rows = sorted(coef_dict.items(), key=lambda pair: pair[1])
    return pd.DataFrame(ascending_rows, columns=["Coefficient", "Value"])
|
38 |
-
|
39 |
-
|
40 |
-
def interpret_clf_logistic_model(clf_logistic_model, split_dataset):
    """Show the coefficient count and a coefficient bar chart on the page.

    Args:
        clf_logistic_model: Fitted model; ``n_features_in_`` is displayed
            and its coefficients are charted.
        split_dataset: Split whose ``X_y_train`` columns name the
            coefficients.
    """
    st.metric(
        label="# of Coefficients in Logistic Regression",
        value=clf_logistic_model.n_features_in_,
        delta=None,
        delta_color="normal",
    )

    st.subheader("Logistic Regression Coefficient Values")

    coefficients_by_feature = create_coeff_dict_logistic_model(
        clf_logistic_model, split_dataset.X_y_train
    )
    sorted_coefficients = coeff_dict_to_sorted_df(coefficients_by_feature)
    st.plotly_chart(plot_logistic_coeff_barh(sorted_coefficients))
|
58 |
-
|
59 |
-
|
60 |
-
def logistic_train_model(split_dataset: SplitDataset):
    """Fit the logistic regression model and render its interpretation.

    Args:
        split_dataset: Split providing ``X_train`` and ``y_train``.

    Returns:
        The fitted logistic regression model.
    """
    st.header("Logistic Regression Model")

    fitted_model = create_clf_logistic_model(
        split_dataset.X_train, split_dataset.y_train
    )
    interpret_clf_logistic_model(fitted_model, split_dataset)
    return fitted_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/util_predict_model.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
from typing import Union, cast
|
2 |
-
from sklearn.linear_model import LogisticRegression
|
3 |
-
|
4 |
-
|
5 |
-
import pandas as pd
|
6 |
-
|
7 |
-
from dataclasses import dataclass
|
8 |
-
|
9 |
-
from xgboost import XGBClassifier
|
10 |
-
from features.util_build_features import SplitDataset
|
11 |
-
|
12 |
-
from models.util_predict_model_threshold import (
|
13 |
-
user_defined_probability_threshold,
|
14 |
-
J_statistic_driven_probability_threshold,
|
15 |
-
tradeoff_threshold,
|
16 |
-
acceptance_rate_driven_threshold,
|
17 |
-
select_probability_threshold,
|
18 |
-
model_probability_values_df)
|
19 |
-
|
20 |
-
import streamlit as st
|
21 |
-
|
22 |
-
|
23 |
-
def probability_threshold_explainer(model_name):
    """Write a short explanation of probability thresholds to the page.

    Args:
        model_name: Display name of the model, interpolated into the text.
    """
    # The text lines are kept flush-left so the emitted string is unchanged.
    explanation = f"""
The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
Probabilities of defaulting of the loans are compared to a probability threshold.\n
A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
"""
    st.write(explanation)
|
31 |
-
|
32 |
-
|
33 |
-
@dataclass(frozen=True)
class Threshold:
    """Immutable bundle returned by the prediction view."""

    # Probability cut-off finally selected for the model.
    probability_threshold_selected: float
    # Predicted default status that goes with the selected cut-off.
    predicted_default_status: pd.Series
    # Predicted default probabilities produced by the model.
    prediction_probability_df: pd.DataFrame
|
38 |
-
|
39 |
-
|
40 |
-
def make_prediction_view(
    model_name_short: str,
    model_name: str,
):
    """Build the threshold-selection view for one model.

    Args:
        model_name_short: Short name forwarded to the threshold widgets.
        model_name: Display name used in the explanatory text.

    Returns:
        A function ``view(clf_xgbt_model, split_dataset) -> Threshold`` that
        renders every threshold section and returns the selected one.
    """

    def view(
        clf_xgbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        probability_threshold_explainer(model_name)

        probabilities = model_probability_values_df(
            clf_xgbt_model,
            split_dataset.X_test,
        )

        # User-defined threshold section.
        user_status, user_threshold = user_defined_probability_threshold(
            model_name_short, clf_xgbt_model, split_dataset
        )

        # Youden's J statistic section.
        j_status, j_threshold = J_statistic_driven_probability_threshold(
            probabilities, clf_xgbt_model, split_dataset
        )

        tradeoff_threshold(probabilities, split_dataset)

        # Acceptance-rate section (note: threshold comes first here).
        acceptance_threshold, acceptance_status = (
            acceptance_rate_driven_threshold(model_name_short, probabilities)
        )

        selected_threshold, selected_status = select_probability_threshold(
            model_name_short,
            user_threshold,
            user_status,
            j_threshold,
            j_status,
            acceptance_threshold,
            acceptance_status,
        )

        return Threshold(
            probability_threshold_selected=cast(float, selected_threshold),
            predicted_default_status=selected_status,
            prediction_probability_df=probabilities,
        )

    return view
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/util_predict_model_threshold.py
DELETED
@@ -1,310 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
|
3 |
-
from sklearn.metrics import classification_report, roc_curve
|
4 |
-
|
5 |
-
import numpy as np
|
6 |
-
|
7 |
-
import plotly.express as px
|
8 |
-
|
9 |
-
import pandas as pd
|
10 |
-
|
11 |
-
from numpy import argmax
|
12 |
-
|
13 |
-
from visualization.metrics import streamlit_2columns_metrics_df, streamlit_2columns_metrics_pct_df
|
14 |
-
|
15 |
-
from visualization.graphs_threshold import acceptance_rate_driven_threshold_graph
|
16 |
-
|
17 |
-
|
18 |
-
def model_probability_values_df(model, X):
    """Return the model's second ``predict_proba`` column as a frame.

    Args:
        model: Object exposing ``predict_proba``.
        X: Input passed to ``predict_proba``.

    Returns:
        Single-column DataFrame named "PROB_DEFAULT".
    """
    class_probabilities = model.predict_proba(X)
    default_probabilities = class_probabilities[:, 1]
    return pd.DataFrame(default_probabilities, columns=["PROB_DEFAULT"])
|
20 |
-
|
21 |
-
|
22 |
-
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J (tpr - fpr).

    NOTE(review): a cached function with the same name is defined again
    further down in this module and replaces this one at import time.

    Args:
        y: True labels passed to ``roc_curve``.
        clf_prediction_prob_df: Predicted scores passed to ``roc_curve``.
    """
    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
    youden_j = tpr - fpr
    # argmax gives the position of the best J within `thresholds`.
    best_index = argmax(youden_j)
    return thresholds[best_index]
|
29 |
-
|
30 |
-
# Function that makes dataframe with probability of default, predicted default status based on threshold
|
31 |
-
# and actual default status
|
32 |
-
|
33 |
-
|
34 |
-
def classification_report_per_threshold(
    threshold_list, threshold_default_status_list, y_test
):
    """Build one classification report per threshold.

    Args:
        threshold_list: Thresholds, used as the keys of the result.
        threshold_default_status_list: Predicted statuses, one entry per
            threshold, in the same order.
        y_test: True labels each prediction is scored against.

    Returns:
        Dict of threshold -> ``classification_report`` output dict.
    """
    class_labels = ["Non-Default", "Default"]
    reports = [
        classification_report(
            y_test,
            predicted_status,
            target_names=class_labels,
            output_dict=True,
            zero_division=0,
        )
        for predicted_status in threshold_default_status_list
    ]
    # Thresholds and reports are paired by position.
    return dict(zip(threshold_list, reports))
|
50 |
-
|
51 |
-
|
52 |
-
def thresh_classification_report_recall_accuracy(
    thresh_classification_report_dict,
):
    """Pull recall and accuracy series out of per-threshold reports.

    Args:
        thresh_classification_report_dict: Mapping of threshold -> report
            dict with "Default", "Non-Default" and "accuracy" entries.

    Returns:
        A list of three lists, in mapping order: default recalls,
        non-default recalls, accuracies.
    """
    default_recalls = []
    non_default_recalls = []
    accuracies = []
    for report in thresh_classification_report_dict.values():
        default_recalls.append(report["Default"]["recall"])
        non_default_recalls.append(report["Non-Default"]["recall"])
        accuracies.append(report["accuracy"])
    return [default_recalls, non_default_recalls, accuracies]
|
74 |
-
|
75 |
-
|
76 |
-
def apply_threshold_to_probability_values(probability_values, threshold):
    """Turn default probabilities into 0/1 predictions.

    Args:
        probability_values: Frame with a "PROB_DEFAULT" column.
        threshold: Cut-off; a probability strictly above it maps to 1.

    Returns:
        Series named "PREDICT_DEFAULT_STATUS".
    """

    def to_status(probability):
        if probability > threshold:
            return 1
        return 0

    statuses = probability_values["PROB_DEFAULT"].apply(to_status)
    return statuses.rename("PREDICT_DEFAULT_STATUS")
|
82 |
-
|
83 |
-
|
84 |
-
@st.cache(suppress_st_warning=True)
def find_best_threshold_J_statistic(y, clf_prediction_prob_df):
    """Return the ROC threshold that maximises Youden's J (tpr - fpr).

    Args:
        y: True labels passed to ``roc_curve``.
        clf_prediction_prob_df: Predicted scores passed to ``roc_curve``.
    """
    fpr, tpr, thresholds = roc_curve(y, clf_prediction_prob_df)
    # Youden's J statistic; its argmax is the position of the best threshold.
    best_position = argmax(tpr - fpr)
    return thresholds[best_position]
|
91 |
-
|
92 |
-
|
93 |
-
def default_status_per_threshold(threshold_list, prob_default):
    """Compute 0/1 predictions for every threshold.

    Args:
        threshold_list: Thresholds to apply, in order.
        prob_default: Series of default probabilities.

    Returns:
        List with one Series per threshold; a probability strictly above
        the threshold maps to 1, anything else to 0.
    """
    # Each apply() runs immediately, so the lambda sees its own cut-off.
    return [
        prob_default.apply(lambda p: 1 if p > cutoff else 0)
        for cutoff in threshold_list
    ]
|
101 |
-
|
102 |
-
|
103 |
-
def threshold_and_predictions(clf_xgbt_model, split_dataset, threshold):
    """Predict default status at a threshold and show summary metrics.

    Args:
        clf_xgbt_model: Model used to score ``split_dataset.X_test``.
        split_dataset: Split providing ``X_test``.
        threshold: Probability cut-off applied to the scores.

    Returns:
        Predicted default status for the test data.
    """
    test_probabilities = model_probability_values_df(
        clf_xgbt_model,
        split_dataset.X_test,
    )
    predicted_status = apply_threshold_to_probability_values(
        test_probabilities,
        threshold,
    )

    # Counts first, then the same split as percentages.
    streamlit_2columns_metrics_df(
        "# of Predicted Defaults",
        "# of Predicted Non-Default",
        predicted_status,
    )
    streamlit_2columns_metrics_pct_df(
        "% of Loans Predicted to Default",
        "% of Loans Predicted not to Default",
        predicted_status,
    )

    return predicted_status
|
129 |
-
|
130 |
-
|
131 |
-
def user_defined_probability_threshold(model_name_short, clf_xgbt_model, split_dataset):
    """Let the user pick a threshold with a slider and apply it.

    Args:
        model_name_short: Short model name, used in the slider's widget key.
        clf_xgbt_model: Model used for the predictions.
        split_dataset: Split whose test data is scored.

    Returns:
        A tuple ``(predicted default status, chosen threshold)``.
    """
    st.subheader("Classification Probability Threshold - User Defined")

    slider_key = f"threshold_{model_name_short}_default"
    chosen_threshold = st.slider(
        label="Default Probability Threshold:",
        min_value=0.0,
        max_value=1.0,
        value=0.8,
        key=slider_key,
    )

    predicted_status = threshold_and_predictions(
        clf_xgbt_model, split_dataset, chosen_threshold
    )
    return predicted_status, chosen_threshold
|
146 |
-
|
147 |
-
|
148 |
-
def J_statistic_driven_probability_threshold(clf_prediction_prob_df_gbt, clf_xgbt_model, split_dataset):
    """Derive a threshold via ``find_best_threshold_J_statistic`` and apply it.

    Args:
        clf_prediction_prob_df_gbt: Predicted probabilities, passed together
            with ``split_dataset.y_test`` to ``find_best_threshold_J_statistic``.
        clf_xgbt_model: Model forwarded to ``threshold_and_predictions``.
        split_dataset: Dataset; ``y_test`` is read here and the whole object
            is forwarded to ``threshold_and_predictions``.

    Returns:
        Tuple ``(predicted_status, best_threshold)``.
    """
    st.subheader("J Statistic Driven Classification Probability Threshold")

    best_threshold = find_best_threshold_J_statistic(
        split_dataset.y_test, clf_prediction_prob_df_gbt
    )
    st.metric(
        label="Youden's J statistic calculated best threshold",
        value=best_threshold,
    )

    predicted_status = threshold_and_predictions(
        clf_xgbt_model, split_dataset, best_threshold
    )
    return predicted_status, best_threshold
|
163 |
-
|
164 |
-
|
165 |
-
def create_tradeoff_graph(df):
    """Render the recall / accuracy versus threshold line chart in Streamlit.

    Args:
        df: Frame with the columns ``"Threshold"``, ``"Default Recall"``,
            ``"Non Default Recall"`` and ``"Accuracy"``.
    """
    score_columns = ["Default Recall", "Non Default Recall", "Accuracy"]
    tradeoff_fig = px.line(data_frame=df, x="Threshold", y=score_columns)

    tradeoff_fig.update_layout(
        title="Recall and Accuracy score Trade-off with Probability Threshold",
        xaxis_title="Probability Threshold",
        yaxis_title="Score",
    )
    # Fix the y axis to the full 0-1 range.
    tradeoff_fig.update_yaxes(range=[0.0, 1.0])

    st.plotly_chart(tradeoff_fig)
|
180 |
-
|
181 |
-
|
182 |
-
def tradeoff_threshold(clf_prediction_prob_df_gbt, split_dataset):
    """Chart recall and accuracy across a grid of probability thresholds.

    Args:
        clf_prediction_prob_df_gbt: Frame whose ``"PROB_DEFAULT"`` column holds
            the predicted default probabilities.
        split_dataset: Dataset; ``y_test`` is passed to
            ``classification_report_per_threshold`` alongside the predictions.
    """
    st.subheader("Recall and Accuracy Tradeoff with given Probability Threshold")

    # Grid from 0 in steps of 0.025, stopping before 1, rounded to 3 decimals.
    threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()

    status_per_threshold = default_status_per_threshold(
        threshold_list, clf_prediction_prob_df_gbt["PROB_DEFAULT"]
    )
    report_per_threshold = classification_report_per_threshold(
        threshold_list,
        status_per_threshold,
        split_dataset.y_test,
    )
    (
        default_recalls,
        non_default_recalls,
        accuracies,
    ) = thresh_classification_report_recall_accuracy(report_per_threshold)

    # One row per metric, transposed so every metric becomes a column.
    metric_rows = {
        "Default Recall": default_recalls,
        "Non Default Recall": non_default_recalls,
        "Accuracy": accuracies,
        "Threshold": threshold_list,
    }
    tradeoff_df = pd.DataFrame(
        list(metric_rows.values()), index=list(metric_rows)
    ).T

    create_tradeoff_graph(tradeoff_df)
|
229 |
-
|
230 |
-
|
231 |
-
def select_probability_threshold(model_name_short,
                                 user_defined_threshold,
                                 clf_thresh_predicted_default_status_user_gbt,
                                 J_statistic_best_threshold,
                                 clf_thresh_predicted_default_status_Jstatistic_gbt,
                                 acc_rate_thresh_gbt,
                                 clf_thresh_predicted_default_status_acceptance_gbt):
    """Let the user choose which of the three candidate thresholds to use.

    Each threshold argument is paired with the prediction argument that
    follows it; a radio button decides which pair is returned.

    Args:
        model_name_short: Short model name, embedded in the radio widget key.
        user_defined_threshold: Threshold for the "User Defined" option.
        clf_thresh_predicted_default_status_user_gbt: Predictions for it.
        J_statistic_best_threshold: Threshold for "J Statistic Driven".
        clf_thresh_predicted_default_status_Jstatistic_gbt: Predictions for it.
        acc_rate_thresh_gbt: Threshold used for any other selection
            (i.e. "Acceptance Rate Driven").
        clf_thresh_predicted_default_status_acceptance_gbt: Predictions for it.

    Returns:
        Tuple ``(selected_threshold, selected_predictions)``.
    """
    st.subheader("Selected Probability Threshold")

    prob_thresh_option = st.radio(
        label="Selected Probability Threshold",
        options=[
            "User Defined",
            "J Statistic Driven",
            "Acceptance Rate Driven",
        ],
        key=f"{model_name_short}_radio_thresh",
    )

    named_choices = {
        "User Defined": (
            user_defined_threshold,
            clf_thresh_predicted_default_status_user_gbt,
        ),
        "J Statistic Driven": (
            J_statistic_best_threshold,
            clf_thresh_predicted_default_status_Jstatistic_gbt,
        ),
    }
    # Anything that is not one of the two named options falls back to the
    # acceptance-rate pair.
    acceptance_choice = (
        acc_rate_thresh_gbt,
        clf_thresh_predicted_default_status_acceptance_gbt,
    )
    prob_thresh_selected_gbt, predicted_default_status_gbt = named_choices.get(
        prob_thresh_option, acceptance_choice
    )

    st.write(
        f"Selected probability threshold is {prob_thresh_selected_gbt}"
    )

    return prob_thresh_selected_gbt, predicted_default_status_gbt
|
272 |
-
|
273 |
-
|
274 |
-
def acceptance_rate_driven_threshold(model_name_short, clf_prediction_prob_df_gbt):
    """Turn a user-chosen acceptance rate into a probability threshold.

    The threshold is the quantile of ``PROB_DEFAULT`` at the chosen acceptance
    rate; it is written out, plotted, and applied to the probabilities.

    Args:
        model_name_short: Short model name, embedded in the slider widget key.
        clf_prediction_prob_df_gbt: Frame whose ``"PROB_DEFAULT"`` column holds
            the predicted default probabilities.

    Returns:
        Tuple ``(threshold, predicted_status)`` where ``predicted_status`` is
        the result of ``apply_threshold_to_probability_values``.
    """
    st.subheader("Acceptance Rate Driven Probability Threshold")

    # The slider works in whole percent; np.quantile wants a 0-1 fraction.
    acceptance_pct = st.slider(
        label="% of loans accepted (acceptance rate):",
        min_value=0,
        max_value=100,
        value=85,
        key=f"acceptance_rate_{model_name_short}",
        format="%f%%",
    )
    acceptance_rate = acceptance_pct / 100

    acc_rate_thresh_gbt = np.quantile(
        clf_prediction_prob_df_gbt["PROB_DEFAULT"], acceptance_rate
    )

    st.write(
        f"An acceptance rate of {acceptance_rate} results in probability threshold of {acc_rate_thresh_gbt}"
    )

    acceptance_rate_driven_threshold_graph(
        clf_prediction_prob_df_gbt, acc_rate_thresh_gbt
    )

    predicted_status = apply_threshold_to_probability_values(
        clf_prediction_prob_df_gbt,
        acc_rate_thresh_gbt,
    )
    return acc_rate_thresh_gbt, predicted_status
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/xgboost_model.py
DELETED
@@ -1,33 +0,0 @@
|
|
1 |
-
from features.build_features import SplitDataset
|
2 |
-
|
3 |
-
from models.xgboost_train_model import xgboost_train_model
|
4 |
-
from models.xgboost_predict_model import xgboost_predit_model
|
5 |
-
from models.xgboost_test_model import xgboost_test_model
|
6 |
-
|
7 |
-
from models.util_model_class import ModelClass
|
8 |
-
|
9 |
-
|
10 |
-
def xgboost_class(split_dataset: SplitDataset, currency: str):
    """Run the XGBoost train, predict and test views and bundle their results.

    Args:
        split_dataset: Dataset passed to all three stages.
        currency: Currency label forwarded to the test view.

    Returns:
        A ``ModelClass`` holding the trained model, the frame returned by the
        test view, and the threshold, predicted status and probability frame
        taken from the prediction view.
    """
    # Train the model.
    trained_model = xgboost_train_model(split_dataset)

    # Predict using the trained model.
    predictions = xgboost_predit_model(trained_model, split_dataset)

    # Test and evaluate the model at the selected threshold.
    evaluation_df = xgboost_test_model(
        trained_model,
        split_dataset,
        currency,
        predictions.probability_threshold_selected,
        predictions.predicted_default_status,
    )

    return ModelClass(
        model=trained_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=predictions.probability_threshold_selected,
        predicted_default_status=predictions.predicted_default_status,
        prediction_probability_df=predictions.prediction_probability_df,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/models/xgboost_predict_model.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from models.util_predict_model import make_prediction_view
|
2 |
-
|
3 |
-
# Prediction view for XGBoost, produced by ``make_prediction_view``.
# The misspelt name ("predit") is kept because other modules import it as-is.
xgboost_predit_model = make_prediction_view("XGBoost", "Gradient Boosted Tree with XGBoost")
|
|
|
|
|
|
|
|
|
|
src/models/xgboost_test_model.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
from models.util_test import make_tests_view
|
2 |
-
|
3 |
-
# Test / evaluation view for XGBoost, produced by ``make_tests_view``.
xgboost_test_model = make_tests_view("XGBoost", "Gradient Boosted Tree with XGBoost")
|
|
|
|
|
|
|
|
|
|
src/models/xgboost_train_model.py
DELETED
@@ -1,68 +0,0 @@
|
|
1 |
-
import pickle
|
2 |
-
|
3 |
-
import numpy as np
|
4 |
-
import xgboost as xgb
|
5 |
-
from features.build_features import SplitDataset
|
6 |
-
import streamlit as st
|
7 |
-
|
8 |
-
from visualization.graphs_decision_tree import(plot_importance_gbt,
|
9 |
-
plot_tree_gbt)
|
10 |
-
|
11 |
-
from visualization.graphs_settings import streamlit_chart_setting_height_width
|
12 |
-
|
13 |
-
from visualization.graphs_download import (download_importance_gbt,
|
14 |
-
download_tree_gbt)
|
15 |
-
|
16 |
-
|
17 |
-
# NOTE(review): ``st.cache`` is deprecated in newer Streamlit releases - confirm
# the pinned version before upgrading.
@st.cache(
    suppress_st_warning=True,
    hash_funcs={xgb.XGBClassifier: pickle.dumps},
)
def create_clf_xgbt_model(X_train, y_train):
    """Fit an XGBoost classifier on the training split.

    Args:
        X_train: Training features.
        y_train: Training labels; flattened with ``np.ravel`` before fitting.

    Returns:
        The fitted ``xgb.XGBClassifier``.
    """
    # Hyperparameters: learning_rate and max_depth.
    classifier = xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        use_label_encoder=False,
        eval_metric="logloss",
    )
    # NOTE(review): passing ``eval_metric`` to ``fit`` as well as to the
    # constructor looks redundant, and newer xgboost releases reject it -
    # verify against the pinned version.
    return classifier.fit(X_train, np.ravel(y_train), eval_metric="logloss")
|
28 |
-
|
29 |
-
|
30 |
-
def interpret_clf_xgbt_model(clf_xgbt_model):
    """Show the feature-importance and tree-structure plots for a model.

    Each plot gets a "Chart Settings" size control, is rendered with
    ``st.pyplot`` and is followed by its download button.

    Args:
        clf_xgbt_model: Fitted model passed to the plotting helpers.
    """
    st.subheader("XGBoost Decision Tree Feature Importance")

    bar_width, bar_height = streamlit_chart_setting_height_width(
        "Chart Settings", 10, 15, "barxsize", "barysize"
    )
    importance_fig = plot_importance_gbt(clf_xgbt_model, bar_width, bar_height)
    st.pyplot(importance_fig)
    download_importance_gbt(importance_fig, bar_width, bar_height)

    st.subheader("XGBoost Decision Tree Structure")

    tree_width, tree_height = streamlit_chart_setting_height_width(
        "Chart Settings", 5, 5, "treexsize", "treeysize"
    )
    tree_fig = plot_tree_gbt(tree_width, tree_height, clf_xgbt_model)
    st.pyplot(tree_fig)
    download_tree_gbt(tree_width, tree_height)

    st.markdown(
        "Note: The downloaded XGBoost Decision Tree plot chart in png has higher resolution than that displayed here."
    )
|
57 |
-
|
58 |
-
|
59 |
-
def xgboost_train_model(split_dataset: SplitDataset):
    """Train the XGBoost model on the training split and show its plots.

    Args:
        split_dataset: Dataset; its ``X_train`` and ``y_train`` are used.

    Returns:
        The model returned by ``create_clf_xgbt_model``.
    """
    st.header("XGBoost Decision Trees")

    trained_model = create_clf_xgbt_model(
        split_dataset.X_train, split_dataset.y_train
    )
    interpret_clf_xgbt_model(trained_model)

    return trained_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/__init__.py
DELETED
File without changes
|
src/visualization/graphs_decision_tree.py
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
|
2 |
-
import xgboost as xgb
|
3 |
-
|
4 |
-
import streamlit as st
|
5 |
-
|
6 |
-
import matplotlib.pyplot as plt
|
7 |
-
|
8 |
-
from xgboost import plot_tree
|
9 |
-
|
10 |
-
|
11 |
-
def plot_importance_gbt(clf_xgbt_model, barxsize, barysize):
    """Plot weight-based feature importance and return the sized figure.

    Args:
        clf_xgbt_model: Model handed to ``xgb.plot_importance``.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.

    Returns:
        The figure that owns the axes returned by ``xgb.plot_importance``.
    """
    importance_axes = xgb.plot_importance(clf_xgbt_model, importance_type="weight")
    st.write("Feature Importance Plot (Gradient Boosted Tree)")

    importance_fig = importance_axes.figure
    importance_fig.set_size_inches(barxsize, barysize)
    return importance_fig
|
17 |
-
|
18 |
-
|
19 |
-
def plot_tree_gbt(treexsize, treeysize, clf_xgbt_model):
    """Draw the model's tree with ``plot_tree`` and return the sized figure.

    Args:
        treexsize: Figure width in inches.
        treeysize: Figure height in inches.
        clf_xgbt_model: Model handed to ``plot_tree``.

    Returns:
        pyplot's current figure after ``plot_tree`` has drawn on it.
    """
    plot_tree(clf_xgbt_model)

    # The figure is picked up from pyplot's global state.
    tree_fig = plt.gcf()
    tree_fig.set_size_inches(treexsize, treeysize)
    return tree_fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/graphs_download.py
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import matplotlib.pyplot as plt
|
3 |
-
|
4 |
-
|
5 |
-
def download_importance_gbt(fig1, barxsize, barysize):
    """Save the feature-importance figure to ``bar.png`` when the button is pressed.

    The original saved pyplot's *current* figure via ``plt.savefig`` and
    resized ``fig1`` only afterwards, so the file did not necessarily contain
    ``fig1`` at the requested size. The figure passed in is now sized first
    and saved directly.

    Args:
        fig1: Matplotlib figure to save.
        barxsize: Figure width in inches.
        barysize: Figure height in inches.
    """
    if not st.button(
        "Download Feature Importance Plot as png (Gradient Boosted Tree)"
    ):
        return

    # DPI scales with the larger dimension; kept from the original.
    dpisize = max(barxsize, barysize)
    fig1.set_size_inches(barxsize, barysize)
    fig1.savefig("bar.png", dpi=dpisize * 96, bbox_inches="tight")
|
12 |
-
|
13 |
-
|
14 |
-
def download_tree_gbt(treexsize, treeysize):
    """Save pyplot's current figure to ``tree.png`` when the button is pressed.

    Args:
        treexsize: Width in inches; only used to derive the DPI.
        treeysize: Height in inches; only used to derive the DPI.
    """
    pressed = st.button(
        "Download XGBoost Decision Tree Plot as png (Gradient Boosted Tree)"
    )
    if not pressed:
        return

    # NOTE(review): saves whatever figure pyplot considers current - this
    # relies on the tree plot having been drawn last.
    dpisize = max(treexsize, treeysize)
    plt.savefig("tree.png", dpi=dpisize * 96, bbox_inches="tight")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/graphs_logistic.py
DELETED
@@ -1,12 +0,0 @@
|
|
1 |
-
import plotly.express as px
|
2 |
-
|
3 |
-
|
4 |
-
def plot_logistic_coeff_barh(df):
    """Build a horizontal bar chart of logistic-regression coefficients.

    Args:
        df: Frame with a ``"Value"`` column (bar length) and a
            ``"Coefficient"`` column (bar label).

    Returns:
        The Plotly figure; the caller is responsible for rendering it.
    """
    coefficient_fig = px.bar(
        data_frame=df,
        x="Value",
        y="Coefficient",
        orientation="h",
    )
    coefficient_fig.update_layout(
        title="Logistic Regression Coefficients",
        xaxis_title="Value",
        yaxis_title="Coefficient",
    )
    return coefficient_fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/graphs_settings.py
DELETED
@@ -1,28 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
|
3 |
-
|
4 |
-
def streamlit_chart_setting_height_width(
    title: str,
    default_widthvalue: int,
    default_heightvalue: int,
    widthkey: str,
    heightkey: str,
):
    """Show an expander with width / height number inputs and return the values.

    Args:
        title: Expander title.
        default_widthvalue: Initial width value.
        default_heightvalue: Initial height value.
        widthkey: Widget key of the width input.
        heightkey: Widget key of the height input.

    Returns:
        Tuple ``(width, height)`` as currently entered.
    """
    with st.expander(title):
        width_column, height_column = st.columns(2)

        with width_column:
            width_size = st.number_input(
                label="Width in inches:",
                value=default_widthvalue,
                key=widthkey,
            )

        with height_column:
            height_size = st.number_input(
                label="Height in inches:",
                value=default_heightvalue,
                key=heightkey,
            )

    return width_size, height_size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/graphs_test.py
DELETED
@@ -1,78 +0,0 @@
|
|
1 |
-
from matplotlib import pyplot as plt
|
2 |
-
|
3 |
-
from sklearn.metrics import roc_curve
|
4 |
-
|
5 |
-
from typing import OrderedDict
|
6 |
-
|
7 |
-
from models.util_model_class import ModelClass
|
8 |
-
|
9 |
-
from sklearn.calibration import calibration_curve
|
10 |
-
|
11 |
-
|
12 |
-
def cross_validation_graph(cv, eval_metric, trees):
    """Plot the third column of a cross-validation frame against iteration.

    Args:
        cv: Frame of cross-validation results; its third column is plotted
            (presumably the mean test score - confirm against the producer).
        eval_metric: Metric name, used in the title and y-axis label.
        trees: Iteration count, used in the title.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure()

    test_score_column = cv.columns[2]
    plt.plot(cv[test_score_column])

    plt.title(f"Test {eval_metric} Score Over {trees} Iterations")
    plt.xlabel("Iteration Number")
    plt.ylabel(f"Test {eval_metric} Score")
    return fig
|
25 |
-
|
26 |
-
|
27 |
-
def roc_auc_compare_n_models(y, model_views: OrderedDict[str, ModelClass]):
    """Overlay the ROC curves of several models on one figure.

    The original indexed a two-entry colour list by model position, so a
    third model raised ``IndexError``; colours are now reused cyclically.

    Args:
        y: True labels passed to ``roc_curve``.
        model_views: Mapping of model name to a view exposing
            ``prediction_probability_df`` (the scores passed to ``roc_curve``).

    Returns:
        The matplotlib figure.
    """
    palette = ["blue", "green"]
    fig = plt.figure()

    for position, (model_name, model_view) in enumerate(model_views.items()):
        fpr, tpr, _thresholds = roc_curve(
            y, model_view.prediction_probability_df
        )
        # Wrap around the palette instead of running off its end.
        curve_color = palette[position % len(palette)]
        plt.plot(fpr, tpr, color=curve_color, label=f"{model_name}")

    # Diagonal reference line, drawn once.
    plt.plot([0, 1], [0, 1], linestyle="--", label="Random Prediction")

    # Title wording: "None", "A", "A and B", "A, B and C", ...
    model_names = list(model_views.keys())
    if not model_names:
        model_name_str = "None"
    elif len(model_names) == 1:
        model_name_str = model_names[0]
    else:
        model_name_str = " and ".join(
            [", ".join(model_names[:-1]), model_names[-1]]
        )

    plt.title(f"ROC Chart for {model_name_str} on the Probability of Default")
    plt.xlabel("False Positive Rate (FP Rate)")
    plt.ylabel("True Positive Rate (TP Rate)")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
|
53 |
-
|
54 |
-
|
55 |
-
def calibration_curve_report_commented_n(
    y, model_views: OrderedDict[str, ModelClass], bins: int
):
    """Plot one calibration curve per model plus the perfect-calibration line.

    Args:
        y: True labels passed to ``calibration_curve``.
        model_views: Mapping of model name to a view exposing
            ``prediction_probability_df``.
        bins: Number of bins passed as ``n_bins``.

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure()

    for model_name, model_view in model_views.items():
        # NOTE(review): ``normalize`` is deprecated in newer scikit-learn
        # releases - verify against the pinned version.
        fraction_positive, mean_predicted = calibration_curve(
            y,
            model_view.prediction_probability_df,
            n_bins=bins,
            normalize=True,
        )
        plt.plot(mean_predicted, fraction_positive, "s-", label=f"{model_name}")

    # Guideline for a perfectly calibrated model.
    plt.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    plt.ylabel("Fraction of positives")
    plt.xlabel("Average Predicted Probability")
    plt.title("Calibration Curve")
    plt.legend()
    plt.grid(False)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    return fig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/graphs_threshold.py
DELETED
@@ -1,80 +0,0 @@
|
|
1 |
-
|
2 |
-
import plotly.express as px
|
3 |
-
|
4 |
-
import streamlit as st
|
5 |
-
|
6 |
-
import matplotlib.pyplot as plt
|
7 |
-
|
8 |
-
import numpy as np
|
9 |
-
|
10 |
-
|
11 |
-
def acceptance_rate_driven_threshold_graph(clf_prediction_prob_df_gbt, acc_rate_thresh_gbt):
    """Render a histogram of default probabilities with the threshold marked.

    Args:
        clf_prediction_prob_df_gbt: Frame whose ``"PROB_DEFAULT"`` column is
            histogrammed.
        acc_rate_thresh_gbt: x position of the red vertical marker line.
    """
    histogram_fig = px.histogram(clf_prediction_prob_df_gbt["PROB_DEFAULT"])

    histogram_fig.update_layout(
        title="Acceptance Rate Threshold vs. Loans Accepted",
        xaxis_title="Acceptance Rate Threshold",
        yaxis_title="Loans Accepted",
    )
    # White outlines on the bars.
    histogram_fig.update_traces(marker_line_width=1, marker_line_color="white")

    histogram_fig.add_vline(
        x=acc_rate_thresh_gbt,
        line_width=3,
        line_dash="solid",
        line_color="red",
    )

    st.plotly_chart(histogram_fig)
|
30 |
-
|
31 |
-
|
32 |
-
def recall_accuracy_threshold_tradeoff_fig(
    widthsize,
    heightsize,
    threshold_list,
    thresh_def_recalls_list,
    thresh_nondef_recalls_list,
    thresh_accs_list,
):
    """Plot default recall, non-default recall and accuracy against threshold.

    Args:
        widthsize: Figure width in inches.
        heightsize: Figure height in inches.
        threshold_list: x values shared by all three curves.
        thresh_def_recalls_list: y values for "Default Recall".
        thresh_nondef_recalls_list: y values for "Non-Default Recall".
        thresh_accs_list: y values for "Model Accuracy".

    Returns:
        The matplotlib figure.
    """
    fig = plt.figure(figsize=(widthsize, heightsize))

    # Curves are drawn in this order, which is also the legend order.
    curves = (
        (thresh_def_recalls_list, "Default Recall"),
        (thresh_nondef_recalls_list, "Non-Default Recall"),
        (thresh_accs_list, "Model Accuracy"),
    )
    for scores, curve_label in curves:
        plt.plot(threshold_list, scores, label=curve_label)

    plt.xlabel("Probability Threshold")
    plt.ylabel("Score")
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.legend()
    plt.title("Recall and Accuracy Score Tradeoff with Probability Threshold")
    plt.grid(False)
    return fig
|
54 |
-
|
55 |
-
|
56 |
-
def acceptance_rate_threshold_fig(probability_default, acceptancerate, bins):
    """Histogram the default probabilities and mark the acceptance threshold.

    Args:
        probability_default: Probabilities; must support ``.describe()``.
        acceptancerate: Quantile (0-1) at which the threshold is taken.
        bins: Histogram bin specification passed to ``plt.hist``.

    Returns:
        Tuple ``(figure, describe() summary, threshold)``.
    """
    # Summary statistics of the probability distribution.
    distribution_summary = probability_default.describe()

    # Threshold is the probability at the requested acceptance-rate quantile.
    threshold = np.quantile(probability_default, acceptancerate)

    fig = plt.figure()
    plt.hist(
        probability_default,
        color="blue",
        bins=bins,
        histtype="bar",
        ec="white",
    )

    # Reference line at the threshold.
    plt.axvline(x=threshold, color="red")
    # NOTE(review): the title text is misspelt ("Thershold"); left unchanged
    # here because it is runtime output.
    plt.title("Acceptance Rate Thershold")

    return fig, distribution_summary, threshold
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/visualization/metrics.py
DELETED
@@ -1,132 +0,0 @@
|
|
1 |
-
|
2 |
-
import pandas as pd
|
3 |
-
import streamlit as st
|
4 |
-
|
5 |
-
|
6 |
-
def streamlit_2columns_metrics_pct_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the share of 1s and the share of 0s in ``df`` as two metrics.

    A class that never occurs now counts as 0 (the original divided ``None``
    by the row count and raised ``TypeError``), and empty input yields 0%
    instead of a division by zero.

    Args:
        column1name_label: Label for the share of value 1 (left column).
        column2name_label: Label for the share of value 0 (right column).
        df: Object providing ``value_counts()`` and ``shape``.
    """
    counts = df.value_counts()
    total_rows = df.shape[0]

    def share_of(label):
        # Absent label -> count 0; no rows -> share 0.
        return counts.get(label, 0) / total_rows if total_rows else 0.0

    column1name, column2name = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(share_of(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(share_of(0)),
            delta=None,
            delta_color="normal",
        )
|
31 |
-
|
32 |
-
|
33 |
-
def streamlit_2columns_metrics_df(
    column1name_label: str,
    column2name_label: str,
    df: pd.DataFrame,
):
    """Show the number of 1s and the number of 0s in ``df`` as two metrics.

    A class that never occurs is reported as a count of 0; the original
    passed ``None`` to ``st.metric`` in that case.

    Args:
        column1name_label: Label for the count of value 1 (left column).
        column2name_label: Label for the count of value 0 (right column).
        df: Object providing ``value_counts()``.
    """
    counts = df.value_counts()
    column1name, column2name = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value=counts.get(1, 0),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=counts.get(0, 0),
            delta=None,
            delta_color="normal",
        )
|
58 |
-
|
59 |
-
|
60 |
-
def streamlit_2columns_metrics_df_shape(df: pd.DataFrame):
    """Show the row and column counts of ``df`` as two side-by-side metrics.

    Args:
        df: Object whose ``shape`` gives ``(rows, columns)``.
    """
    n_rows, n_columns = df.shape[0], df.shape[1]
    rows_column, columns_column = st.columns(2)

    with rows_column:
        st.metric(
            label="Rows",
            value=n_rows,
            delta=None,
            delta_color="normal",
        )

    with columns_column:
        st.metric(
            label="Columns",
            value=n_columns,
            delta=None,
            delta_color="normal",
        )
|
81 |
-
|
82 |
-
|
83 |
-
def streamlit_2columns_metrics_pct_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the entries labelled 1 and 0 as percentages of the series total.

    A missing label now counts as 0 (the original divided ``None`` by the
    total and raised ``TypeError``), and a zero total yields 0% instead of a
    division by zero.

    Args:
        column1name_label: Label for the share of entry ``1`` (left column).
        column2name_label: Label for the share of entry ``0`` (right column).
        series: Series looked up with ``.get(1)`` / ``.get(0)`` and summed for
            the denominator.
    """
    total = series.sum()

    def share_of(label):
        # Absent label -> 0; zero total -> share 0.
        return series.get(label, 0) / total if total else 0.0

    column1name, column2name = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value="{:.0%}".format(share_of(1)),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value="{:.0%}".format(share_of(0)),
            delta=None,
            delta_color="normal",
        )
|
107 |
-
|
108 |
-
|
109 |
-
def streamlit_2columns_metrics_series(
    column1name_label: str,
    column2name_label: str,
    series: pd.Series,
):
    """Show the series entries labelled 1 and 0 as two side-by-side metrics.

    A missing label is reported as 0; the original passed ``None`` to
    ``st.metric`` in that case.

    Args:
        column1name_label: Label for entry ``1`` (left column).
        column2name_label: Label for entry ``0`` (right column).
        series: Series looked up with ``.get``.
    """
    column1name, column2name = st.columns(2)

    with column1name:
        st.metric(
            label=column1name_label,
            value=series.get(1, 0),
            delta=None,
            delta_color="normal",
        )

    with column2name:
        st.metric(
            label=column2name_label,
            value=series.get(0, 0),
            delta=None,
            delta_color="normal",
        )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
{src/features β views}/__init__.py
RENAMED
File without changes
|
views/decision_tree.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from common.data import SplitDataset
|
2 |
+
import streamlit as st
|
3 |
+
from common.util import (
|
4 |
+
test_variables_gbt,
|
5 |
+
)
|
6 |
+
from common.views import (
|
7 |
+
streamlit_chart_setting_height_width,
|
8 |
+
plot_importance_gbt,
|
9 |
+
plot_tree_gbt,
|
10 |
+
download_importance_gbt,
|
11 |
+
download_tree_gbt,
|
12 |
+
)
|
13 |
+
from views.typing import ModelView
|
14 |
+
from views.threshold import decision_tree_threshold_view
|
15 |
+
from views.evaluation import decision_tree_evaluation_view
|
16 |
+
|
17 |
+
|
18 |
+
def decisiontree_view(split_dataset: SplitDataset, currency: str):
    """Render the decision-tree page: plots, threshold selection and evaluation.

    Args:
        split_dataset: Dataset; the training split fits the model and the
            whole object is forwarded to the threshold and evaluation views.
        currency: Currency label forwarded to the evaluation view.

    Returns:
        A ``ModelView`` bundling the model, the evaluation frame, and the
        threshold, predicted status and probability frame chosen in the
        threshold view.
    """
    st.header("Decision Trees")

    gbt_model = test_variables_gbt(
        split_dataset.X_train, split_dataset.y_train
    )

    # --- Feature importance -------------------------------------------------
    st.subheader("Decision Tree Feature Importance")

    bar_width, bar_height = streamlit_chart_setting_height_width(
        "Chart Settings", 10, 15, "barxsize", "barysize"
    )
    importance_fig = plot_importance_gbt(gbt_model, bar_width, bar_height)
    st.pyplot(importance_fig)
    download_importance_gbt(importance_fig, bar_width, bar_height)

    # --- Tree structure -----------------------------------------------------
    st.subheader("Decision Tree Structure")

    tree_width, tree_height = streamlit_chart_setting_height_width(
        "Chart Settings", 15, 10, "treexsize", "treeysize"
    )
    tree_fig = plot_tree_gbt(tree_width, tree_height, gbt_model)
    st.pyplot(tree_fig)
    download_tree_gbt(tree_width, tree_height)
    st.markdown(
        "Note: The downloaded decision tree plot chart in png has higher resolution than that displayed here."
    )

    # --- Threshold selection and evaluation ----------------------------------
    threshold = decision_tree_threshold_view(gbt_model, split_dataset)

    evaluation_df = decision_tree_evaluation_view(
        gbt_model,
        split_dataset,
        currency,
        threshold.probability_threshold_selected,
        threshold.predicted_default_status,
    )

    return ModelView(
        model=gbt_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_df,
        probability_threshold_selected=threshold.probability_threshold_selected,
        predicted_default_status=threshold.predicted_default_status,
        prediction_probability_df=threshold.prediction_probability_df,
    )
|
src/models/util_test.py β views/evaluation.py
RENAMED
@@ -1,6 +1,5 @@
|
|
1 |
from typing import Union
|
2 |
import pandas as pd
|
3 |
-
from sklearn.model_selection import StratifiedKFold, cross_val_score
|
4 |
import streamlit as st
|
5 |
import numpy as np
|
6 |
from sklearn.metrics import (
|
@@ -8,25 +7,24 @@ from sklearn.metrics import (
|
|
8 |
confusion_matrix,
|
9 |
)
|
10 |
from sklearn.linear_model import LogisticRegression
|
11 |
-
import xgboost as xgb
|
12 |
from xgboost.sklearn import XGBClassifier
|
13 |
-
from
|
14 |
-
|
15 |
create_cross_validation_df,
|
16 |
cross_validation_scores,
|
17 |
get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
|
18 |
-
)
|
19 |
-
from
|
20 |
cross_validation_graph,
|
21 |
)
|
22 |
|
23 |
|
24 |
-
def
|
25 |
model_name_short: str,
|
26 |
model_name_generic: str,
|
27 |
):
|
28 |
def view(
|
29 |
-
|
30 |
split_dataset: SplitDataset,
|
31 |
currency: str,
|
32 |
prob_thresh_selected,
|
@@ -42,7 +40,7 @@ def make_tests_view(
|
|
42 |
train on each fold suggests performance will be stable."
|
43 |
)
|
44 |
|
45 |
-
st.write(f
|
46 |
|
47 |
stcol_seed, stcol_eval_metric = st.columns(2)
|
48 |
|
@@ -172,7 +170,7 @@ def make_tests_view(
|
|
172 |
)
|
173 |
|
174 |
cv_scores = cross_validation_scores(
|
175 |
-
|
176 |
split_dataset.X_test,
|
177 |
split_dataset.y_test,
|
178 |
nfolds_score,
|
@@ -327,7 +325,7 @@ def make_tests_view(
|
|
327 |
|
328 |
df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
|
329 |
get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
|
330 |
-
|
331 |
split_dataset.X_test,
|
332 |
split_dataset.y_test,
|
333 |
prob_thresh_selected,
|
@@ -408,161 +406,5 @@ def make_tests_view(
|
|
408 |
return view
|
409 |
|
410 |
|
411 |
-
|
412 |
-
|
413 |
-
return cross_val_score(
|
414 |
-
model,
|
415 |
-
np.ascontiguousarray(X),
|
416 |
-
np.ravel(np.ascontiguousarray(y)),
|
417 |
-
cv=StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed),
|
418 |
-
scoring=score,
|
419 |
-
)
|
420 |
-
|
421 |
-
|
422 |
-
def create_cross_validation_df(
    X, y, eval_metric, seed, trees, n_folds, early_stopping_rounds
):
    """Run ``xgb.cv`` for a binary-logistic objective on the given data.

    Args:
        X: Feature data wrapped in an ``xgb.DMatrix``.
        y: Labels for the ``DMatrix``.
        eval_metric: Evaluation metric name (the original comment mentions
            auc or logloss).
        seed: Seed placed in the booster parameters.
        trees: Number of boosting rounds.
        n_folds: Number of cross-validation folds.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        A two-element list ``[DMatrix, cv results]``.
    """
    data_matrix = xgb.DMatrix(X, label=y)

    booster_params = {
        "eval_metric": eval_metric,
        # Logistic objective: 0 or 1 for loan status.
        "objective": "binary:logistic",
        "seed": seed,
    }

    cv_results = xgb.cv(
        booster_params,
        data_matrix,
        num_boost_round=trees,
        nfold=n_folds,
        early_stopping_rounds=early_stopping_rounds,
        shuffle=True,
    )

    return [data_matrix, cv_results]
|
446 |
-
|
447 |
-
|
448 |
-
def create_accept_rate_list(start, end, samples):
    """Return ``samples`` evenly spaced acceptance rates from ``start`` to ``end``.

    Both end points are included.
    """
    accept_rates = np.linspace(start, end, num=samples, endpoint=True)
    return accept_rates
|
450 |
-
|
451 |
-
|
452 |
-
def create_strategyTable_df(
    start, end, samples, actual_probability_predicted_acc_rate, true, currency
):
    """Build the strategy table for a range of acceptance rates.

    For each acceptance rate the threshold is the matching quantile of the
    ``"PROB_DEFAULT"`` column (rounded to 3 decimals). Rows whose probability
    is not greater than that threshold count as accepted.

    Args:
        start: First acceptance rate.
        end: Last acceptance rate (inclusive).
        samples: Number of evenly spaced acceptance rates to evaluate.
        actual_probability_predicted_acc_rate: DataFrame holding a
            ``"PROB_DEFAULT"`` column and the column named by ``true``.
            It is read only; no column is added or overwritten.
        true: Name of the column that is averaged over accepted rows to get
            the bad rate.
        currency: Label interpolated into the "Estimated Value" column header.

    Returns:
        A DataFrame with one row per acceptance rate, sorted by the formatted
        "Acceptance Rate" in descending order. Every column except
        "Num Accepted Loans" holds strings formatted with ``"{:.2f}"``.
    """
    scores = actual_probability_predicted_acc_rate
    probabilities = scores["PROB_DEFAULT"]

    # NOTE(review): the monetary scale is the mean of the `true` column (the
    # original local was called Avg_Loan_Amnt) - confirm that this column,
    # rather than a loan-amount column, is the intended one.
    average_of_true_column = scores[true].mean()

    # Same grid the sibling create_accept_rate_list helper produces.
    accept_rates = np.linspace(start, end, samples, endpoint=True)

    thresholds = []
    bad_rates = []
    accepted_counts = []
    for rate in accept_rates:
        threshold = np.quantile(probabilities, rate).round(3)
        thresholds.append(threshold)

        # Select accepted rows with a mask instead of writing a
        # PREDICT_DEFAULT_STATUS column into the caller's frame.
        # `~(p > t)` (not `p <= t`) keeps NaN probabilities on the accepted
        # side, exactly as the element-wise `1 if x > thresh else 0` did.
        accepted_true = scores.loc[~(probabilities > threshold), true]
        accepted_count = len(accepted_true)

        if accepted_count:
            bad_rate = np.sum(accepted_true / accepted_count).round(3)
        else:
            # Rounding the threshold can push it below every probability,
            # leaving nothing accepted; report a bad rate of zero.
            bad_rate = 0.0

        bad_rates.append(bad_rate)
        accepted_counts.append(accepted_count)

    estimated_values = []
    for accepted_count, bad_rate in zip(accepted_counts, bad_rates):
        accepted_money = accepted_count * average_of_true_column
        bad_money = 2 * accepted_money * bad_rate
        estimated_values.append(accepted_money - bad_money)

    two_decimals = "{:.2f}".format
    table = pd.DataFrame(
        {
            "Acceptance Rate": [two_decimals(rate) for rate in accept_rates],
            "Threshold": [two_decimals(value) for value in thresholds],
            "Bad Rate": [two_decimals(value) for value in bad_rates],
            "Num Accepted Loans": accepted_counts,
            f"Estimated Value ({currency})": [
                two_decimals(value) for value in estimated_values
            ],
        }
    )

    return table.sort_values(
        by="Acceptance Rate", axis=0, ascending=False
    ).reset_index(drop=True)
|
539 |
-
|
540 |
-
|
541 |
-
def get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
    model, X, y, threshold, loan_amount_col_name
):
    """Assemble true status, default probability, thresholded status and loan amount.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the default probability.
        X: Feature DataFrame; must contain ``loan_amount_col_name``.
        y: Series of true statuses (converted with ``to_frame``).
        threshold: A row is flagged 1 when its probability is strictly
            greater than this value, otherwise 0.
        loan_amount_col_name: Name of the column of ``X`` to carry over.

    Returns:
        A DataFrame on a fresh 0..n-1 index whose columns are, in order: the
        column of ``y``, ``"PROB_DEFAULT"``, ``"PREDICT_DEFAULT_STATUS"`` and
        the loan-amount column.
    """
    class_probabilities = model.predict_proba(np.ascontiguousarray(X))
    probability_frame = pd.DataFrame({"PROB_DEFAULT": class_probabilities[:, 1]})

    predicted_status = (
        probability_frame["PROB_DEFAULT"]
        .map(lambda probability: 1 if probability > threshold else 0)
        .rename("PREDICT_DEFAULT_STATUS")
    )

    # Every piece is re-indexed from zero so that concat lines rows up by
    # position rather than by the original labels of X / y.
    pieces = (
        y.to_frame(),
        probability_frame,
        predicted_status,
        X[loan_amount_col_name],
    )
    return pd.concat(
        [piece.reset_index(drop=True) for piece in pieces],
        axis=1,
    )
|
|
|
1 |
from typing import Union
|
2 |
import pandas as pd
|
|
|
3 |
import streamlit as st
|
4 |
import numpy as np
|
5 |
from sklearn.metrics import (
|
|
|
7 |
confusion_matrix,
|
8 |
)
|
9 |
from sklearn.linear_model import LogisticRegression
|
|
|
10 |
from xgboost.sklearn import XGBClassifier
|
11 |
+
from common.data import SplitDataset
|
12 |
+
from common.util import (
|
13 |
create_cross_validation_df,
|
14 |
cross_validation_scores,
|
15 |
get_df_trueStatus_probabilityDefault_threshStatus_loanAmount,
|
16 |
+
)
|
17 |
+
from common.views import (
|
18 |
cross_validation_graph,
|
19 |
)
|
20 |
|
21 |
|
22 |
+
def make_evaluation_view(
|
23 |
model_name_short: str,
|
24 |
model_name_generic: str,
|
25 |
):
|
26 |
def view(
|
27 |
+
clf_gbt_model: Union[XGBClassifier, LogisticRegression],
|
28 |
split_dataset: SplitDataset,
|
29 |
currency: str,
|
30 |
prob_thresh_selected,
|
|
|
40 |
train on each fold suggests performance will be stable."
|
41 |
)
|
42 |
|
43 |
+
st.write(f"XGBoost cross validation test:")
|
44 |
|
45 |
stcol_seed, stcol_eval_metric = st.columns(2)
|
46 |
|
|
|
170 |
)
|
171 |
|
172 |
cv_scores = cross_validation_scores(
|
173 |
+
clf_gbt_model,
|
174 |
split_dataset.X_test,
|
175 |
split_dataset.y_test,
|
176 |
nfolds_score,
|
|
|
325 |
|
326 |
df_trueStatus_probabilityDefault_threshStatus_loanAmount = (
|
327 |
get_df_trueStatus_probabilityDefault_threshStatus_loanAmount(
|
328 |
+
clf_gbt_model,
|
329 |
split_dataset.X_test,
|
330 |
split_dataset.y_test,
|
331 |
prob_thresh_selected,
|
|
|
406 |
return view
|
407 |
|
408 |
|
409 |
+
decision_tree_evaluation_view = make_evaluation_view("gbt", "Decision Tree")
|
410 |
+
logistic_evaluation_view = make_evaluation_view("lg", "Logistic Regression")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
views/logistic.py
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from common.data import SplitDataset
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import plotly.express as px
|
5 |
+
from views.threshold import logistic_threshold_view
|
6 |
+
from views.evaluation import logistic_evaluation_view
|
7 |
+
from common.util import (
|
8 |
+
test_variables_logistic,
|
9 |
+
print_coeff_logistic,
|
10 |
+
model_probability_values_df,
|
11 |
+
apply_threshold_to_probability_values,
|
12 |
+
)
|
13 |
+
from common.views import (
|
14 |
+
streamlit_2columns_metrics_df,
|
15 |
+
streamlit_2columns_metrics_pct_df,
|
16 |
+
)
|
17 |
+
from views.typing import ModelView
|
18 |
+
|
19 |
+
|
20 |
+
def logistic_view(split_dataset: SplitDataset, currency: str) -> ModelView:
    """Render the "Logistic Regression" section and return its results.

    The model comes from ``test_variables_logistic`` called with the training
    split. The section charts the model's coefficients, applies a
    user-adjustable probability threshold to the test split, then delegates to
    ``logistic_threshold_view`` and ``logistic_evaluation_view``.

    Args:
        split_dataset: Source of ``X_train``, ``y_train`` and ``X_test``; also
            forwarded whole to the coefficient, threshold and evaluation
            helpers.
        currency: Forwarded to the evaluation view.

    Returns:
        A ``ModelView`` bundling the model, the frame returned by the
        evaluation view, and the threshold, predicted statuses and
        probability frame reported by the threshold view.
    """
    st.header("Logistic Regression")

    logistic_model = test_variables_logistic(
        split_dataset.X_train, split_dataset.y_train
    )

    st.metric(
        label="# of Coefficients in Logistic Regression",
        value=logistic_model.n_features_in_,
        delta=None,
        delta_color="normal",
    )

    coefficients = print_coeff_logistic(logistic_model, split_dataset)

    st.subheader("Logistic Regression Coefficient Values")

    # Ascending by coefficient value, as (name, value) pairs.
    ascending_pairs = sorted(coefficients.items(), key=lambda pair: pair[1])
    coefficient_frame = pd.DataFrame(
        ascending_pairs, columns=["Coefficient", "Value"]
    )

    coefficient_chart = px.bar(
        data_frame=coefficient_frame,
        x="Value",
        y="Coefficient",
        orientation="h",
    )
    coefficient_chart.update_layout(
        title="Logistic Regression Coefficients",
        xaxis_title="Value",
        yaxis_title="Coefficient",
    )
    st.plotly_chart(coefficient_chart)

    st.subheader("Classification Probability Threshold")

    # The text stays flush-left: any indentation inside the literal would
    # become part of the string handed to st.write.
    st.write(
        """
The logistic regression model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
Probabilities of defaulting of the loans are compared to a probability threshold.\n
A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
"""
    )

    slider_threshold = st.slider(
        label="Default Probability Threshold:",
        min_value=0.0,
        max_value=1.0,
        value=0.7,
        key="key_threshold",
    )

    test_probabilities = model_probability_values_df(
        logistic_model,
        split_dataset.X_test,
    )
    slider_predicted_status = apply_threshold_to_probability_values(
        test_probabilities,
        slider_threshold,
    )

    streamlit_2columns_metrics_df(
        "# of Predicted Defaults",
        "# of Predicted Non-Default",
        slider_predicted_status,
    )
    streamlit_2columns_metrics_pct_df(
        "% of Loans Predicted to Default",
        "% of Loans Predicted not to Default",
        slider_predicted_status,
    )

    # From here on only the threshold view's result is used; the slider value
    # above feeds the two metric rows and nothing else.
    threshold_result = logistic_threshold_view(logistic_model, split_dataset)

    evaluation_frame = logistic_evaluation_view(
        logistic_model,
        split_dataset,
        currency,
        threshold_result.probability_threshold_selected,
        threshold_result.predicted_default_status,
    )

    return ModelView(
        model=logistic_model,
        trueStatus_probabilityDefault_threshStatus_loanAmount_df=evaluation_frame,
        probability_threshold_selected=threshold_result.probability_threshold_selected,
        predicted_default_status=threshold_result.predicted_default_status,
        prediction_probability_df=threshold_result.prediction_probability_df,
    )
|
src/models/util_model_comparison.py β views/model_comparison.py
RENAMED
@@ -1,21 +1,16 @@
|
|
1 |
from typing import OrderedDict
|
2 |
import streamlit as st
|
3 |
from sklearn.metrics import roc_auc_score
|
4 |
-
from
|
5 |
-
from
|
6 |
-
streamlit_chart_setting_height_width
|
7 |
-
)
|
8 |
-
|
9 |
-
from visualization.graphs_test import (
|
10 |
roc_auc_compare_n_models,
|
11 |
-
|
|
|
12 |
)
|
|
|
13 |
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
|
19 |
roc_auc_model = roc_auc_score(
|
20 |
split_dataset.y_test, model_view.predicted_default_status
|
21 |
)
|
@@ -36,7 +31,7 @@ def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelClass):
|
|
36 |
|
37 |
def model_comparison_view(
|
38 |
split_dataset: SplitDataset,
|
39 |
-
model_views: OrderedDict[str,
|
40 |
):
|
41 |
st.header("Model Comparison")
|
42 |
|
@@ -48,7 +43,7 @@ def model_comparison_view(
|
|
48 |
f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
|
49 |
)
|
50 |
st.markdown(
|
51 |
-
f'Area Under the Receiver Operating Characteristic Curve from prediction scores from {model_name} model is {roc_auc_model}.\n'
|
52 |
)
|
53 |
st.markdown(
|
54 |
f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
|
@@ -83,4 +78,4 @@ def model_comparison_view(
|
|
83 |
|
84 |
fig2.set_size_inches(xsize_cal, ysize_cal)
|
85 |
|
86 |
-
st.pyplot(fig2)
|
|
|
1 |
from typing import OrderedDict
|
2 |
import streamlit as st
|
3 |
from sklearn.metrics import roc_auc_score
|
4 |
+
from common.data import SplitDataset
|
5 |
+
from common.views import (
|
|
|
|
|
|
|
|
|
6 |
roc_auc_compare_n_models,
|
7 |
+
streamlit_chart_setting_height_width,
|
8 |
+
calibration_curve_report_commented_n,
|
9 |
)
|
10 |
+
from views.typing import ModelView
|
11 |
|
12 |
|
13 |
+
def roc_auc_for_model(split_dataset: SplitDataset, model_view: ModelView):
|
|
|
|
|
|
|
14 |
roc_auc_model = roc_auc_score(
|
15 |
split_dataset.y_test, model_view.predicted_default_status
|
16 |
)
|
|
|
31 |
|
32 |
def model_comparison_view(
|
33 |
split_dataset: SplitDataset,
|
34 |
+
model_views: OrderedDict[str, ModelView],
|
35 |
):
|
36 |
st.header("Model Comparison")
|
37 |
|
|
|
43 |
f"Receiver Operating Characteristic (ROC) Curve - {model_name}"
|
44 |
)
|
45 |
st.markdown(
|
46 |
+
f'Area Under the Receiver Operating Characteristic Curve from prediction scores from "{model_name}" model is {roc_auc_model}.\n'
|
47 |
)
|
48 |
st.markdown(
|
49 |
f'The score of {"{:.2f}".format(roc_auc_model)} is in the {roc_auc_lvl} ROC AUC score category.'
|
|
|
78 |
|
79 |
fig2.set_size_inches(xsize_cal, ysize_cal)
|
80 |
|
81 |
+
st.pyplot(fig2.figure)
|
src/models/util_strategy_table.py β views/strategy_table.py
RENAMED
@@ -2,12 +2,12 @@ from typing import OrderedDict
|
|
2 |
import plotly.express as px
|
3 |
import numpy as np
|
4 |
import streamlit as st
|
5 |
-
from
|
6 |
-
from
|
7 |
|
8 |
|
9 |
def strategy_table_view(
|
10 |
-
currency: str, model_views: OrderedDict[str,
|
11 |
):
|
12 |
st.header("Strategy Table")
|
13 |
|
@@ -89,7 +89,7 @@ def strategy_table_view(
|
|
89 |
)
|
90 |
|
91 |
st.metric(
|
92 |
-
label=
|
93 |
value=f"{currency} {tot_exp_loss:,.2f}",
|
94 |
delta=None,
|
95 |
delta_color="normal",
|
|
|
2 |
import plotly.express as px
|
3 |
import numpy as np
|
4 |
import streamlit as st
|
5 |
+
from common.util import create_strategyTable_df
|
6 |
+
from views.typing import ModelView
|
7 |
|
8 |
|
9 |
def strategy_table_view(
|
10 |
+
currency: str, model_views: OrderedDict[str, ModelView]
|
11 |
):
|
12 |
st.header("Strategy Table")
|
13 |
|
|
|
89 |
)
|
90 |
|
91 |
st.metric(
|
92 |
+
label=f"Total expected loss:",
|
93 |
value=f"{currency} {tot_exp_loss:,.2f}",
|
94 |
delta=None,
|
95 |
delta_color="normal",
|
views/threshold.py
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Union, cast
|
3 |
+
import numpy as np
|
4 |
+
import streamlit as st
|
5 |
+
import plotly.express as px
|
6 |
+
import pandas as pd
|
7 |
+
from xgboost.sklearn import XGBClassifier
|
8 |
+
from sklearn.linear_model import LogisticRegression
|
9 |
+
from common.data import SplitDataset
|
10 |
+
from common.util import (
|
11 |
+
model_probability_values_df,
|
12 |
+
apply_threshold_to_probability_values,
|
13 |
+
find_best_threshold_J_statistic,
|
14 |
+
default_status_per_threshold,
|
15 |
+
classification_report_per_threshold,
|
16 |
+
thresh_classification_report_recall_accuracy,
|
17 |
+
)
|
18 |
+
from common.views import (
|
19 |
+
streamlit_2columns_metrics_df,
|
20 |
+
streamlit_2columns_metrics_pct_df,
|
21 |
+
)
|
22 |
+
|
23 |
+
|
24 |
+
@dataclass(frozen=True)
class Threshold:
    """Immutable result returned by a threshold view."""

    # Probability threshold of the option picked in the view.
    probability_threshold_selected: float
    # Thresholded default statuses matching the picked option.
    predicted_default_status: pd.Series
    # Frame of predicted probabilities the thresholds were applied to.
    prediction_probability_df: pd.DataFrame
|
29 |
+
|
30 |
+
|
31 |
+
def _show_predicted_default_metrics(predicted_default_status):
    """Render the count and percentage metric rows for one set of predictions."""
    streamlit_2columns_metrics_df(
        "# of Predicted Defaults",
        "# of Predicted Non-Default",
        predicted_default_status,
    )
    streamlit_2columns_metrics_pct_df(
        "% of Loans Predicted to Default",
        "% of Loans Predicted not to Default",
        predicted_default_status,
    )


def make_threshold_view(
    model_name_short: str,
    model_name: str,
):
    """Create a Streamlit threshold-selection view for one model.

    Args:
        model_name_short: Short tag embedded in the widget keys so that views
            built for different models do not share widget state.
        model_name: Name interpolated into the explanatory text.

    Returns:
        A ``view(clf_gbt_model, split_dataset)`` callable that renders the
        page section and returns a ``Threshold``.
    """

    def view(
        clf_gbt_model: Union[XGBClassifier, LogisticRegression],
        split_dataset: SplitDataset,
    ) -> Threshold:
        """Render three candidate thresholds and return the selected one.

        Candidates: the user slider value, the value returned by
        ``find_best_threshold_J_statistic``, and the quantile of the predicted
        probabilities at the chosen acceptance rate.
        """
        # --- User defined threshold -------------------------------------
        st.subheader("Classification Probability Threshold - User Defined")
        # The text stays flush-left: any indentation inside the literal would
        # become part of the string handed to st.write.
        st.write(
            f"""
The {model_name} model (obtained using training data) is applied on testing data to predict the loans probabilities of defaulting.\n
Probabilities of defaulting of the loans are compared to a probability threshold.\n
A loan is predicted to default if its predicted probability of defaulting is greater than the probability threshold.
"""
        )

        user_threshold = st.slider(
            label="Default Probability Threshold:",
            min_value=0.0,
            max_value=1.0,
            value=0.8,
            key=f"threshold_{model_name_short}_default",
        )

        probabilities = model_probability_values_df(
            clf_gbt_model,
            split_dataset.X_test,
        )

        user_status = apply_threshold_to_probability_values(
            probabilities,
            user_threshold,
        )
        _show_predicted_default_metrics(user_status)

        # --- J statistic threshold --------------------------------------
        st.subheader("J Statistic Driven Classification Probability Threshold")

        j_statistic_threshold = find_best_threshold_J_statistic(
            split_dataset.y_test, probabilities
        )
        st.metric(
            label="Youden's J statistic calculated best threshold",
            value=j_statistic_threshold,
        )

        j_statistic_status = apply_threshold_to_probability_values(
            probabilities,
            j_statistic_threshold,
        )
        _show_predicted_default_metrics(j_statistic_status)

        # --- Recall / accuracy trade-off chart --------------------------
        st.subheader(
            "Recall and Accuracy Tradeoff with given Probability Threshold"
        )

        # Thresholds 0.0, 0.025, ... (1.0 itself is excluded by arange).
        threshold_list = np.arange(0, 1, 0.025).round(decimals=3).tolist()

        threshold_default_status_list = default_status_per_threshold(
            threshold_list, probabilities["PROB_DEFAULT"]
        )
        thresh_classification_report_dict = (
            classification_report_per_threshold(
                threshold_list,
                threshold_default_status_list,
                split_dataset.y_test,
            )
        )

        (
            default_recalls,
            non_default_recalls,
            accuracies,
        ) = thresh_classification_report_recall_accuracy(
            thresh_classification_report_dict
        )

        # One row per series, then transposed so each series is a column.
        tradeoff_frame = pd.DataFrame(
            [
                default_recalls,
                non_default_recalls,
                accuracies,
                threshold_list,
            ],
            index=[
                "Default Recall",
                "Non Default Recall",
                "Accuracy",
                "Threshold",
            ],
        ).T

        tradeoff_chart = px.line(
            data_frame=tradeoff_frame,
            y=["Default Recall", "Non Default Recall", "Accuracy"],
            x="Threshold",
        )
        tradeoff_chart.update_layout(
            title="Recall and Accuracy score Trade-off with Probability Threshold",
            xaxis_title="Probability Threshold",
            yaxis_title="Score",
        )
        tradeoff_chart.update_yaxes(range=[0.0, 1.0])
        st.plotly_chart(tradeoff_chart)

        # --- Acceptance rate threshold ----------------------------------
        st.subheader("Acceptance Rate Driven Probability Threshold")

        # The slider works in whole percent; the quantile needs a fraction.
        acceptance_rate = (
            st.slider(
                label="% of loans accepted (acceptance rate):",
                min_value=0,
                max_value=100,
                value=85,
                key=f"acceptance_rate_{model_name_short}",
                format="%f%%",
            )
            / 100
        )

        acceptance_threshold = np.quantile(
            probabilities["PROB_DEFAULT"], acceptance_rate
        )

        st.write(
            f"An acceptance rate of {acceptance_rate} results in probability threshold of {acceptance_threshold}"
        )

        probability_histogram = px.histogram(probabilities["PROB_DEFAULT"])
        probability_histogram.update_layout(
            title="Acceptance Rate Threshold vs. Loans Accepted",
            xaxis_title="Acceptance Rate Threshold",
            yaxis_title="Loans Accepted",
        )
        probability_histogram.update_traces(
            marker_line_width=1, marker_line_color="white"
        )
        # Red vertical line marks the threshold implied by the acceptance rate.
        probability_histogram.add_vline(
            x=acceptance_threshold,
            line_width=3,
            line_dash="solid",
            line_color="red",
        )
        st.plotly_chart(probability_histogram)

        acceptance_status = apply_threshold_to_probability_values(
            probabilities,
            acceptance_threshold,
        )

        # --- Final selection --------------------------------------------
        st.subheader("Selected Probability Threshold")

        options = [
            "User Defined",
            "J Statistic Driven",
            "Acceptance Rate Driven",
        ]
        selected_option = st.radio(
            label="Selected Probability Threshold",
            options=options,
            key=f"{model_name_short}_radio_thresh",
        )

        if selected_option == "User Defined":
            selected_threshold = user_threshold
            selected_status = user_status
        elif selected_option == "J Statistic Driven":
            selected_threshold = j_statistic_threshold
            selected_status = j_statistic_status
        else:
            # "Acceptance Rate Driven" and any unexpected value.
            selected_threshold = acceptance_threshold
            selected_status = acceptance_status

        st.write(f"Selected probability threshold is {selected_threshold}")

        return Threshold(
            # cast only informs the type checker; the value is not converted.
            probability_threshold_selected=cast(float, selected_threshold),
            predicted_default_status=selected_status,
            prediction_probability_df=probabilities,
        )

    return view


decision_tree_threshold_view = make_threshold_view("gbt", "decision tree")
logistic_threshold_view = make_threshold_view("lg", "logistic")
|
src/models/util_model_class.py β views/typing.py
RENAMED
@@ -7,7 +7,7 @@ from sklearn.linear_model import LogisticRegression
|
|
7 |
|
8 |
|
9 |
@dataclass(frozen=True)
|
10 |
-
class
|
11 |
model: Union[XGBClassifier, LogisticRegression]
|
12 |
probability_threshold_selected: float
|
13 |
predicted_default_status: pd.Series
|
|
|
7 |
|
8 |
|
9 |
@dataclass(frozen=True)
|
10 |
+
class ModelView:
|
11 |
model: Union[XGBClassifier, LogisticRegression]
|
12 |
probability_threshold_selected: float
|
13 |
predicted_default_status: pd.Series
|