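# Streamlit app: interactive decision-tree classification of credit risk on the German credit dataset
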
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn import metrics
import graphviz


# load data
data = pd.read_csv('german_credit_from_r.csv')

# remove vars
data.drop(['Foreign_worker', 'Gender'], axis=1, inplace=True)

# recode credit risk
data['Credit_risk'] = data['Credit_risk'].map({'GOOD': 0, 'BAD': 1})
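# BAD is coded as 1, so "bad risk" is the positive class for the precision/recall/AUC reported below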

# extract variable names
vars = data.columns.tolist()
vars.remove('Credit_risk')

# train/test split of data
X = data[vars]
y = data['Credit_risk']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
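# hold out 20% of the observations for evaluation; the fixed random_state keeps the split stable across Streamlit reruns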

# page title
st.title('Classification of Credit Risk')
#st.divider()

# header: variables
st.header('Variables')
st.markdown(
"""
The dataset contains the following variables:
- Status of existing checking account
- Credit duration in month
- Credit history
- Purpose
- Credit amount
- Savings account/bonds
- Present employment since
- Installment rate in percentage of disposable income
- Other debtors / guarantors
- Present residence since
- Property
- Age in years
- Other installment plans
- Housing
- Number of existing credits at this bank
- Job
- Number of people being liable to provide maintenance for
- Telephone
- Credit risk (1 = bad, 0 = good)
"""
)
#st.divider()

# header: data
st.header('Data')
# move Credit_risk to the end
cols = data.columns.tolist()
cols.remove('Credit_risk')
cols.append('Credit_risk')
data = data[cols]
st.write('First 20 rows of the dataset:')
data.head(20)
#st.divider()

# header: predictors
st.header('Predictors')
st.write('Please select up to 3 predictors:')
selected = st.multiselect('Predictors', vars, max_selections=3, label_visibility='collapsed')
#st.divider()

# header: model
st.header('Model')

X_train_f = X_train.loc[:, selected]

numeric_features = ["Duration", "Credit_amount", "Installment_rate", "Resident_since", "Age", "Existing_credits", "People_maintenance_for"]
numeric_features_selected = list(set(selected) & set(numeric_features))
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer())]
)

categorical_features = ["Account_status", "Credit_history", "Purpose", "Savings_bonds", "Present_employment_since", "Other_debtors_guarantors", "Property", "Other_installment_plans", "Housing", "Job", "Telephone", "Foreign_worker", "Gender"]
categorical_features_selected = list(set(selected) & set(categorical_features))
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)
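# ColumnTransformer applies the transformers above only to the selected columns;
# columns not listed are dropped (remainder='drop' is the default)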
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_selected),
        ("cat", categorical_transformer, categorical_features_selected),
    ]
)

if selected == []:
    st.write('Please select at least 1 predictor.')
else:
    maxd = st.slider('Max depth', min_value=1, max_value=10, value=2, step=1)
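    # chain preprocessing and the tree in one Pipeline so both are re-fit on every rerun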
    pipe = Pipeline([("preprocessor", preprocessor), ('classifier', tree.DecisionTreeClassifier(max_depth=maxd))])
    pipe.fit(X_train_f, y_train)
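    # get_feature_names_out() prefixes columns with the transformer name ("num__", "cat__");
    # strip the prefixes so the tree shows the original variable names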
    fn = pipe[:-1].get_feature_names_out()
    fn = [item.replace("cat__", "").replace("num__", "") for item in fn]
    labels = pipe.named_steps["classifier"].classes_
    labels = [str(item) for item in labels]
    mytree = tree.export_graphviz(pipe.named_steps["classifier"],
                                  feature_names=fn,
                                  class_names=labels,
                                  label='none',
                                  filled=True,
                                  leaves_parallel=True,
                                  impurity=False,
                                  proportion=True,
                                  rotate=False,
                                  out_file=None)
    
    st.graphviz_chart(mytree)


#st.divider()

# header: accuracy
st.header('Accuracy')

if selected == []:
    st.write('Please select at least 1 predictor.')
else:
    preds = pd.DataFrame(pipe.predict_proba(X_test))
    preds.columns = ['prob_0', 'prob_1']
    fpr, tpr, thresholds = metrics.roc_curve(y_test, preds["prob_1"], pos_label=1)

    st.write('AUC: ', np.round(metrics.auc(fpr, tpr),3))
    st.write('Precision: ', np.round(metrics.precision_score(y_test, pipe.predict(X_test)),3))
    st.write('Recall: ', np.round(metrics.recall_score(y_test, pipe.predict(X_test)),3))
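    # precision and recall are computed from hard class predictions (majority class per leaf),
    # whereas the AUC uses the predicted probabilities across all thresholds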

#st.divider()