# germancredit / app.py
# Last commit: "Fine tuning" (95ec290), olivermueller
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib as plt
from sklearn import metrics
import graphviz as graphviz
# --- Data preparation -------------------------------------------------------
# Read the German credit data and discard the two columns the app does not use.
data = pd.read_csv('german_credit_from_r.csv').drop(
    columns=['Foreign_worker', 'Gender']
)

# Encode the target as integers: GOOD -> 0, BAD -> 1.
data['Credit_risk'] = data['Credit_risk'].map({'GOOD': 0, 'BAD': 1})

# Every remaining column except the target is a candidate predictor.
# NOTE: the name `vars` shadows the builtin; it is kept because later sections
# of this script refer to it.
vars = [name for name in data.columns.tolist() if name != 'Credit_risk']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical on every run of the script.
X, y = data[vars], data['Credit_risk']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# --- Page title and variable overview ---------------------------------------
# Markdown bullet list describing the dataset's variables; kept in a constant
# so the rendering calls below stay short.
VARIABLES_MARKDOWN = """
The dataset contains the following variables:
- Status of existing checking account
- Credit duration in month
- Credit history
- Purpose
- Credit amount
- Savings account/bonds
- Present employment since
- Installment rate in percentage of disposable income
- Other debtors / guarantors
- Present residence since
- Property
- Age in years
- Other installment plans
- Housing
- Number of existing credits at this bank
- Job
- Number of people being liable to provide maintenance for
- Telephone
- Credit risk (1 = bad, 0 = good)
"""

st.title('Classification of Credit Risk')
#st.divider()

st.header('Variables')
st.markdown(VARIABLES_MARKDOWN)
#st.divider()
# header: data
st.header('Data')
# Show the target in the right-most column: keep every other column in its
# current order and put Credit_risk last.
cols = [col for col in data.columns if col != 'Credit_risk'] + ['Credit_risk']
data = data[cols]
st.write('First 20 rows of the dataset:')
# head(20) selects by position, so the preview really starts at the first row.
# The previous label slice .loc[1:20] began at index label 1 and therefore
# left out the row labelled 0 of the default 0-based index. The frame is
# rendered with an explicit st.write call instead of relying on a bare
# expression being displayed implicitly.
st.write(data.head(20))
#st.divider()
# header: predictors
st.header('Predictors')
st.write('Please select up to 3 predictors:')
# Streamlit discourages empty widget labels (accessibility), so the widget
# gets a real label that is hidden from view; the prompt written above stays
# the visible instruction. `selected` is the list of chosen column names and
# is empty until the user picks something.
selected = st.multiselect(
    'Predictors',
    vars,
    max_selections=3,
    label_visibility='collapsed',
)
#st.divider()
# header: model
st.header('Model')
# Training data restricted to the predictors chosen in the UI.
X_train_f = X_train.loc[:, selected]

# Known column types. 'Foreign_worker' and 'Gender' are dropped from the data
# when it is loaded, so they can never appear in `selected`; listing them here
# is harmless.
numeric_features = ["Duration", "Credit_amount", "Installment_rate", "Resident_since", "Age", "Existing_credits", "People_maintenance_for"]
categorical_features = ["Account_status", "Credit_history", "Purpose", "Savings_bonds", "Present_employment_since", "Other_debtors_guarantors", "Property", "Other_installment_plans", "Housing", "Job", "Telephone", "Foreign_worker", "Gender"]

# Split the selection by type while keeping the order of `selected`.
# A set intersection would work too, but its iteration order depends on
# string hashing and can differ between interpreter runs, which would shuffle
# the order of the encoded feature columns handed to the classifier.
numeric_features_selected = [
    name for name in selected if name in numeric_features
]
categorical_features_selected = [
    name for name in selected if name in categorical_features
]

# Numeric columns: impute missing values (SimpleImputer with its defaults).
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer())]
)
# Categorical columns: one-hot encode; categories unseen during fitting are
# ignored at prediction time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)
# Route each selected column to the transformer for its type. The "num" and
# "cat" names become the "num__" / "cat__" prefixes of the output features.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_selected),
        ("cat", categorical_transformer, categorical_features_selected),
    ]
)
# Fit a decision tree on the selected predictors and draw it. Nothing is
# fitted (and `pipe` is not defined) until at least one predictor is chosen.
if not selected:
    st.write('Please select at least 1 predictor.')
else:
    maxd = st.slider('Max depth', min_value=1, max_value=10, value=2, step=1)
    # random_state is fixed because the tree learner permutes features at
    # every split; without a seed, equally good splits can be resolved
    # differently on each rerun of the script, so the displayed tree and the
    # reported metrics could change without any change in the inputs.
    pipe = Pipeline([
        ("preprocessor", preprocessor),
        ('classifier', tree.DecisionTreeClassifier(max_depth=maxd, random_state=42)),
    ])
    pipe.fit(X_train_f, y_train)

    # Feature names as they leave the preprocessing step, with the
    # transformer prefixes removed so the node labels are easier to read.
    fn = [
        item.replace("cat__", "").replace("num__", "")
        for item in pipe[:-1].get_feature_names_out()
    ]
    # export_graphviz expects class names as strings.
    labels = [str(item) for item in pipe.named_steps["classifier"].classes_]

    # out_file=None makes export_graphviz return the DOT source as a string.
    mytree = tree.export_graphviz(
        pipe.named_steps["classifier"],
        feature_names=fn,
        class_names=labels,
        label='none',
        filled=True,
        leaves_parallel=True,
        impurity=False,
        proportion=True,
        rotate=False,
        out_file=None,
    )
    st.graphviz_chart(mytree)
#st.divider()
# header: accuracy
st.header('Accuracy')
# Evaluate the fitted pipeline on the held-out test set. `pipe` only exists
# when at least one predictor is selected, hence the same guard as in the
# model section.
if not selected:
    st.write('Please select at least 1 predictor.')
else:
    # Predicted class probabilities; the second column is used as the score
    # for the positive class (Credit_risk == 1) in the ROC computation.
    preds = pd.DataFrame(
        pipe.predict_proba(X_test), columns=['prob_0', 'prob_1']
    )
    fpr, tpr, thresholds = metrics.roc_curve(y_test, preds["prob_1"], pos_label=1)

    # Hard class predictions, computed once and shared by both metrics.
    y_pred = pipe.predict(X_test)

    st.write('AUC: ', np.round(metrics.auc(fpr, tpr), 3))
    # zero_division=0: a shallow tree may predict no positive cases at all,
    # which leaves precision undefined. Report 0 explicitly instead of
    # emitting an undefined-metric warning.
    st.write('Precision: ', np.round(metrics.precision_score(y_test, y_pred, zero_division=0), 3))
    st.write('Recall: ', np.round(metrics.recall_score(y_test, y_pred, zero_division=0), 3))
#st.divider()