Spaces:
Sleeping
Sleeping
File size: 4,670 Bytes
6e99eca 99c0d9f 701c14e 99c0d9f 6e99eca fdf85d2 99c0d9f 701c14e 99c0d9f 95ec290 701c14e 99c0d9f 09db070 99c0d9f a5b2d5a 95ec290 99c0d9f 09db070 99c0d9f 95ec290 09db070 99c0d9f b7187da 09db070 99c0d9f b7187da 99c0d9f b7187da 99c0d9f 701c14e 99c0d9f 701c14e 99c0d9f 701c14e 09db070 99c0d9f 701c14e 99c0d9f 701c14e 99c0d9f 09db070 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib as plt
from sklearn import metrics
import graphviz as graphviz
# Read the German credit dataset from a CSV file in the working directory.
data = pd.read_csv('german_credit_from_r.csv')

# Drop two columns so they never appear among the selectable predictors.
data = data.drop(columns=['Foreign_worker', 'Gender'])

# Encode the target numerically: GOOD -> 0, BAD -> 1.
risk_codes = {'GOOD': 0, 'BAD': 1}
data['Credit_risk'] = data['Credit_risk'].map(risk_codes)

# Every remaining column except the target is a candidate predictor.
vars = [column for column in data.columns.tolist() if column != 'Credit_risk']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical on every rerun of the script.
X = data.loc[:, vars]
y = data.loc[:, 'Credit_risk']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Page title rendered at the top of the app.
st.title('Classification of Credit Risk')
# st.divider()

# Markdown bullet list describing the variables of the dataset; kept in a
# named string so the rendering call below stays short.
variables_markdown = """
The dataset contains the following variables:
- Status of existing checking account
- Credit duration in month
- Credit history
- Purpose
- Credit amount
- Savings account/bonds
- Present employment since
- Installment rate in percentage of disposable income
- Other debtors / guarantors
- Present residence since
- Property
- Age in years
- Other installment plans
- Housing
- Number of existing credits at this bank
- Job
- Number of people being liable to provide maintenance for
- Telephone
- Credit risk (1 = bad, 0 = good)
"""

# Section: variables
st.header('Variables')
st.markdown(variables_markdown)
# st.divider()
# Section: data preview
st.header('Data')

# Reorder the columns so that the target, Credit_risk, is displayed last.
cols = [column for column in data.columns.tolist() if column != 'Credit_risk']
cols.append('Credit_risk')
data = data[cols]

st.write('First 20 rows of the dataset:')
# Select by position with head(20). The previous label slice `loc[1:20, :]`
# started at index label 1 on the default RangeIndex and therefore skipped
# the very first row. Rendering is also explicit now instead of relying on
# Streamlit's bare-expression "magic".
st.write(data.head(20))
# st.divider()
# Section: predictor selection
st.header('Predictors')
st.write('Please select up to 3 predictors:')

# Streamlit discourages empty widget labels (accessibility), so the widget
# gets a real label that is hidden visually; the prompt written above
# remains the visible instruction. `selected` is the list of chosen column
# names, capped at three entries by `max_selections`.
selected = st.multiselect(
    'Predictors',
    vars,
    max_selections=3,
    label_visibility='collapsed',
)
# st.divider()
# Section: model
st.header('Model')

# Training data restricted to the predictors chosen in the multiselect.
X_train_f = X_train.loc[:, selected]

# Split the chosen predictors by type. The comprehensions keep the order in
# which the user selected them; a set intersection would yield an arbitrary
# order that can change between runs and reshuffle the tree's feature columns.
numeric_features = [
    "Duration",
    "Credit_amount",
    "Installment_rate",
    "Resident_since",
    "Age",
    "Existing_credits",
    "People_maintenance_for",
]
numeric_features_selected = [name for name in selected if name in numeric_features]
# Numeric columns are only imputed (SimpleImputer with its default settings).
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer())])

# NOTE: "Foreign_worker" and "Gender" are dropped from the data right after
# loading, so they can never be selected; they are listed here only for
# completeness.
categorical_features = [
    "Account_status",
    "Credit_history",
    "Purpose",
    "Savings_bonds",
    "Present_employment_since",
    "Other_debtors_guarantors",
    "Property",
    "Other_installment_plans",
    "Housing",
    "Job",
    "Telephone",
    "Foreign_worker",
    "Gender",
]
categorical_features_selected = [
    name for name in selected if name in categorical_features
]
# Categorical columns are one-hot encoded; categories unseen during fitting
# are ignored at prediction time instead of raising.
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_selected),
        ("cat", categorical_transformer, categorical_features_selected),
    ]
)

if not selected:
    st.write('Please select at least 1 predictor.')
else:
    maxd = st.slider('Max depth', min_value=1, max_value=10, value=2, step=1)
    # A fixed random_state makes the fitted tree reproducible across reruns;
    # the seed matches the one used for the train/test split.
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            ("classifier", tree.DecisionTreeClassifier(max_depth=maxd, random_state=42)),
        ]
    )
    pipe.fit(X_train_f, y_train)

    # Feature names after preprocessing, with the ColumnTransformer's
    # "num__" / "cat__" prefixes removed for a cleaner plot.
    fn = pipe[:-1].get_feature_names_out()
    fn = [item.replace("cat__", "").replace("num__", "") for item in fn]

    # Class labels as strings, as required by export_graphviz.
    labels = pipe.named_steps["classifier"].classes_
    labels = [str(item) for item in labels]

    # Export the fitted tree as DOT source and render it in the app.
    mytree = tree.export_graphviz(
        pipe.named_steps["classifier"],
        feature_names=fn,
        class_names=labels,
        label='none',
        filled=True,
        leaves_parallel=True,
        impurity=False,
        proportion=True,
        rotate=False,
        out_file=None,
    )
    st.graphviz_chart(mytree)
# st.divider()
# Section: evaluation of the fitted pipeline on the held-out test split
st.header('Accuracy')
if not selected:
    st.write('Please select at least 1 predictor.')
else:
    # Predicted class probabilities; the second column is the probability of
    # class 1 (BAD), which is treated as the positive class for the ROC curve.
    preds = pd.DataFrame(pipe.predict_proba(X_test))
    preds.columns = ['prob_0', 'prob_1']
    fpr, tpr, thresholds = metrics.roc_curve(y_test, preds["prob_1"], pos_label=1)

    # Predict the hard labels once and reuse them for both metrics.
    predicted_labels = pipe.predict(X_test)

    st.write('AUC: ', np.round(metrics.auc(fpr, tpr), 3))
    # zero_division=0 keeps the score at 0 without emitting a warning when a
    # shallow tree predicts no positive cases at all.
    st.write(
        'Precision: ',
        np.round(metrics.precision_score(y_test, predicted_labels, zero_division=0), 3),
    )
    st.write(
        'Recall: ',
        np.round(metrics.recall_score(y_test, predicted_labels, zero_division=0), 3),
    )
# st.divider()