Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sklearn.model_selection import train_test_split | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
from sklearn.impute import SimpleImputer | |
from sklearn.compose import ColumnTransformer | |
from sklearn import tree | |
from sklearn.tree import DecisionTreeClassifier, plot_tree | |
import matplotlib as plt | |
from sklearn import metrics | |
import graphviz as graphviz | |
# Load the German credit dataset from the CSV that ships next to this script.
data = pd.read_csv('german_credit_from_r.csv')

# Remove the two columns that must not be offered as predictors.
data = data.drop(columns=['Foreign_worker', 'Gender'])

# Recode the target: GOOD -> 0, BAD -> 1 (any other label becomes NaN).
risk_codes = {'GOOD': 0, 'BAD': 1}
data['Credit_risk'] = data['Credit_risk'].map(risk_codes)

# Candidate predictor names: every remaining column except the target.
# NOTE: `vars` shadows the builtin of the same name; it is kept because the
# rest of the script reads it.
vars = [name for name in data.columns.tolist() if name != 'Credit_risk']

# 80/20 train/test split with a fixed seed so the split is reproducible.
y = data['Credit_risk']
X = data[vars]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Page title.
st.title('Classification of Credit Risk')
# st.divider()

# Section: human-readable description of every column in the dataset.
st.header('Variables')
variables_overview = """
The dataset contains the following variables:
- Status of existing checking account
- Credit duration in month
- Credit history
- Purpose
- Credit amount
- Savings account/bonds
- Present employment since
- Installment rate in percentage of disposable income
- Other debtors / guarantors
- Present residence since
- Property
- Age in years
- Other installment plans
- Housing
- Number of existing credits at this bank
- Job
- Number of people being liable to provide maintenance for
- Telephone
- Credit risk (1 = bad, 0 = good)
"""
st.markdown(variables_overview)
# st.divider()

# Section: preview of the raw data.
st.header('Data')

# Move the target column Credit_risk to the end so it is shown last.
cols = [col for col in data.columns.tolist() if col != 'Credit_risk']
cols.append('Credit_risk')
data = data[cols]

st.write('First 20 rows of the dataset:')
# head(20) is positional and includes row 0. The previous label-based slice
# `.loc[1:20]` skipped the first row and showed rows 1..20 instead.
# Rendering through st.write (rather than a bare expression) also keeps the
# table visible when Streamlit "magic" is disabled.
st.write(data.head(20))
# st.divider()

# Section: let the user choose which columns the tree may use.
st.header('Predictors')
st.write('Please select up to 3 predictors:')
# The widget gets a real label for accessibility; it is hidden because the
# sentence above already serves as the visible prompt. Streamlit discourages
# empty labels and may reject them in the future.
selected = st.multiselect(
    'Predictors',
    vars,
    max_selections=3,
    label_visibility='collapsed',
)
# st.divider()

# Section: fit a decision tree on the selected predictors and draw it.
st.header('Model')

# Training data restricted to the predictors chosen by the user.
X_train_f = X_train.loc[:, selected]

numeric_features = [
    "Duration",
    "Credit_amount",
    "Installment_rate",
    "Resident_since",
    "Age",
    "Existing_credits",
    "People_maintenance_for",
]
# Filter `selected` in its own order instead of intersecting sets: set
# iteration order depends on string-hash randomisation, which would make the
# feature column order (and potentially the fitted tree) differ between runs.
numeric_features_selected = [col for col in selected if col in numeric_features]
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer())])

# "Foreign_worker" and "Gender" are listed here but can only match if they
# are offered in the multiselect above.
categorical_features = [
    "Account_status",
    "Credit_history",
    "Purpose",
    "Savings_bonds",
    "Present_employment_since",
    "Other_debtors_guarantors",
    "Property",
    "Other_installment_plans",
    "Housing",
    "Job",
    "Telephone",
    "Foreign_worker",
    "Gender",
]
categorical_features_selected = [col for col in selected if col in categorical_features]
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Numeric columns are imputed, categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_selected),
        ("cat", categorical_transformer, categorical_features_selected),
    ]
)

if not selected:
    st.write('Please select at least 1 predictor.')
else:
    maxd = st.slider('Max depth', min_value=1, max_value=10, value=2, step=1)
    # A fixed random_state makes tie-breaking between equally good splits
    # reproducible, so the same inputs always give the same tree and metrics.
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            ('classifier', tree.DecisionTreeClassifier(max_depth=maxd, random_state=42)),
        ]
    )
    pipe.fit(X_train_f, y_train)

    # Feature names after preprocessing, with the ColumnTransformer prefixes
    # ("num__" / "cat__") stripped for display.
    fn = pipe[:-1].get_feature_names_out()
    fn = [item.replace("cat__", "").replace("num__", "") for item in fn]

    # Class labels as strings, as required by export_graphviz.
    labels = pipe.named_steps["classifier"].classes_
    labels = [str(item) for item in labels]

    # out_file=None makes export_graphviz return the DOT source as a string.
    mytree = tree.export_graphviz(
        pipe.named_steps["classifier"],
        feature_names=fn,
        class_names=labels,
        label='none',
        filled=True,
        leaves_parallel=True,
        impurity=False,
        proportion=True,
        rotate=False,
        out_file=None,
    )
    st.graphviz_chart(mytree)
# st.divider()

# Section: evaluate the fitted pipeline on the held-out test split.
st.header('Accuracy')
if not selected:
    st.write('Please select at least 1 predictor.')
else:
    # Score on exactly the predictor columns the pipeline was fitted on.
    X_test_f = X_test.loc[:, selected]

    # Class probabilities; column order follows the classifier's classes (0, 1).
    preds = pd.DataFrame(pipe.predict_proba(X_test_f))
    preds.columns = ['prob_0', 'prob_1']

    # Hard class predictions, computed once and reused for both metrics.
    y_pred = pipe.predict(X_test_f)

    fpr, tpr, thresholds = metrics.roc_curve(y_test, preds["prob_1"], pos_label=1)
    st.write('AUC: ', np.round(metrics.auc(fpr, tpr), 3))
    # zero_division=0 reports 0 without a warning when the tree predicts no
    # positive cases at all (possible with shallow trees).
    st.write(
        'Precision: ',
        np.round(metrics.precision_score(y_test, y_pred, zero_division=0), 3),
    )
    st.write('Recall: ', np.round(metrics.recall_score(y_test, y_pred), 3))
# st.divider()