import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, plot_tree
# NOTE(review): this binds the top-level matplotlib package (not
# matplotlib.pyplot) to `plt`. The name is unused in this part of the script,
# so the import is left untouched -- confirm before relying on `plt`.
import matplotlib as plt
from sklearn import metrics
import graphviz as graphviz

# ---------------------------------------------------------------------------
# Data preparation
# ---------------------------------------------------------------------------

# Name of the target column; every other remaining column is a predictor.
TARGET = 'Credit_risk'

# load data
data = pd.read_csv('german_credit_from_r.csv')

# remove vars (reassign instead of mutating in place)
data = data.drop(columns=['Foreign_worker', 'Gender'])

# recode credit risk: GOOD -> 0, BAD -> 1 (any other value becomes NaN)
data[TARGET] = data[TARGET].map({'GOOD': 0, 'BAD': 1})

# extract predictor names, keeping the column order of the file
predictor_names = [col for col in data.columns if col != TARGET]

# train/test split of data (fixed seed so the split is the same on every rerun)
X = data[predictor_names]
y = data[TARGET]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Page
# ---------------------------------------------------------------------------

# page title
st.title('Classification of Credit Risk')

# header: variables
st.header('Variables')
st.markdown(
    """
The dataset contains the following variables:
- Status of existing checking account
- Credit duration in month
- Credit history
- Purpose
- Credit amount
- Savings account/bonds
- Present employment since
- Installment rate in percentage of disposable income
- Other debtors / guarantors
- Present residence since
- Property
- Age in years
- Other installment plans
- Housing
- Number of existing credits at this bank
- Job
- Number of people being liable to provide maintenance for
- Telephone
- Credit risk (1 = bad, 0 = good)
"""
)

# header: data
st.header('Data')

# move Credit_risk to the end
data = data[predictor_names + [TARGET]]

st.write('First 20 rows of the dataset:')
# head(20) is positional, so row 0 is included; the previous label slice
# `.loc[1:20]` started at label 1 and silently skipped the first row.
st.write(data.head(20))

# header: predictors
st.header('Predictors')
st.write('Please select up to 3 predictors:')
selected = st.multiselect('', predictor_names, max_selections=3)
# header: model
st.header('Model')

# Training data restricted to the predictors chosen in the multiselect.
X_train_f = X_train.loc[:, selected]

numeric_features = [
    "Duration",
    "Credit_amount",
    "Installment_rate",
    "Resident_since",
    "Age",
    "Existing_credits",
    "People_maintenance_for",
]
# NOTE(review): 'Foreign_worker' and 'Gender' are listed here; presumably they
# are dropped from the data earlier in the script, in which case they can never
# be selected -- confirm.
categorical_features = [
    "Account_status",
    "Credit_history",
    "Purpose",
    "Savings_bonds",
    "Present_employment_since",
    "Other_debtors_guarantors",
    "Property",
    "Other_installment_plans",
    "Housing",
    "Job",
    "Telephone",
    "Foreign_worker",
    "Gender",
]

# Keep the user's selection order. A set intersection would yield a
# hash-dependent order, so the column layout fed to the tree could change
# from one run to the next.
numeric_features_selected = [
    name for name in selected if name in numeric_features
]
categorical_features_selected = [
    name for name in selected if name in categorical_features
]

# Numeric columns: fill missing values (SimpleImputer defaults).
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer())])

# Categorical columns: one-hot encode; unseen categories encode as all zeros.
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features_selected),
        ("cat", categorical_transformer, categorical_features_selected),
    ]
)

if not selected:
    st.write('Please select at least 1 predictor.')
else:
    maxd = st.slider('Max depth', min_value=1, max_value=10, value=2, step=1)

    # Seed the tree so equally good splits are resolved the same way on every
    # Streamlit rerun (same seed as the train/test split).
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                tree.DecisionTreeClassifier(max_depth=maxd, random_state=42),
            ),
        ]
    )
    pipe.fit(X_train_f, y_train)
    classifier = pipe.named_steps["classifier"]

    # Strip only the leading "<transformer>__" prefix ("num__" / "cat__");
    # a blanket str.replace would also alter matches inside the name itself.
    fn = [
        name.split("__", 1)[-1]
        for name in pipe[:-1].get_feature_names_out()
    ]
    labels = [str(item) for item in classifier.classes_]

    mytree = tree.export_graphviz(
        classifier,
        feature_names=fn,
        class_names=labels,
        label='none',
        filled=True,
        leaves_parallel=True,
        impurity=False,
        proportion=True,
        rotate=False,
        out_file=None,
    )
    st.graphviz_chart(mytree)

# header: accuracy
st.header('Accuracy')

if not selected:
    st.write('Please select at least 1 predictor.')
else:
    # Score on exactly the columns the pipeline was fitted on.
    X_test_f = X_test.loc[:, selected]

    # Look up the probability column of class 1 (bad risk) instead of
    # assuming its position.
    positive_idx = classifier.classes_.tolist().index(1)
    prob_bad = pipe.predict_proba(X_test_f)[:, positive_idx]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, prob_bad, pos_label=1)

    # Predict once and reuse for both metrics.
    y_pred = pipe.predict(X_test_f)

    st.write('AUC: ', np.round(metrics.auc(fpr, tpr), 3))
    # zero_division=0: a shallow tree may predict no positives at all; report
    # 0 explicitly rather than relying on the warn-and-return-0 default.
    st.write(
        'Precision: ',
        np.round(metrics.precision_score(y_test, y_pred, zero_division=0), 3),
    )
    st.write(
        'Recall: ',
        np.round(metrics.recall_score(y_test, y_pred, zero_division=0), 3),
    )