Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- app.py +133 -0
- explainer.py +21 -0
- model.pkl +3 -0
- requirements.txt +6 -0
- scaler.pkl +3 -0
- utils/__pycache__/create_model.cpython-312.pyc +0 -0
- utils/__pycache__/data.cpython-312.pyc +0 -0
- utils/create_model.py +32 -0
- utils/data.py +47 -0
app.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pickle
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from explainer import get_shap_values
|
| 6 |
+
import shap
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import io
|
| 9 |
+
import os
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
|
| 13 |
+
# Load the trained classifier once at import time so every request reuses it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only ship a model.pkl produced by a trusted training run.
with open("model.pkl", "rb") as f:
    model = pickle.load(f)
|
| 16 |
+
|
| 17 |
+
# Feature columns, in the order the single-row model input is built.
# NOTE(review): this order must equal the column order of the training frame
# (numeric/ordinal columns first, then the one-hot groups) - confirm.
columns = [
    'Age', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration',
    'Purpose_business', 'Purpose_car', 'Purpose_domestic appliances', 'Purpose_education', 'Purpose_furniture/equipment',
    'Purpose_radio/TV', 'Purpose_repairs', 'Purpose_vacation/others', 'Sex_female', 'Sex_male',
    'Housing_free', 'Housing_own', 'Housing_rent', 'Job_0', 'Job_1', 'Job_2', 'Job_3',
]

# All categorical mappings must match the encoding used during training.
# utils/data.py fills missing values with 'No Savings' / 'No Checking' and
# then label-encodes the column, which numbers the *sorted* class names from
# 0. Uppercase 'N' sorts before the lowercase labels, so the "no account"
# value is 0 and the real categories start at 1.
# Key order is kept as-is because it drives the order of the radio choices.
saving_account_map = {'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4, 'No Savings': 0}
checking_account_map = {'little': 1, 'moderate': 2, 'rich': 3, 'No Checking': 0}

# Choices offered in the UI; each one maps to a one-hot column in `columns`.
purpose_options = ['business', 'car', 'domestic appliances', 'education', 'furniture/equipment', 'radio/TV', 'repairs', 'vacation/others']
sex_options = ['female', 'male']
housing_options = ['free', 'own', 'rent']
job_options = [0, 1, 2, 3]
|
| 32 |
+
|
| 33 |
+
def to_scalar(val):
    """Return *val* as a plain Python float.

    Arrays and array-likes (NumPy arrays, lists, tuples) are flattened and
    their first element is used; anything else is passed straight to
    ``float``.

    Raises:
        IndexError: if *val* is an empty array or sequence.
    """
    if isinstance(val, (np.ndarray, list, tuple)):
        return float(np.asarray(val).ravel()[0])
    return float(val)
|
| 38 |
+
|
| 39 |
+
def _build_feature_row(age, saving_account, checking_account, credit_amount, duration, purpose, sex, housing, job):
    """Encode the raw form values as a single-row DataFrame in `columns` order."""
    row = pd.DataFrame([[0] * len(columns)], columns=columns)
    row['Age'] = age
    row['Saving accounts'] = saving_account_map.get(saving_account, 0)
    row['Checking account'] = checking_account_map.get(checking_account, 0)
    row['Credit amount'] = credit_amount
    row['Duration'] = duration

    # One-hot groups: exactly the column matching the chosen option is 1.
    one_hot_groups = (
        ('Purpose', purpose_options, purpose),
        ('Sex', sex_options, sex),
        ('Housing', housing_options, housing),
        ('Job', job_options, job),
    )
    for prefix, options, chosen in one_hot_groups:
        for option in options:
            row[f'{prefix}_{option}'] = 1 if chosen == option else 0
    return row


def _log_flagged_input(user_row, flagged_file="flagged_inputs.csv"):
    """Append one raw (un-encoded) applicant record to the flagged-inputs CSV."""
    # Append mode creates the file when needed; the header is written only then.
    pd.DataFrame([user_row]).to_csv(
        flagged_file,
        mode='a',
        header=not os.path.exists(flagged_file),
        index=False,
    )


def _top_shap_features(shap_output, feature_names, limit=5):
    """Return the `limit` (feature, value) pairs with the largest |SHAP value|."""
    # get_shap_values returns (shap_values, explainer); accept a bare array too.
    values = shap_output[0] if isinstance(shap_output, tuple) else shap_output
    # A single row was explained, so take row 0 to get one value per feature.
    # NOTE(review): assumes a (n_samples, n_features) array, as returned for a
    # binary XGBoost model - confirm if the model type ever changes.
    per_feature = np.atleast_2d(np.asarray(values))[0]
    ranked = sorted(
        zip(feature_names, per_feature),
        key=lambda pair: abs(to_scalar(pair[1])),
        reverse=True,
    )
    return ranked[:limit]


def predict_and_explain(age, saving_account, checking_account, credit_amount, duration, purpose, sex, housing, job):
    """Predict the credit-risk class for one applicant and explain it with SHAP.

    Args:
        age, credit_amount, duration: numeric form fields.
        saving_account, checking_account: labels looked up in
            `saving_account_map` / `checking_account_map` (unknown -> 0).
        purpose, sex, housing, job: selected options, one-hot encoded.

    Returns:
        (prediction_text, importance_text): the risk label with the predicted
        class probability, and the five features with the largest absolute
        SHAP values for this input.

    Side effects:
        Appends the raw input to ``flagged_inputs.csv`` when the prediction is
        "High Risk".
    """
    # gr.Number yields None for an empty field, which the model cannot consume.
    if age is None or credit_amount is None or duration is None:
        return "Please enter Age, Credit amount and Duration.", ""

    input_data = _build_feature_row(
        age, saving_account, checking_account, credit_amount, duration,
        purpose, sex, housing, job,
    )

    # NOTE(review): utils/create_model.py fits the model on
    # StandardScaler-transformed features and a scaler.pkl is shipped with the
    # app, but no scaler is applied to `input_data` here. Confirm whether the
    # deployed model.pkl expects scaled inputs.
    pred = model.predict(input_data)[0]
    prob = model.predict_proba(input_data)[0][int(pred)]
    risk = "Low Risk" if pred == 0 else "High Risk"
    prediction_text = f"{risk} (Confidence: {prob:.2f})"

    # Save the raw user input (not the one-hot encoded row) for high-risk cases.
    if risk == "High Risk":
        _log_flagged_input({
            "Age": age,
            "Saving accounts": saving_account,
            "Checking account": checking_account,
            "Credit amount": credit_amount,
            "Duration": duration,
            "Purpose": purpose,
            "Sex": sex,
            "Housing": housing,
            "Job": job,
        })

    top_features = _top_shap_features(
        get_shap_values(model, input_data), input_data.columns
    )
    importance_text = "Top features for this prediction:\n" + "\n".join(
        f"{feat}: {to_scalar(val):.3f}" for feat, val in top_features
    )

    return prediction_text, importance_text
|
| 106 |
+
|
| 107 |
+
# Gradio UI: one input widget per predict_and_explain parameter, in the same
# order as the function signature, and two text outputs matching its
# (prediction_text, importance_text) return value.
demo = gr.Interface(
    fn=predict_and_explain,
    inputs=[
        gr.Number(label="Age", info="Customer's Age"),
        gr.Radio(list(saving_account_map.keys()), label="Saving accounts", info="Saving accounts (Amount of Saving)"),
        gr.Radio(list(checking_account_map.keys()), label="Checking account", info="Checking account (in USD)"),
        gr.Number(label="Credit amount", info="Credit amount (numeric, in USD)"),
        gr.Number(label="Duration", info="Duration (numeric, in month)"),
        gr.Radio(purpose_options, label="Purpose", info="Purpose (car, furniture/equipment, radio/TV, domestic appliances, repairs, education, business, vacation/others)"),
        gr.Radio(sex_options, label="Sex", info="Sex (Male, Female)"),
        gr.Radio(housing_options, label="Housing", info="Housing (Own, Rent, or Free)"),
        gr.Radio(job_options, label="Job", info="Job (0 - Unskilled and non-resident, 1 - Unskilled and resident, 2 - Skilled, 3 - Highly skilled)")
    ],
    outputs=[
        gr.Text(label="Credit Risk Prediction"),
        gr.Text(label="Top Features for This Prediction")
    ],
    title="Credit Risk Predictor",
    description="Input applicant info to assess credit risk level and see SHAP explanation"
)


# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|
| 132 |
+
|
| 133 |
+
|
explainer.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shap
|
| 2 |
+
import pickle
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
# Load the trained model at import time.
# NOTE(review): get_shap_values takes its own `model` argument, which shadows
# this module-level object inside the function, so this load only matters to
# code that reads `explainer.model` directly - confirm whether it is needed.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)
|
| 8 |
+
|
| 9 |
+
def get_shap_values(model, input_data):
    """Explain the model's output for *input_data* with a SHAP tree explainer.

    Args:
        model: trained model handed to ``shap.TreeExplainer``.
        input_data: pandas DataFrame, preprocessed and matching the model's
            input columns.

    Returns:
        A ``(shap_values, explainer)`` tuple: the values computed for
        *input_data* and the explainer that produced them.
    """
    tree_explainer = shap.TreeExplainer(model)
    return tree_explainer.shap_values(input_data), tree_explainer
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
|
model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71ca02c857c381c626c2b92b8ec821dffd6da117a9ee62c3df5b9a7921f5212f
|
| 3 |
+
size 98520
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
numpy
|
| 3 |
+
pandas
|
| 4 |
+
xgboost
|
| 5 |
+
scikit-learn
|
| 6 |
+
shap
|
scaler.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc40ce79f3fa825008b0b93345ce51f4393b05ff716ba0fba9c757cc96d66540
|
| 3 |
+
size 1410
|
utils/__pycache__/create_model.cpython-312.pyc
ADDED
|
Binary file (1.48 kB). View file
|
|
|
utils/__pycache__/data.cpython-312.pyc
ADDED
|
Binary file (2.25 kB). View file
|
|
|
utils/create_model.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.model_selection import train_test_split
|
| 2 |
+
import xgboost as xgb
|
| 3 |
+
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
|
| 4 |
+
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score, classification_report, confusion_matrix
|
| 5 |
+
from utils.data import get_cleaned_data
|
| 6 |
+
|
| 7 |
+
def create_model():
    """Train an XGBoost credit-risk classifier on the cleaned dataset.

    The data is split 80/20, features are standardised with a scaler fitted on
    the training split only, and the hold-out accuracy is logged at INFO level.

    Returns:
        (model, scaler): the fitted ``XGBClassifier`` and the ``StandardScaler``
        fitted on the training features. Inputs given to the model at
        inference time need the same scaler applied.
    """
    import logging

    df = get_cleaned_data()

    # Feature selection: everything except the target column.
    X = df.drop('credit_risk', axis=1)
    y = df['credit_risk']

    # Split data into training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training split only, to avoid leaking test data.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train model.
    model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the hold-out split; report it instead of discarding it.
    y_pred = model.predict(X_test)
    logging.getLogger(__name__).info(
        "Hold-out accuracy: %.3f", accuracy_score(y_test, y_pred)
    )

    return model, scaler
|
utils/data.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def get_cleaned_data(csv_path='German Credit Data.csv'):
    """Load the German credit dataset and derive a rule-based risk label.

    Steps: fill missing account values, drop the exported index column,
    label-encode the two account columns, one-hot encode Purpose/Sex/Housing/
    Job, then score each row and label it high risk when the score is >= 3.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        An all-integer DataFrame with the encoded features and a
        ``credit_risk`` column (1 = high risk, 0 = low risk).
    """
    df = pd.read_csv(csv_path)

    # Fill missing values with explicit "no account" labels.
    df['Saving accounts'] = df['Saving accounts'].fillna('No Savings')
    df['Checking account'] = df['Checking account'].fillna('No Checking')
    # The exported index column carries no information.
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # Take the "no account" flags from the labels themselves, so they do not
    # depend on which integer code the encoding happens to assign.
    no_savings = (df['Saving accounts'] == 'No Savings').astype(int)
    no_checking = (df['Checking account'] == 'No Checking').astype(int)

    # Encoding categorical variables: sorted category names -> 0..n-1
    # (the same numbering a label encoder fitted on the column produces).
    for col in ('Saving accounts', 'Checking account'):
        df[col] = df[col].astype('category').cat.codes

    # One-hot encoding.
    df = pd.get_dummies(df, columns=['Purpose', 'Sex', 'Housing', 'Job']).astype(int)

    def flag(column):
        """Return the 0/1 indicator for *column*, or 0 if it is absent."""
        return (df[column] == 1).astype(int) if column in df.columns else 0

    # Scoring system: one point per risk factor. Each optional term is
    # resolved on its own; chaining `x if c else 0 + ...` inline would make
    # the first conditional swallow every term after it.
    risk_score = (
        (df['Credit amount'] > 5000).astype(int)
        + (df['Duration'] > 24).astype(int)
        + no_savings
        + no_checking
        + flag('Purpose_radio/TV')
        + flag('Housing_rent')
        + flag('Job_0')
    )

    # Set threshold: if risk_score >= 3, high risk (1), else low risk (0).
    df['credit_risk'] = (risk_score >= 3).astype(int)
    return df
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|