PhunvVi committed
Commit 067ea44 · verified · 1 Parent(s): facfa27

Upload 9 files

app.py ADDED
@@ -0,0 +1,133 @@
+ import gradio as gr
+ import pickle
+ import os
+ import numpy as np
+ import pandas as pd
+ from explainer import get_shap_values
+
+ # Load the trained model
+ with open("model.pkl", "rb") as f:
+     model = pickle.load(f)
+
+ # Load the scaler saved alongside the model
+ # (create_model.py standardizes all features before training, so the same
+ # scaling must be applied at inference time)
+ with open("scaler.pkl", "rb") as f:
+     scaler = pickle.load(f)
+
+ # List of columns used during training (order must match the training data)
+ columns = ['Age', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration',
+            'Purpose_business', 'Purpose_car', 'Purpose_domestic appliances', 'Purpose_education', 'Purpose_furniture/equipment',
+            'Purpose_radio/TV', 'Purpose_repairs', 'Purpose_vacation/others', 'Sex_female', 'Sex_male',
+            'Housing_free', 'Housing_own', 'Housing_rent', 'Job_0', 'Job_1', 'Job_2', 'Job_3']
+
+ # Categorical mappings; these must match the LabelEncoder codes produced in
+ # utils/data.py (alphabetical order, so 'No Savings'/'No Checking' come first)
+ saving_account_map = {'No Savings': 0, 'little': 1, 'moderate': 2, 'quite rich': 3, 'rich': 4}
+ checking_account_map = {'No Checking': 0, 'little': 1, 'moderate': 2, 'rich': 3}
+
+ purpose_options = ['business', 'car', 'domestic appliances', 'education', 'furniture/equipment', 'radio/TV', 'repairs', 'vacation/others']
+ sex_options = ['female', 'male']
+ housing_options = ['free', 'own', 'rent']
+ job_options = [0, 1, 2, 3]
+
+ def to_scalar(val):
+     # If val is a numpy array, take the first element; otherwise return it as a float
+     if isinstance(val, np.ndarray):
+         return float(val.flatten()[0])
+     return float(val)
+
+ def predict_and_explain(age, saving_account, checking_account, credit_amount, duration, purpose, sex, housing, job):
+     # Build a single-row DataFrame with all columns set to 0
+     input_data = pd.DataFrame([[0] * len(columns)], columns=columns)
+     input_data['Age'] = age
+     input_data['Saving accounts'] = saving_account_map.get(saving_account, 0)
+     input_data['Checking account'] = checking_account_map.get(checking_account, 0)
+     input_data['Credit amount'] = credit_amount
+     input_data['Duration'] = duration
+
+     # One-hot encoding for purpose
+     for p in purpose_options:
+         input_data[f'Purpose_{p}'] = 1 if purpose == p else 0
+     # One-hot encoding for sex
+     for s in sex_options:
+         input_data[f'Sex_{s}'] = 1 if sex == s else 0
+     # One-hot encoding for housing
+     for h in housing_options:
+         input_data[f'Housing_{h}'] = 1 if housing == h else 0
+     # One-hot encoding for job
+     for j in job_options:
+         input_data[f'Job_{j}'] = 1 if job == j else 0
+
+     # Apply the training-time standardization, then predict
+     scaled_input = pd.DataFrame(scaler.transform(input_data), columns=columns)
+     pred = model.predict(scaled_input)[0]
+     prob = model.predict_proba(scaled_input)[0][int(pred)]
+     risk = "Low Risk" if pred == 0 else "High Risk"
+     prediction_text = f"{risk} (Confidence: {prob:.2f})"
+
+     # Save flagged (high-risk) input
+     if risk == "High Risk":
+         flagged_file = "flagged_inputs.csv"
+         # Save the raw user input, not the one-hot encoded row
+         user_row = {
+             "Age": age,
+             "Saving accounts": saving_account,
+             "Checking account": checking_account,
+             "Credit amount": credit_amount,
+             "Duration": duration,
+             "Purpose": purpose,
+             "Sex": sex,
+             "Housing": housing,
+             "Job": job
+         }
+         df_flag = pd.DataFrame([user_row])
+         if not os.path.exists(flagged_file):
+             df_flag.to_csv(flagged_file, index=False)
+         else:
+             df_flag.to_csv(flagged_file, mode='a', header=False, index=False)
+
+     # Get SHAP values (get_shap_values returns a (shap_values, explainer) tuple)
+     shap_values, _ = get_shap_values(model, scaled_input)
+
+     # Feature importances for this prediction
+     shap_vals = shap_values[0] if hasattr(shap_values, '__getitem__') else shap_values
+     feature_importance = sorted(
+         zip(input_data.columns, shap_vals),
+         key=lambda x: abs(x[1]),
+         reverse=True
+     )
+     # Top 5 most important features
+     top_features = feature_importance[:5]
+     importance_text = "Top features for this prediction:\n" + "\n".join(
+         [f"{feat}: {to_scalar(val):.3f}" for feat, val in top_features]
+     )
+
+     return prediction_text, importance_text
+
+ # Gradio UI
+ demo = gr.Interface(
+     fn=predict_and_explain,
+     inputs=[
+         gr.Number(label="Age", info="Applicant's age in years"),
+         gr.Radio(list(saving_account_map.keys()), label="Saving accounts", info="Level of savings"),
+         gr.Radio(list(checking_account_map.keys()), label="Checking account", info="Checking account balance level"),
+         gr.Number(label="Credit amount", info="Requested credit amount (numeric)"),
+         gr.Number(label="Duration", info="Loan duration in months"),
+         gr.Radio(purpose_options, label="Purpose", info="Purpose of the credit"),
+         gr.Radio(sex_options, label="Sex"),
+         gr.Radio(housing_options, label="Housing", info="Own, rent, or free housing"),
+         gr.Radio(job_options, label="Job", info="0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled")
+     ],
+     outputs=[
+         gr.Text(label="Credit Risk Prediction"),
+         gr.Text(label="Top Features for This Prediction")
+     ],
+     title="Credit Risk Predictor",
+     description="Enter applicant information to assess the credit risk level and see a SHAP-based explanation."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
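For reviewers, a minimal sketch (not part of the commit) of how the prediction path can be exercised without launching the Gradio UI; it assumes model.pkl, scaler.pkl, and the listed dependencies are available next to app.py.

```python
# Quick smoke test of app.predict_and_explain outside the UI.
# Assumes model.pkl and scaler.pkl are in the current working directory.
from app import predict_and_explain

prediction, explanation = predict_and_explain(
    age=35,
    saving_account="little",
    checking_account="moderate",
    credit_amount=3000,
    duration=24,
    purpose="car",
    sex="male",
    housing="own",
    job=2,
)
print(prediction)   # e.g. "Low Risk (Confidence: 0.87)"
print(explanation)  # top-5 SHAP features for this prediction
```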
explainer.py ADDED
@@ -0,0 +1,21 @@
+ import shap
+
+
+ def get_shap_values(model, input_data):
+     """
+     Return SHAP values for the given input DataFrame using the trained model.
+
+     model: fitted tree-based model (the XGBoost classifier loaded from model.pkl)
+     input_data: pandas DataFrame, preprocessed to match the model's training columns
+     Returns: (shap_values, explainer)
+     """
+     explainer = shap.TreeExplainer(model)
+     shap_values = explainer.shap_values(input_data)
+
+     return shap_values, explainer
model.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:71ca02c857c381c626c2b92b8ec821dffd6da117a9ee62c3df5b9a7921f5212f
+ size 98520
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ numpy
+ pandas
+ xgboost
+ scikit-learn
+ shap
scaler.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc40ce79f3fa825008b0b93345ce51f4393b05ff716ba0fba9c757cc96d66540
+ size 1410
utils/__pycache__/create_model.cpython-312.pyc ADDED
Binary file (1.48 kB).
 
utils/__pycache__/data.cpython-312.pyc ADDED
Binary file (2.25 kB).
 
utils/create_model.py ADDED
@@ -0,0 +1,32 @@
+ from sklearn.model_selection import train_test_split
+ import xgboost as xgb
+ from sklearn.preprocessing import StandardScaler
+ from sklearn.metrics import accuracy_score, classification_report
+ from utils.data import get_cleaned_data
+
+
+ def create_model():
+     df = get_cleaned_data()
+
+     # Feature selection
+     X = df.drop('credit_risk', axis=1)
+     y = df['credit_risk']
+
+     # Split data into training and testing sets
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+     # Standardize features (the same scaler must be applied at inference time)
+     scaler = StandardScaler()
+     X_train = scaler.fit_transform(X_train)
+     X_test = scaler.transform(X_test)
+
+     # Train model
+     model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
+     model.fit(X_train, y_train)
+
+     # Evaluate model
+     y_pred = model.predict(X_test)
+     print("Accuracy:", accuracy_score(y_test, y_pred))
+     print("Classification report:\n", classification_report(y_test, y_pred))
+
+     return model, scaler
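The commit ships model.pkl and scaler.pkl but no script that writes them; below is a plausible sketch (a hypothetical helper, not part of the upload) of how create_model's return values could be serialized into those files.

```python
# Hypothetical helper to (re)generate the committed artifacts from create_model().
# Assumes 'German Credit Data.csv' is available as expected by utils/data.py.
import pickle

from utils.create_model import create_model

if __name__ == "__main__":
    model, scaler = create_model()
    with open("model.pkl", "wb") as f:
        pickle.dump(model, f)
    with open("scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)
```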
utils/data.py ADDED
@@ -0,0 +1,47 @@
+ import pandas as pd
+ from sklearn.preprocessing import LabelEncoder
+
+
+ def get_cleaned_data():
+     df = pd.read_csv('German Credit Data.csv')
+
+     # Fill missing values
+     df['Saving accounts'] = df['Saving accounts'].fillna('No Savings')
+     df['Checking account'] = df['Checking account'].fillna('No Checking')
+     df = df.drop(columns='Unnamed: 0')
+
+     # Encode the ordinal categorical variables
+     label = LabelEncoder()
+     df['Saving accounts'] = label.fit_transform(df['Saving accounts'])
+     df['Checking account'] = label.fit_transform(df['Checking account'])
+
+     # One-hot encode the remaining categorical variables
+     df = pd.get_dummies(df, columns=['Purpose', 'Sex', 'Housing', 'Job']).astype(int)
+
+     # Scoring system: each risk factor adds one point
+     risk_score = (
+         (df['Credit amount'] > 5000).astype(int) +
+         (df['Duration'] > 24).astype(int) +
+         (df['Saving accounts'] == 0).astype(int) +    # 0 = 'No Savings' after label encoding
+         (df['Checking account'] == 0).astype(int) +   # 0 = 'No Checking' after label encoding
+         ((df['Purpose_radio/TV'] == 1).astype(int) if 'Purpose_radio/TV' in df.columns else 0) +
+         ((df['Housing_rent'] == 1).astype(int) if 'Housing_rent' in df.columns else 0) +
+         ((df['Job_0'] == 1).astype(int) if 'Job_0' in df.columns else 0)
+     )
+
+     # Threshold: risk_score >= 3 is labelled high risk (1), otherwise low risk (0)
+     df['credit_risk'] = (risk_score >= 3).astype(int)
+     return df