Spaces:
Sleeping
Sleeping
upload app
Browse files- .gitattributes +7 -0
- app.py +146 -0
- credit_models.py +95 -0
- data/external/.gitkeep +0 -0
- data/interim/.gitkeep +0 -0
- data/interim/modifying_data.py +40 -0
- data/interim/train_clean_type.csv +3 -0
- data/processed/.gitkeep +0 -0
- data/processed/v2/train_balanced_mixed.csv +3 -0
- data/processed/v2/train_balanced_synthetic.csv +3 -0
- data/processed/v2/train_balanced_synthetic2.csv +3 -0
- data/processed/v4/real_test_data.csv +0 -0
- data/processed/v4/real_train_data.csv +3 -0
- data/processed/v4/synthetic_train_data.csv +3 -0
- data/raw/.gitkeep +0 -0
- data/raw/train_clean.csv +3 -0
- data_generation.py +23 -0
- data_preprocessing.py +68 -0
- models/.gitkeep +0 -0
- models/v2/model_good.pkl +3 -0
- models/v2/model_poor.pkl +3 -0
- models/v2/model_standard.pkl +3 -0
- models/v4/synth_good.pkl +3 -0
- models/v4/synth_poor.pkl +3 -0
- models/v4/synth_standard.pkl +3 -0
- visualization.py +262 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/interim/train_clean_type.csv filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/processed/v2/train_balanced_mixed.csv filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/processed/v2/train_balanced_synthetic.csv filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
data/processed/v2/train_balanced_synthetic2.csv filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
data/processed/v4/real_train_data.csv filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
data/processed/v4/synthetic_train_data.csv filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
data/raw/train_clean.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
from data_generation import generate_synthetic_training_data
|
| 5 |
+
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
|
| 6 |
+
from credit_models import real_data_credit_model, synthetic_data_credit_model
|
| 7 |
+
from visualization import (
|
| 8 |
+
plot_feature_distributions,
|
| 9 |
+
plot_comparative_credit_score_distribution,
|
| 10 |
+
plot_comparison_table,
|
| 11 |
+
plot_comparative_confusion_matrices,
|
| 12 |
+
plot_comparative_credit_score_distribution_by_actual_class,
|
| 13 |
+
get_metrics_df,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# Shared plotting constants: one colour per credit-score class.
COLOR_MAP = {
    'Good': '#28B463',
    'Standard': '#F1C40F',
    'Poor': '#E74C3C',
}

LABEL_ORDER = ['Good', 'Standard', 'Poor']
TARGET = 'Credit_Score'

# Load and preprocess real data once at startup.
# BUG FIX: app.py lives at the repository root next to data/, and a Space runs
# it from the root, so '../data/...' resolved outside the repo. Paths are now
# root-relative.
real_train = pd.read_csv('data/processed/v4/real_train_data.csv')
real_test = pd.read_csv('data/processed/v4/real_test_data.csv')

X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
    real_train, real_test, TARGET
)

# Train the real-data baseline once at startup; the synthetic model is
# retrained on every button click (see run_analysis).
real_scores, real_classification = real_data_credit_model(
    X_real_train, y_real_train, X_real_test
)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def run_analysis():
    """Generate new synthetic data, train the synthetic model, and return all comparison plots.

    Returns:
        tuple: (feature-distribution figure, score-distribution figure,
            score-by-actual-class figure, metrics-table figure,
            confusion-matrix figure) — exactly the five figures wired to the
            Gradio outputs.
    """
    # One third of the real training size per class: the generator samples the
    # three classes separately, so the totals roughly match the real set.
    synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train) / 3))
    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)

    fig_feature_dist = plot_feature_distributions(
        X_real_train, X_synth_train
    )

    synth_scores, synth_classification = synthetic_data_credit_model(
        X_synth_train, y_synth_train, X_real_test
    )

    fig_score_dist = plot_comparative_credit_score_distribution(
        real_scores, synth_scores
    )
    fig_score_by_class = plot_comparative_credit_score_distribution_by_actual_class(
        y_real_test, real_scores, synth_scores,
        color_map=COLOR_MAP,
        label_order=LABEL_ORDER,
    )
    fig_metrics = plot_comparison_table(
        y_real_test, real_classification, synth_classification
    )
    fig_cm = plot_comparative_confusion_matrices(
        y_real_test, real_classification, synth_classification,
        labels=LABEL_ORDER,
    )

    # BUG FIX: run_btn.click wires exactly five outputs, but the original also
    # returned an unused rounded metrics DataFrame as a sixth value, which
    # makes Gradio raise a return-count mismatch on every click.
    return fig_feature_dist, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
# ---- Dashboard layout -------------------------------------------------------
# A single-column Gradio Blocks app: one trigger button followed by five plot
# panels, each preceded by a Markdown heading explaining what it shows.
with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
    # Credit Score Model Dashboard
    Compare a **Real-Data Model** vs a **Synthetic-Data Model** trained with CTGAN-generated data.
    Click the button to regenerate synthetic data and retrain the synthetic model.
    """
    )

    # Single entry point: regenerates synthetic data and refreshes every plot.
    run_btn = gr.Button(
        "Generate New Synthetic Data & Analyze", variant="primary", size="lg"
    )

    gr.Markdown(
        """
    ## Feature Distribution Comparison

    Below are the distributions of the features in the real vs synthetic training datasets.
    """
    )

    with gr.Row():
        plot_feature_dist = gr.Plot(label='')


    gr.Markdown(
        """
    ## Credit Models Metrics

    Below are the metrics for the real-data and synthetic-data models.
    """
    )

    with gr.Row():
        plot_metrics = gr.Plot(label='')

    gr.Markdown(
        """
    ## Credit Score Distribution Comparison
    Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
    """
    )

    with gr.Row():
        plot_score_dist = gr.Plot(label='')

    gr.Markdown(
        """
    ## Credit Score Distribution by Actual Class
    Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
    """
    )

    with gr.Row():
        plot_score_by_class = gr.Plot(label='')

    gr.Markdown(
        """
    ## Confusion Matrix Comparison
    Below are the confusion matrices for the real-data and synthetic-data models.
    """
    )

    with gr.Row():
        plot_cm = gr.Plot(label='')

    # Wire the button to run_analysis; this expects run_analysis to return
    # exactly these five figures, in this order.
    run_btn.click(
        fn=run_analysis,
        inputs=[],
        outputs=[plot_feature_dist, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
    )

demo.launch()
|
credit_models.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.linear_model import LogisticRegression
|
| 2 |
+
import pandas as pd
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def compute_credit_score(
    coef_dict,
    num_credit_card, changed_credit_limit, delay_from_due_date, interest_rate,
    outstanding_debt, credit_mix_good, credit_mix_standard
):
    """Compute a linear credit score from logistic-regression coefficients.

    The score is the weighted sum of the input features, using the weights in
    ``coef_dict`` keyed by the original training-column names.

    Args:
        coef_dict (dict): Feature-name -> logistic-regression coefficient.
        num_credit_card (float): Number of credit cards.
        changed_credit_limit (float): Change in credit limit.
        delay_from_due_date (float): Delay from the due date.
        interest_rate (float): Interest rate.
        outstanding_debt (float): Outstanding debt.
        credit_mix_good (float): Good credit-mix indicator.
        credit_mix_standard (float): Standard credit-mix indicator.
    Returns:
        float: The weighted-sum credit score.
    """
    # Pair each coefficient name with its feature value; summing the products
    # in this order matches the original chained-addition expression exactly.
    contributions = (
        ('Num_Credit_Card', num_credit_card),
        ('Changed_Credit_Limit', changed_credit_limit),
        ('Delay_from_due_date', delay_from_due_date),
        ('Interest_Rate', interest_rate),
        ('Outstanding_Debt', outstanding_debt),
        ('Credit_Mix_Good', credit_mix_good),
        ('Credit_Mix_Standard', credit_mix_standard),
    )
    return sum(coef_dict[name] * value for name, value in contributions)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def real_data_credit_model(X_train, y_train, X_test):
    """Train a logistic regression model on real data and score the test set.

    Args:
        X_train (pd.DataFrame): The training features.
        y_train (pd.Series): The training target variable.
        X_test (pd.DataFrame): The testing features.
    Returns:
        tuple: (pd.Series of linear credit scores with a fresh RangeIndex,
            array of predicted class labels for X_test).
    """
    model = LogisticRegression(
        max_iter=1000,
        class_weight='balanced'
    )
    model.fit(X_train, y_train)

    # NOTE(review): for a 3-class target, coef_ has one row per class; row 0
    # ties the score to whichever class sklearn orders first — confirm this is
    # the intended scoring axis.
    coef_dict = dict(zip(X_train.columns, model.coef_[0]))

    # PERF: vectorised replacement for the original per-row iterrows() loop.
    # Summing column-by-column in the same term order gives numerically
    # identical results while avoiding a Python-level loop over every row.
    feature_order = [
        'Num_Credit_Card', 'Changed_Credit_Limit', 'Delay_from_due_date',
        'Interest_Rate', 'Outstanding_Debt', 'Credit_Mix_Good', 'Credit_Mix_Standard',
    ]
    score = sum(coef_dict[col] * X_test[col] for col in feature_order)
    # Rebuild with a default RangeIndex, matching the original list-built Series.
    score = pd.Series(score.to_numpy())

    classification = model.predict(X_test)

    return score, classification
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def synthetic_data_credit_model(X_train, y_train, X_test):
    """Train a logistic regression model on synthetic data and score the real test set.

    Args:
        X_train (pd.DataFrame): The synthetic training features.
        y_train (pd.Series): The synthetic training target variable.
        X_test (pd.DataFrame): The real testing features.
    Returns:
        tuple: (pd.Series of linear credit scores with a fresh RangeIndex,
            array of predicted class labels for X_test).
    """
    # NOTE(review): unlike real_data_credit_model this omits
    # class_weight='balanced' — presumably because the synthetic data is
    # generated class-balanced; confirm that is intentional.
    model = LogisticRegression(
        max_iter=1_000,
    )
    model.fit(X_train, y_train)

    # Row 0 of the multiclass coefficient matrix, keyed by training columns
    # (see the review note in real_data_credit_model).
    coef_dict = dict(zip(X_train.columns, model.coef_[0]))

    # PERF: vectorised replacement for the original per-row iterrows() loop;
    # same term order, numerically identical result.
    feature_order = [
        'Num_Credit_Card', 'Changed_Credit_Limit', 'Delay_from_due_date',
        'Interest_Rate', 'Outstanding_Debt', 'Credit_Mix_Good', 'Credit_Mix_Standard',
    ]
    score = sum(coef_dict[col] * X_test[col] for col in feature_order)
    # Rebuild with a default RangeIndex, matching the original list-built Series.
    score = pd.Series(score.to_numpy())

    classification = model.predict(X_test)

    return score, classification
|
data/external/.gitkeep
ADDED
|
File without changes
|
data/interim/.gitkeep
ADDED
|
File without changes
|
data/interim/modifying_data.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd

# Expand the comma/"and"-separated Type_of_Loan column into one binary column
# per loan type.
# NOTE(review): this file lives in data/interim/, from where "../data/raw/..."
# would not resolve — presumably the script is run from a sibling of data/;
# verify the working directory before reusing.
df_original = pd.read_csv("../data/raw/train_clean.csv")
|
| 5 |
+
|
| 6 |
+
def clean_loans(text):
    """Parse a raw Type_of_Loan cell into a sorted list of unique loan names.

    Handles NaN cells, the " and " conjunction used before the last item,
    surrounding whitespace, and empty fragments.

    Args:
        text: Raw cell value (str or NaN).
    Returns:
        list[str]: Sorted unique loan-type names; empty list for NaN/empty input.
    """
    if pd.isna(text):
        return []

    # Normalise the "and" conjunction to a comma so one split handles every
    # separator (", and " also collapses, leaving an empty fragment we drop).
    text = text.replace(" and ", ", ")
    loans = [l.strip() for l in text.split(",")]

    loans = [l for l in loans if l != ""]
    # BUG FIX: the original returned list(set(...)), whose iteration order
    # varies between runs, so the derived binary columns appeared in a
    # nondeterministic order. Sorting makes the output reproducible.
    return sorted(set(loans))
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Parse every Type_of_Loan cell into a list of unique loan names.
df_original["Loan_List"] = df_original["Type_of_Loan"].apply(clean_loans)


# Collect the full vocabulary of loan types across the dataset.
all_loans = set()
for row in df_original["Loan_List"]:
    all_loans.update(row)

print(all_loans)

# One binary indicator column per loan type (one-hot over a multi-label field).
# The loan name is bound as a default argument to avoid the classic
# late-binding-closure pitfall if this apply were ever deferred.
for loan in all_loans:
    df_original[loan] = df_original["Loan_List"].apply(lambda x, l=loan: int(l in x))

# Drop the raw text column and the intermediate list column.
df_original = df_original.drop(columns=["Type_of_Loan", "Loan_List"])

# Save the expanded dataset next to this script.
output_path = "./train_clean_type.csv"
df_original.to_csv(output_path, index=False)

# BUG FIX: tidied the typo'd / stray-space status messages.
print(f"File saved to: {output_path}")
print(f"shape: {df_original.shape}")
print("New columns added:", list(all_loans))
|
data/interim/train_clean_type.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:537dc0040723321f7b968d65fd49d5ddf666a01a2cfc935d76ca00bc26731d47
|
| 3 |
+
size 16601166
|
data/processed/.gitkeep
ADDED
|
File without changes
|
data/processed/v2/train_balanced_mixed.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b741acf7cc7512f40e0013e32b4e77911763ba07a53650bd27236615ed30df1f
|
| 3 |
+
size 27855735
|
data/processed/v2/train_balanced_synthetic.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ac8bd3b577f29b3752a7cc64a6f9c960280e565bdf31fbb225df33947811084
|
| 3 |
+
size 18773470
|
data/processed/v2/train_balanced_synthetic2.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:282f24dc0094bc5545000890516662cd3bba2c25c134612d8104aa5762ba718d
|
| 3 |
+
size 18774085
|
data/processed/v4/real_test_data.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/processed/v4/real_train_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d17e48e95a445d0829bd6884db9a39472792e93e0e5130fcd7040bb6d95daccc
|
| 3 |
+
size 12081755
|
data/processed/v4/synthetic_train_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0addf8d57b4ef9b0b32db4b53382a29f787e47cfb5cbac0359127cfb1b8ca66b
|
| 3 |
+
size 18017035
|
data/raw/.gitkeep
ADDED
|
File without changes
|
data/raw/train_clean.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:afe6a49d21938d60d482b326b821cea6dcfa41f55f7f0cf15a3a517bf590403c
|
| 3 |
+
size 20561019
|
data_generation.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sdv.single_table import CTGANSynthesizer
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def generate_synthetic_training_data(n=30_000):
    """Generate synthetic training data from per-class pre-trained CTGAN models.

    Args:
        n (int, optional): Number of samples to draw per credit-score class
            (total rows = 3 * n). Defaults to 30_000.
    Returns:
        pd.DataFrame: Shuffled synthetic training data covering all three classes.
    """
    # BUG FIX: this module is imported by app.py, which runs from the
    # repository root where models/ lives, so '../models/...' resolved outside
    # the repo. Paths are now root-relative.
    good_generator = CTGANSynthesizer.load("models/v4/synth_good.pkl")
    poor_generator = CTGANSynthesizer.load("models/v4/synth_poor.pkl")
    standard_generator = CTGANSynthesizer.load("models/v4/synth_standard.pkl")

    synth_good = good_generator.sample(n)
    synth_poor = poor_generator.sample(n)
    synth_standard = standard_generator.sample(n)

    # Concatenate the three class-specific samples, then shuffle so downstream
    # training never sees the rows in class-sorted order.
    full_data = pd.concat([synth_good, synth_poor, synth_standard], ignore_index=True)
    shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
    return shuffled_data
|
data_preprocessing.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def preprocess_real_data(train, test, target):
    """Preprocess the real train/test datasets: select features, rescale debt,
    and one-hot encode Credit_Mix.

    Args:
        train (pd.DataFrame): The real training dataset.
        test (pd.DataFrame): The real testing dataset.
        target (str): The name of the target variable.
    Returns:
        tuple: (X_train, y_train, X_test, y_test).
    """
    cols = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target
    ]

    # BUG FIX: work on copies so the caller's DataFrames are not mutated — the
    # original divided Outstanding_Debt in place, so calling this twice would
    # rescale the same column twice.
    train = train[cols].copy()
    test = test[cols].copy()

    # Express debt in thousands to keep its scale comparable to other features.
    train['Outstanding_Debt'] = train['Outstanding_Debt'] / 1000
    test['Outstanding_Debt'] = test['Outstanding_Debt'] / 1000

    train = pd.get_dummies(train, columns=['Credit_Mix'], drop_first=True)
    test = pd.get_dummies(test, columns=['Credit_Mix'], drop_first=True)
    # Robustness: if a Credit_Mix category is absent from one split, the dummy
    # columns diverge — align the test frame to the training columns.
    test = test.reindex(columns=train.columns, fill_value=0)

    X_real_train = train.drop(columns=[target])
    y_real_train = train[target]

    X_real_test = test.drop(columns=[target])
    y_real_test = test[target]

    return X_real_train, y_real_train, X_real_test, y_real_test
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def preprocess_synthetic_data(synthetic_data, target):
    """Preprocess a synthetic dataset: select features, rescale debt, and
    one-hot encode Credit_Mix.

    Args:
        synthetic_data (pd.DataFrame): The synthetic dataset to preprocess.
        target (str): The name of the target variable.
    Returns:
        tuple: (X_synthetic_train, y_synthetic_train).
    """
    keep = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target
    ]
    # BUG FIX: copy first so the caller's DataFrame is not modified in place
    # (the original rescaled Outstanding_Debt on the caller's frame).
    synthetic_data = synthetic_data[keep].copy()

    # Express debt in thousands, matching preprocess_real_data.
    synthetic_data['Outstanding_Debt'] = synthetic_data['Outstanding_Debt'] / 1000

    synthetic_data = pd.get_dummies(synthetic_data, columns=['Credit_Mix'], drop_first=True)

    X_synthetic_train = synthetic_data.drop(columns=[target])
    y_synthetic_train = synthetic_data[target]

    return X_synthetic_train, y_synthetic_train
|
models/.gitkeep
ADDED
|
File without changes
|
models/v2/model_good.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:709c2649ce4180a137e34382ae3239a8f6b69e7d1e3371434865bd6879eb7ed9
|
| 3 |
+
size 4194460
|
models/v2/model_poor.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af85d29197a3d4845b8534605c4ccd848b9e27de189b003889e361fc4a03a902
|
| 3 |
+
size 5667880
|
models/v2/model_standard.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:af53dccff9f076a49ef6b2fcf0dc91fdab2815ed8d8fffcb693fdd6eab250bd2
|
| 3 |
+
size 8651019
|
models/v4/synth_good.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9beb0dc022c97a97f3815a8038258b8269c1c2882bbc851d280ddbf2d3e0dca
|
| 3 |
+
size 2458679
|
models/v4/synth_poor.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:30af7c40ed01cf22acf77a33bce931dbe166e7ee25890e38e3feb27019140467
|
| 3 |
+
size 2951615
|
models/v4/synth_standard.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6d5e4bb29162a02f59f25bd8f4db5ebd286b9844eef20e0067641a20a911d6a
|
| 3 |
+
size 3941298
|
visualization.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt
|
| 2 |
+
import seaborn as sns
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
|
| 5 |
+
|
| 6 |
+
# Dark theme matching the Gradio dashboard: slate figure background,
# near-black axes, and white text/labels/ticks on every figure this module
# produces.
plt.rcParams['figure.facecolor'] = '#1F2937'
plt.rcParams['axes.facecolor'] = '#0B0F19'
plt.rcParams['text.color'] = 'white'
plt.rcParams['axes.labelcolor'] = 'white'
plt.rcParams['xtick.color'] = 'white'
plt.rcParams['ytick.color'] = 'white'
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def plot_feature_distributions(real_data, synthetic_data):
    """Plot overlaid real-vs-synthetic histograms for every feature.

    Args:
        real_data (pd.DataFrame): Real training features.
        synthetic_data (pd.DataFrame): Synthetic training features with the
            same columns as real_data.
    Returns:
        matplotlib.figure.Figure: Grid of per-feature comparison histograms.
    """
    features = real_data.columns.to_list()

    n_cols = 3
    n_rows = (len(features) + n_cols - 1) // n_cols  # ceil division

    # BUG FIX: squeeze=False keeps axes 2-D even when there is only one row,
    # so axes[row, col] indexing works for any feature count (the original
    # crashed for three or fewer features).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)

    for i, feature in enumerate(features):
        ax = axes[i // n_cols, i % n_cols]

        sns.histplot(
            real_data[feature],
            bins=30,
            color='skyblue',
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            ax=ax
        )

        sns.histplot(
            synthetic_data[feature],
            bins=30,
            color='indianred',
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            ax=ax
        )

        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        ax.legend(['Real Data', 'Synthetic Data'])

    # Remove the unused axes in the final, partially-filled row.
    for j in range(len(features), n_rows * n_cols):
        fig.delaxes(axes[j // n_cols, j % n_cols])

    plt.tight_layout()
    return fig
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def get_metrics_df(y_true, y_real_pred, y_synth_pred):
    """Build a metrics table comparing the two models on the same test labels.

    Args:
        y_true: Actual class labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
    Returns:
        pd.DataFrame: Columns Model, Accuracy, Precision, Recall, F1-Score;
        one row per model (weighted averaging for the multiclass metrics).
    """
    scorers = [
        ('Accuracy', lambda t, p: accuracy_score(t, p)),
        ('Precision', lambda t, p: precision_score(t, p, average='weighted')),
        ('Recall', lambda t, p: recall_score(t, p, average='weighted')),
        ('F1-Score', lambda t, p: f1_score(t, p, average='weighted')),
    ]

    table = {'Model': ['Real Data Model', 'Synthetic Data Model']}
    # Evaluate each metric for both prediction sets, preserving column order.
    for name, scorer in scorers:
        table[name] = [scorer(y_true, y_real_pred), scorer(y_true, y_synth_pred)]

    return pd.DataFrame(table)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def plot_comparative_credit_score_distribution(
    real_scores,
    synth_scores,
    bins=50,
    title='Comparative Credit Score Distribution: Real vs Synthetic Models'
):
    """Plot side-by-side histograms of the two models' predicted credit scores.

    Args:
        real_scores (pd.Series): Scores from the real-data model.
        synth_scores (pd.Series): Scores from the synthetic-data model.
        bins (int): Histogram bin count.
        title (str): Overall figure title.
    Returns:
        matplotlib.figure.Figure
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=True)

    panels = (
        (axes[0], real_scores, 'Real-Data Model Score Distribution'),
        (axes[1], synth_scores, 'Synthetic-Data Model Score Distribution'),
    )
    for ax, scores, panel_title in panels:
        sns.histplot(
            scores,
            bins=bins,
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            color='skyblue',
            ax=ax
        )
        ax.set_title(panel_title)
        ax.set_xlabel('Predicted Credit Score')
        ax.set_ylabel('Frequency')

    # BUG FIX: the title parameter was accepted but never applied.
    fig.suptitle(title)

    plt.tight_layout()
    return fig
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def plot_comparison_table(
    y_true, y_real_pred, y_synth_pred,
    title='Model Comparison: Real Data vs Synthetic Data'
):
    """Render the real-vs-synthetic metrics table as a styled matplotlib figure.

    Args:
        y_true: Actual class labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
        title (str): Overall figure title.
    Returns:
        matplotlib.figure.Figure
    """
    metrics_df = get_metrics_df(y_true, y_real_pred, y_synth_pred)
    display_df = metrics_df.copy().round(4).set_index('Model')

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis('off')

    table = ax.table(
        cellText=display_df.values,
        rowLabels=display_df.index,
        colLabels=display_df.columns,
        cellLoc='center',
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.9)

    # Header row: blue background, bold white text, white grid lines.
    for j in range(len(display_df.columns)):
        table[(0, j)].set_facecolor('#1F77B4')
        table[(0, j)].set_text_props(color='white', weight='bold')
        table[(0, j)].set_edgecolor('white')
        table[(0, j)].set_linewidth(1)

    # Body rows. BUG FIX: the original "alternating" row-colour ternary had
    # identical branches, so it is collapsed to the single colour it produced.
    bg = '#0B0F19'
    for i in range(1, len(display_df.index) + 1):
        # Row-label cell sits at column index -1 in a matplotlib table.
        table[(i, -1)].set_text_props(color='white', weight='bold')
        table[(i, -1)].set_facecolor(bg)
        table[(i, -1)].set_edgecolor('white')
        table[(i, -1)].set_linewidth(1)

        for j in range(len(display_df.columns)):
            table[(i, j)].set_facecolor(bg)
            table[(i, j)].set_text_props(color='white')
            table[(i, j)].set_edgecolor('white')
            table[(i, j)].set_linewidth(1)

    # BUG FIX: the title parameter was accepted but never applied.
    fig.suptitle(title)

    plt.tight_layout()
    return fig
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def plot_comparative_confusion_matrices(
    y_true,
    y_pred_real,
    y_pred_synth,
    labels=None,
    normalize=False,
    cmap='Blues'
):
    """Draw the real-data and synthetic-data confusion matrices side by side.

    Args:
        y_true: Actual class labels.
        y_pred_real: Predictions from the real-data model.
        y_pred_synth: Predictions from the synthetic-data model.
        labels: Class label order for both axes (passed to confusion_matrix).
        normalize (bool): If True, row-normalise each matrix so every row shows
            a distribution over predicted classes.
        cmap (str): Heatmap colour map.
    Returns:
        matplotlib.figure.Figure
    """
    matrices = [
        confusion_matrix(y_true, y_pred_real, labels=labels),
        confusion_matrix(y_true, y_pred_synth, labels=labels),
    ]

    if normalize:
        matrices = [m.astype(float) / m.sum(axis=1, keepdims=True) for m in matrices]
        fmt = '.2f'
    else:
        fmt = 'd'

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for ax, cm, source in zip(axes, matrices, ('Real Data', 'Synthetic Data')):
        sns.heatmap(
            cm, annot=True, fmt=fmt, cmap=cmap,
            xticklabels=labels, yticklabels=labels, ax=ax
        )
        ax.set_title(f"{source} Confusion Matrix")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

    plt.tight_layout()
    return fig
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def plot_comparative_credit_score_distribution_by_actual_class(
    y_true,
    real_scores,
    synth_scores,
    color_map,
    label_order=None,
    bins=50,
):
    """Overlay per-actual-class score histograms for both models, side by side.

    Args:
        y_true: Actual class labels, positionally aligned with both score Series.
        real_scores (pd.Series): Real-data model scores.
        synth_scores (pd.Series): Synthetic-data model scores.
        color_map (dict): Class label -> colour.
        label_order (list | None): Class plotting order; defaults to the order
            of first appearance in y_true.
        bins (int): Histogram bin count.
    Returns:
        matplotlib.figure.Figure
    """
    y_true_arr = pd.Series(y_true).values

    # BUG FIX: label_order defaulted to None but was iterated unconditionally,
    # raising TypeError unless every caller supplied it explicitly.
    if label_order is None:
        label_order = list(pd.Series(y_true).unique())

    fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16, 5), sharey=True)

    for label in label_order:
        # Positional boolean mask selecting this actual class's rows.
        mask = (y_true_arr == label)

        for ax, scores in ((ax_left, real_scores), (ax_right, synth_scores)):
            sns.histplot(
                scores[mask],
                bins=bins,
                stat='count',
                element='step',
                fill=True,
                alpha=0.2,
                color=color_map.get(label, None),
                label=label,
                ax=ax
            )

    ax_left.set_title('Real-Data Model: Actual Class Distribution')
    ax_left.set_xlabel('Predicted Credit Score')
    ax_left.set_ylabel('Frequency')
    ax_left.legend(title='Actual Class')

    ax_right.set_title('Synthetic-Data Model: Actual Class Distribution')
    ax_right.set_xlabel('Predicted Credit Score')
    ax_right.set_ylabel('Frequency')
    ax_right.legend(title='Actual Class')

    plt.tight_layout()
    return fig
|