Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- app.py +35 -6
- data_generation.py +90 -4
- visualization.py +44 -0
app.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
from data_generation import generate_synthetic_training_data
|
| 5 |
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
|
| 6 |
from credit_models import real_data_credit_model, synthetic_data_credit_model
|
| 7 |
from visualization import (
|
|
@@ -10,6 +10,7 @@ from visualization import (
|
|
| 10 |
plot_comparison_table,
|
| 11 |
plot_comparative_confusion_matrices,
|
| 12 |
plot_comparative_credit_score_distribution_by_actual_class,
|
|
|
|
| 13 |
get_metrics_df,
|
| 14 |
)
|
| 15 |
|
|
@@ -23,8 +24,8 @@ LABEL_ORDER = ['Good', 'Standard', 'Poor']
|
|
| 23 |
TARGET = 'Credit_Score'
|
| 24 |
|
| 25 |
# Load and preprocess real data once at startup
|
| 26 |
-
real_train = pd.read_csv('data/processed/v4/real_train_data.csv')
|
| 27 |
-
real_test = pd.read_csv('data/processed/v4/real_test_data.csv')
|
| 28 |
|
| 29 |
X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
|
| 30 |
real_train, real_test, TARGET
|
|
@@ -35,16 +36,34 @@ real_scores, real_classification = real_data_credit_model(
|
|
| 35 |
X_real_train, y_real_train, X_real_test
|
| 36 |
)
|
| 37 |
|
| 38 |
-
|
| 39 |
def run_analysis():
|
| 40 |
"""Generate new synthetic data, train the synthetic model, and return all comparison plots."""
|
| 41 |
synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train)/3)) # Same number of samples as real training data
|
| 42 |
X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)
|
| 43 |
|
|
|
|
|
|
|
|
|
|
| 44 |
fig_feature_dist = plot_feature_distributions(
|
| 45 |
X_real_train, X_synth_train
|
| 46 |
)
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
synth_scores, synth_classification = synthetic_data_credit_model(
|
| 49 |
X_synth_train, y_synth_train, X_real_test
|
| 50 |
)
|
|
@@ -68,7 +87,7 @@ def run_analysis():
|
|
| 68 |
metrics_df = get_metrics_df(y_real_test, real_classification, synth_classification)
|
| 69 |
metrics_df = metrics_df.round(4)
|
| 70 |
|
| 71 |
-
return fig_feature_dist, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm, metrics_df
|
| 72 |
|
| 73 |
|
| 74 |
with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
|
|
@@ -95,6 +114,16 @@ with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as
|
|
| 95 |
with gr.Row():
|
| 96 |
plot_feature_dist = gr.Plot(label='')
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
gr.Markdown(
|
| 100 |
"""
|
|
@@ -140,7 +169,7 @@ with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as
|
|
| 140 |
run_btn.click(
|
| 141 |
fn=run_analysis,
|
| 142 |
inputs=[],
|
| 143 |
-
outputs=[plot_feature_dist, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
|
| 144 |
)
|
| 145 |
|
| 146 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
+
from data_generation import generate_synthetic_training_data, evaluate_synthetic_data
|
| 5 |
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
|
| 6 |
from credit_models import real_data_credit_model, synthetic_data_credit_model
|
| 7 |
from visualization import (
|
|
|
|
| 10 |
plot_comparison_table,
|
| 11 |
plot_comparative_confusion_matrices,
|
| 12 |
plot_comparative_credit_score_distribution_by_actual_class,
|
| 13 |
+
plot_evaluation_table,
|
| 14 |
get_metrics_df,
|
| 15 |
)
|
| 16 |
|
|
|
|
| 24 |
TARGET = 'Credit_Score'
|
| 25 |
|
| 26 |
# Load and preprocess real data once at startup
|
| 27 |
+
real_train = pd.read_csv('../data/processed/v4/real_train_data.csv')
|
| 28 |
+
real_test = pd.read_csv('../data/processed/v4/real_test_data.csv')
|
| 29 |
|
| 30 |
X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
|
| 31 |
real_train, real_test, TARGET
|
|
|
|
| 36 |
X_real_train, y_real_train, X_real_test
|
| 37 |
)
|
| 38 |
|
|
|
|
| 39 |
def run_analysis():
|
| 40 |
"""Generate new synthetic data, train the synthetic model, and return all comparison plots."""
|
| 41 |
synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train)/3)) # Same number of samples as real training data
|
| 42 |
X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)
|
| 43 |
|
| 44 |
+
categorical_cols = [col for col in X_real_train.columns if X_real_train[col].dtype in ["object", "bool", "uint8"]]
|
| 45 |
+
numeric_cols = [col for col in X_real_train.columns if X_real_train[col].dtype in ["int64", "float64"]]
|
| 46 |
+
|
| 47 |
fig_feature_dist = plot_feature_distributions(
|
| 48 |
X_real_train, X_synth_train
|
| 49 |
)
|
| 50 |
|
| 51 |
+
summary_rows = []
|
| 52 |
+
for cls in LABEL_ORDER:
|
| 53 |
+
real_cls = X_real_train[y_real_train == cls]
|
| 54 |
+
synth_cls = X_synth_train[y_synth_train == cls]
|
| 55 |
+
ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff = evaluate_synthetic_data(
|
| 56 |
+
real_cls, synth_cls, categorical_cols, numeric_cols
|
| 57 |
+
)
|
| 58 |
+
summary_rows.append({
|
| 59 |
+
"ks_passed": ks_pass_rate == 1.0,
|
| 60 |
+
"mean_ks_stat": round(mean_ks, 4),
|
| 61 |
+
"chi_passed": chi_pass_rate == 1.0 if chi_pass_rate is not None else None,
|
| 62 |
+
"mean_corr_diff": round(mean_corr_diff, 4),
|
| 63 |
+
})
|
| 64 |
+
summary_df = pd.DataFrame(summary_rows, index=LABEL_ORDER)
|
| 65 |
+
summary = plot_evaluation_table(summary_df)
|
| 66 |
+
|
| 67 |
synth_scores, synth_classification = synthetic_data_credit_model(
|
| 68 |
X_synth_train, y_synth_train, X_real_test
|
| 69 |
)
|
|
|
|
| 87 |
metrics_df = get_metrics_df(y_real_test, real_classification, synth_classification)
|
| 88 |
metrics_df = metrics_df.round(4)
|
| 89 |
|
| 90 |
+
return fig_feature_dist, summary, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm, metrics_df
|
| 91 |
|
| 92 |
|
| 93 |
with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
|
|
|
|
| 114 |
with gr.Row():
|
| 115 |
plot_feature_dist = gr.Plot(label='')
|
| 116 |
|
| 117 |
+
gr.Markdown(
|
| 118 |
+
"""
|
| 119 |
+
## Generated Data Quality Summary
|
| 120 |
+
|
| 121 |
+
Below is a summary of the data quality evaluation comparing the synthetic training data to the real training data across multiple metrics.
|
| 122 |
+
"""
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
with gr.Row():
|
| 126 |
+
plot_summary = gr.Plot(label='')
|
| 127 |
|
| 128 |
gr.Markdown(
|
| 129 |
"""
|
|
|
|
| 169 |
run_btn.click(
|
| 170 |
fn=run_analysis,
|
| 171 |
inputs=[],
|
| 172 |
+
outputs=[plot_feature_dist, plot_summary, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
|
| 173 |
)
|
| 174 |
|
| 175 |
demo.launch()
|
data_generation.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
import torch
|
| 2 |
import pandas as pd
|
| 3 |
from sdv.single_table import CTGANSynthesizer
|
|
|
|
|
|
|
| 4 |
|
| 5 |
# Patch torch.load to remap MPS tensors to CPU for environments without Apple Silicon
|
| 6 |
_original_torch_load = torch.load
|
|
@@ -20,9 +22,9 @@ def generate_synthetic_training_data(n=30_000):
|
|
| 20 |
Returns:
|
| 21 |
pd.DataFrame: The generated synthetic training data.
|
| 22 |
"""
|
| 23 |
-
good_generator = CTGANSynthesizer.load("models/v4/synth_good.pkl")
|
| 24 |
-
poor_generator = CTGANSynthesizer.load("models/v4/synth_poor.pkl")
|
| 25 |
-
standard_generator = CTGANSynthesizer.load("models/v4/synth_standard.pkl")
|
| 26 |
|
| 27 |
synth_good = good_generator.sample(n)
|
| 28 |
synth_poor = poor_generator.sample(n)
|
|
@@ -30,4 +32,88 @@ def generate_synthetic_training_data(n=30_000):
|
|
| 30 |
|
| 31 |
full_data = pd.concat([synth_good, synth_poor, synth_standard], ignore_index=True)
|
| 32 |
shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
|
| 33 |
-
return shuffled_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import torch
|
| 2 |
import pandas as pd
|
| 3 |
from sdv.single_table import CTGANSynthesizer
|
| 4 |
+
from scipy import stats
|
| 5 |
+
import numpy as np
|
| 6 |
|
| 7 |
# Patch torch.load to remap MPS tensors to CPU for environments without Apple Silicon
|
| 8 |
_original_torch_load = torch.load
|
|
|
|
| 22 |
Returns:
|
| 23 |
pd.DataFrame: The generated synthetic training data.
|
| 24 |
"""
|
| 25 |
+
good_generator = CTGANSynthesizer.load("../models/v4/synth_good.pkl")
|
| 26 |
+
poor_generator = CTGANSynthesizer.load("../models/v4/synth_poor.pkl")
|
| 27 |
+
standard_generator = CTGANSynthesizer.load("../models/v4/synth_standard.pkl")
|
| 28 |
|
| 29 |
synth_good = good_generator.sample(n)
|
| 30 |
synth_poor = poor_generator.sample(n)
|
|
|
|
| 32 |
|
| 33 |
full_data = pd.concat([synth_good, synth_poor, synth_standard], ignore_index=True)
|
| 34 |
shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
|
| 35 |
+
return shuffled_data
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def evaluate_synthetic_data(real_df, synthetic_df,
                            categorical_cols=None, numeric_cols=None):
    """Score how closely a synthetic table reproduces a real one.

    Three checks are computed:

    * a two-sample Kolmogorov-Smirnov test per numeric column,
    * a chi-square goodness-of-fit test per categorical column, with the
      real category frequencies as the expected distribution,
    * the mean absolute difference between the pairwise correlation
      matrices of the numeric columns (upper triangle only).

    A column "passes" a test when its p-value is above 0.05.

    Args:
        real_df: DataFrame holding the real data.
        synthetic_df: DataFrame holding the synthetic data; it must contain
            every column named in ``numeric_cols`` and ``categorical_cols``.
        categorical_cols: Columns to chi-square test. Defaults to the
            ``object`` dtype columns of ``real_df``.
        numeric_cols: Columns to KS test and correlate. Defaults to the
            numeric dtype columns of ``real_df``.

    Returns:
        tuple: ``(ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff)``.
        ``ks_pass_rate`` and ``mean_ks`` are NaN when there are no numeric
        columns, ``chi_pass_rate`` is ``None`` when there are no categorical
        columns, and ``mean_corr_diff`` is NaN when fewer than two numeric
        columns are available.
    """
    if numeric_cols is None:
        numeric_cols = real_df.select_dtypes(include="number").columns.tolist()
    if categorical_cols is None:
        categorical_cols = real_df.select_dtypes(include="object").columns.tolist()
    numeric_cols = list(numeric_cols)
    categorical_cols = list(categorical_cols)

    # ── KS Test ───────────────────────────────────────────────────────────
    ks_stats = []
    ks_passes = []
    for col in numeric_cols:
        stat, p_value = stats.ks_2samp(
            real_df[col].dropna(),
            synthetic_df[col].dropna(),
        )
        # The statistic is rounded before averaging, as before, so the
        # reported mean stays numerically identical to earlier runs.
        ks_stats.append(round(stat, 4))
        ks_passes.append(bool(p_value > 0.05))

    # ── Chi-Square Test ───────────────────────────────────────────────────
    chi_passes = []
    for col in categorical_cols:
        real_counts = real_df[col].value_counts()
        synth_counts = synthetic_df[col].value_counts()
        all_cats = real_counts.index.union(synth_counts.index)
        real_freq = real_counts.reindex(all_cats, fill_value=0)
        synth_freq = synth_counts.reindex(all_cats, fill_value=0)
        real_total = real_freq.sum()
        synth_total = synth_freq.sum()

        if real_total == 0 or synth_total == 0 or (real_freq == 0).any():
            # Either one side has no observations at all, or the synthetic
            # data contains a category the real data never shows. The
            # expected frequency would be zero, so the statistic is
            # undefined/infinite; report a failure explicitly instead of
            # relying on a division by zero.
            chi_passes.append(False)
            continue
        if len(all_cats) == 1:
            # A single shared category leaves zero degrees of freedom and
            # the test would return a NaN p-value; the two distributions
            # are trivially identical, so the column passes.
            chi_passes.append(True)
            continue

        # Rescale the synthetic counts to the real sample size so observed
        # and expected frequencies sum to the same total.
        f_exp = (real_freq / real_total) * real_total
        f_obs = (synth_freq / synth_total) * real_total
        _, p_value = stats.chisquare(f_obs=f_obs, f_exp=f_exp)
        chi_passes.append(bool(p_value > 0.05))

    # ── Correlation Matrix ────────────────────────────────────────────────
    if len(numeric_cols) < 2:
        # No off-diagonal entries exist to compare.
        mean_corr_diff = float("nan")
    else:
        real_corr = real_df[numeric_cols].corr()
        synth_corr = synthetic_df[numeric_cols].corr()
        corr_diff = (real_corr - synth_corr).abs()
        upper_idx = np.triu_indices_from(corr_diff.values, k=1)
        mean_corr_diff = corr_diff.values[upper_idx].mean()

    ks_pass_rate = float(np.mean(ks_passes)) if ks_passes else float("nan")
    mean_ks = float(np.mean(ks_stats)) if ks_stats else float("nan")
    chi_pass_rate = float(np.mean(chi_passes)) if chi_passes else None

    return ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def data_evaluation(
    real_list, synthetic_list, class_names=None, categorical_cols=None, numeric_cols=None
):
    """Build a per-class quality summary of synthetic versus real data.

    Each ``(real, synthetic)`` pair is scored with
    ``evaluate_synthetic_data`` and the four resulting figures are rounded
    to four decimals.

    Args:
        real_list: Real DataFrames, one per class.
        synthetic_list: Synthetic DataFrames, in the same order as
            ``real_list``.
        class_names: Labels for the classes, in the same order. Defaults to
            ``["good", "poor", "standard"]``.
        categorical_cols: Forwarded unchanged to ``evaluate_synthetic_data``.
        numeric_cols: Forwarded unchanged to ``evaluate_synthetic_data``.

    Returns:
        pd.DataFrame: One row per class, indexed by ``class``, with columns
        ``ks_pass_rate``, ``mean_ks_stat``, ``chi_pass_rate`` and
        ``mean_corr_diff``.
    """
    labels = ["good", "poor", "standard"] if class_names is None else class_names

    records = []
    # zip stops at the shortest of the three sequences.
    for label, real_part, synth_part in zip(labels, real_list, synthetic_list):
        ks_rate, ks_mean, chi_rate, corr_gap = evaluate_synthetic_data(
            real_df=real_part,
            synthetic_df=synth_part,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols,
        )

        # The chi-square rate is None when no categorical column was tested.
        if chi_rate is not None:
            chi_rate = round(chi_rate, 4)

        records.append({
            "class": label,
            "ks_pass_rate": round(ks_rate, 4),
            "mean_ks_stat": round(ks_mean, 4),
            "chi_pass_rate": chi_rate,
            "mean_corr_diff": round(corr_gap, 4),
        })

    summary = pd.DataFrame(records)
    return summary.set_index("class")
|
visualization.py
CHANGED
|
@@ -258,5 +258,49 @@ def plot_comparative_credit_score_distribution_by_actual_class(
|
|
| 258 |
ax_right.set_ylabel('Frequency')
|
| 259 |
ax_right.legend(title='Actual Class')
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
plt.tight_layout()
|
| 262 |
return fig
|
|
|
|
| 258 |
ax_right.set_ylabel('Frequency')
|
| 259 |
ax_right.legend(title='Actual Class')
|
| 260 |
|
| 261 |
+
plt.tight_layout()
|
| 262 |
+
return fig
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def plot_evaluation_table(summary_df, title="Synthetic Data Evaluation Summary"):
    """Render an evaluation summary DataFrame as a dark-themed table figure.

    Args:
        summary_df: DataFrame to display; its index becomes the row labels
            and its columns become the header row. Values are rounded to
            four decimals on a copy, so the input is left untouched.
        title: Text placed above the table.

    Returns:
        The matplotlib figure containing the styled table.
    """
    header_color = "#1F77B4"
    dark_bg = "#0B0F19"

    def _style(cell, face, bold):
        # Shared look for every cell: white text and a thin white border.
        cell.set_facecolor(face)
        if bold:
            cell.set_text_props(color="white", weight="bold")
        else:
            cell.set_text_props(color="white")
        cell.set_edgecolor("white")
        cell.set_linewidth(1)

    display_df = summary_df.copy().round(4)
    n_rows = len(display_df.index)
    n_cols = len(display_df.columns)

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis("off")

    table = ax.table(
        cellText=display_df.values,
        rowLabels=display_df.index,
        colLabels=display_df.columns,
        cellLoc="center",
        loc="center",
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.9)

    # Row 0 holds the column headers.
    for col in range(n_cols):
        _style(table[(0, col)], header_color, bold=True)

    # Data rows start at 1; column -1 holds the row label.
    for row in range(1, n_rows + 1):
        _style(table[(row, -1)], dark_bg, bold=True)
        for col in range(n_cols):
            _style(table[(row, col)], dark_bg, bold=False)

    ax.set_title(title, color="white", fontsize=16, weight="bold", pad=12)
    fig.patch.set_facecolor(dark_bg)

    plt.tight_layout()
    return fig
|