luisejdm committed on
Commit
b52804e
·
verified ·
1 Parent(s): f624647

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +35 -6
  2. data_generation.py +90 -4
  3. visualization.py +44 -0
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import pandas as pd
3
 
4
- from data_generation import generate_synthetic_training_data
5
  from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
6
  from credit_models import real_data_credit_model, synthetic_data_credit_model
7
  from visualization import (
@@ -10,6 +10,7 @@ from visualization import (
10
  plot_comparison_table,
11
  plot_comparative_confusion_matrices,
12
  plot_comparative_credit_score_distribution_by_actual_class,
 
13
  get_metrics_df,
14
  )
15
 
@@ -23,8 +24,8 @@ LABEL_ORDER = ['Good', 'Standard', 'Poor']
23
  TARGET = 'Credit_Score'
24
 
25
  # Load and preprocess real data once at startup
26
- real_train = pd.read_csv('data/processed/v4/real_train_data.csv')
27
- real_test = pd.read_csv('data/processed/v4/real_test_data.csv')
28
 
29
  X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
30
  real_train, real_test, TARGET
@@ -35,16 +36,34 @@ real_scores, real_classification = real_data_credit_model(
35
  X_real_train, y_real_train, X_real_test
36
  )
37
 
38
-
39
  def run_analysis():
40
  """Generate new synthetic data, train the synthetic model, and return all comparison plots."""
41
  synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train)/3)) # Same number of samples as real training data
42
  X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)
43
 
 
 
 
44
  fig_feature_dist = plot_feature_distributions(
45
  X_real_train, X_synth_train
46
  )
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  synth_scores, synth_classification = synthetic_data_credit_model(
49
  X_synth_train, y_synth_train, X_real_test
50
  )
@@ -68,7 +87,7 @@ def run_analysis():
68
  metrics_df = get_metrics_df(y_real_test, real_classification, synth_classification)
69
  metrics_df = metrics_df.round(4)
70
 
71
- return fig_feature_dist, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm, metrics_df
72
 
73
 
74
  with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
@@ -95,6 +114,16 @@ with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as
95
  with gr.Row():
96
  plot_feature_dist = gr.Plot(label='')
97
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  gr.Markdown(
100
  """
@@ -140,7 +169,7 @@ with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as
140
  run_btn.click(
141
  fn=run_analysis,
142
  inputs=[],
143
- outputs=[plot_feature_dist, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
144
  )
145
 
146
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
 
4
+ from data_generation import generate_synthetic_training_data, evaluate_synthetic_data
5
  from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
6
  from credit_models import real_data_credit_model, synthetic_data_credit_model
7
  from visualization import (
 
10
  plot_comparison_table,
11
  plot_comparative_confusion_matrices,
12
  plot_comparative_credit_score_distribution_by_actual_class,
13
+ plot_evaluation_table,
14
  get_metrics_df,
15
  )
16
 
 
24
  TARGET = 'Credit_Score'
25
 
26
  # Load and preprocess real data once at startup
27
+ real_train = pd.read_csv('../data/processed/v4/real_train_data.csv')
28
+ real_test = pd.read_csv('../data/processed/v4/real_test_data.csv')
29
 
30
  X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
31
  real_train, real_test, TARGET
 
36
  X_real_train, y_real_train, X_real_test
37
  )
38
 
 
39
  def run_analysis():
40
  """Generate new synthetic data, train the synthetic model, and return all comparison plots."""
41
  synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train)/3)) # Same number of samples as real training data
42
  X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)
43
 
44
+ categorical_cols = [col for col in X_real_train.columns if X_real_train[col].dtype in ["object", "bool", "uint8"]]
45
+ numeric_cols = [col for col in X_real_train.columns if X_real_train[col].dtype in ["int64", "float64"]]
46
+
47
  fig_feature_dist = plot_feature_distributions(
48
  X_real_train, X_synth_train
49
  )
50
 
51
+ summary_rows = []
52
+ for cls in LABEL_ORDER:
53
+ real_cls = X_real_train[y_real_train == cls]
54
+ synth_cls = X_synth_train[y_synth_train == cls]
55
+ ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff = evaluate_synthetic_data(
56
+ real_cls, synth_cls, categorical_cols, numeric_cols
57
+ )
58
+ summary_rows.append({
59
+ "ks_passed": ks_pass_rate == 1.0,
60
+ "mean_ks_stat": round(mean_ks, 4),
61
+ "chi_passed": chi_pass_rate == 1.0 if chi_pass_rate is not None else None,
62
+ "mean_corr_diff": round(mean_corr_diff, 4),
63
+ })
64
+ summary_df = pd.DataFrame(summary_rows, index=LABEL_ORDER)
65
+ summary = plot_evaluation_table(summary_df)
66
+
67
  synth_scores, synth_classification = synthetic_data_credit_model(
68
  X_synth_train, y_synth_train, X_real_test
69
  )
 
87
  metrics_df = get_metrics_df(y_real_test, real_classification, synth_classification)
88
  metrics_df = metrics_df.round(4)
89
 
90
+ return fig_feature_dist, summary, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm, metrics_df
91
 
92
 
93
  with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
 
114
  with gr.Row():
115
  plot_feature_dist = gr.Plot(label='')
116
 
117
+ gr.Markdown(
118
+ """
119
+ ## Generated Data Quality Summary
120
+
121
+ Below is a summary of the data quality evaluation comparing the synthetic training data to the real training data across multiple metrics.
122
+ """
123
+ )
124
+
125
+ with gr.Row():
126
+ plot_summary = gr.Plot(label='')
127
 
128
  gr.Markdown(
129
  """
 
169
  run_btn.click(
170
  fn=run_analysis,
171
  inputs=[],
172
+ outputs=[plot_feature_dist, plot_summary, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
173
  )
174
 
175
  demo.launch()
data_generation.py CHANGED
@@ -1,6 +1,8 @@
1
  import torch
2
  import pandas as pd
3
  from sdv.single_table import CTGANSynthesizer
 
 
4
 
5
  # Patch torch.load to remap MPS tensors to CPU for environments without Apple Silicon
6
  _original_torch_load = torch.load
@@ -20,9 +22,9 @@ def generate_synthetic_training_data(n=30_000):
20
  Returns:
21
  pd.DataFrame: The generated synthetic training data.
22
  """
23
- good_generator = CTGANSynthesizer.load("models/v4/synth_good.pkl")
24
- poor_generator = CTGANSynthesizer.load("models/v4/synth_poor.pkl")
25
- standard_generator = CTGANSynthesizer.load("models/v4/synth_standard.pkl")
26
 
27
  synth_good = good_generator.sample(n)
28
  synth_poor = poor_generator.sample(n)
@@ -30,4 +32,88 @@ def generate_synthetic_training_data(n=30_000):
30
 
31
  full_data = pd.concat([synth_good, synth_poor, synth_standard], ignore_index=True)
32
  shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
33
- return shuffled_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  import pandas as pd
3
  from sdv.single_table import CTGANSynthesizer
4
+ from scipy import stats
5
+ import numpy as np
6
 
7
  # Patch torch.load to remap MPS tensors to CPU for environments without Apple Silicon
8
  _original_torch_load = torch.load
 
22
  Returns:
23
  pd.DataFrame: The generated synthetic training data.
24
  """
25
+ good_generator = CTGANSynthesizer.load("../models/v4/synth_good.pkl")
26
+ poor_generator = CTGANSynthesizer.load("../models/v4/synth_poor.pkl")
27
+ standard_generator = CTGANSynthesizer.load("../models/v4/synth_standard.pkl")
28
 
29
  synth_good = good_generator.sample(n)
30
  synth_poor = poor_generator.sample(n)
 
32
 
33
  full_data = pd.concat([synth_good, synth_poor, synth_standard], ignore_index=True)
34
  shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
35
+ return shuffled_data
36
+
37
+
38
def evaluate_synthetic_data(real_df, synthetic_df,
                            categorical_cols=None, numeric_cols=None):
    """Score how closely synthetic data matches real data, column by column.

    Runs a two-sample Kolmogorov-Smirnov test on each numeric column, a
    chi-square goodness-of-fit test on each categorical column, and compares
    the numeric correlation matrices of the two datasets.

    Args:
        real_df (pd.DataFrame): Real reference data.
        synthetic_df (pd.DataFrame): Synthetic data to evaluate; must share
            the tested columns with ``real_df``.
        categorical_cols (list | None): Categorical columns to test. When
            None, object-dtype columns of ``real_df`` are used.
        numeric_cols (list | None): Numeric columns to test. When None,
            numeric-dtype columns of ``real_df`` are used.

    Returns:
        tuple: ``(ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff)``.
            ``ks_pass_rate`` / ``mean_ks`` are None when there are no numeric
            columns; ``chi_pass_rate`` is None when there are no categorical
            columns; ``mean_corr_diff`` is None with fewer than two numeric
            columns (no off-diagonal correlations to compare).
    """
    if numeric_cols is None:
        numeric_cols = real_df.select_dtypes(include="number").columns.tolist()
    if categorical_cols is None:
        categorical_cols = real_df.select_dtypes(include="object").columns.tolist()

    # ── KS Test ───────────────────────────────────────────────────────────
    ks_results = []
    for col in numeric_cols:
        stat, p_value = stats.ks_2samp(
            real_df[col].dropna(),
            synthetic_df[col].dropna()
        )
        ks_results.append({
            "column" : col,
            "ks_stat": round(stat, 4),
            "p_value": round(p_value, 4),
            "pass"   : p_value > 0.05
        })
    ks_df = pd.DataFrame(ks_results)

    # ── Chi-Square Test ───────────────────────────────────────────────────
    chi_results = []
    for col in categorical_cols:
        real_counts = real_df[col].value_counts()
        synth_counts = synthetic_df[col].value_counts()
        # Only categories observed in the real data can serve as the expected
        # distribution: a real count of zero would give an expected frequency
        # of zero, making the chi-square statistic undefined. Synthetic-only
        # categories are dropped; their mass is redistributed by the
        # renormalization below.
        observed_cats = real_counts.index
        real_freq = real_counts.reindex(observed_cats, fill_value=0)
        synth_freq = synth_counts.reindex(observed_cats, fill_value=0)
        if synth_freq.sum() == 0:
            # Synthetic data shares no categories with the real data:
            # maximal mismatch — record a hard failure instead of dividing
            # by zero when renormalizing.
            chi_results.append({
                "column"  : col,
                "chi_stat": float("inf"),
                "p_value" : 0.0,
                "pass"    : False
            })
            continue
        n = real_freq.sum()
        # Rescale both count vectors to the same total so the test compares
        # proportions, not raw sample sizes (scipy requires matching sums).
        f_exp = (real_freq / real_freq.sum()) * n
        f_obs = (synth_freq / synth_freq.sum()) * n
        stat, p_value = stats.chisquare(f_obs=f_obs, f_exp=f_exp)
        chi_results.append({
            "column"  : col,
            "chi_stat": round(stat, 4),
            "p_value" : round(p_value, 4),
            "pass"    : p_value > 0.05
        })
    chi_df = pd.DataFrame(chi_results)

    # ── Correlation Matrix ────────────────────────────────────────────────
    if len(numeric_cols) >= 2:
        real_corr = real_df[numeric_cols].corr()
        synth_corr = synthetic_df[numeric_cols].corr()
        corr_diff = (real_corr - synth_corr).abs()
        # Only the strict upper triangle: the diagonal is always 0 and the
        # matrix is symmetric, so including it would dilute the mean.
        upper_idx = np.triu_indices_from(corr_diff.values, k=1)
        mean_corr_diff = corr_diff.values[upper_idx].mean()
    else:
        mean_corr_diff = None

    # Empty result frames have no "pass" column — guard both tests the same
    # way (the original guarded only the chi-square side).
    ks_pass_rate = ks_df["pass"].mean() if not ks_df.empty else None
    mean_ks = ks_df["ks_stat"].mean() if not ks_df.empty else None
    chi_pass_rate = chi_df["pass"].mean() if not chi_df.empty else None

    return ks_pass_rate, mean_ks, chi_pass_rate, mean_corr_diff
92
+
93
+
94
def data_evaluation(
    real_list, synthetic_list, class_names=None, categorical_cols=None, numeric_cols=None
):
    """Build a per-class summary table of synthetic-data quality metrics.

    Each (real, synthetic) DataFrame pair is scored with
    ``evaluate_synthetic_data`` and the aggregate metrics are collected into
    a single DataFrame indexed by class name.

    Args:
        real_list: Real DataFrames, one per class.
        synthetic_list: Synthetic DataFrames, aligned with ``real_list``.
        class_names: Row labels; defaults to ["good", "poor", "standard"].
        categorical_cols: Passed through to ``evaluate_synthetic_data``.
        numeric_cols: Passed through to ``evaluate_synthetic_data``.

    Returns:
        pd.DataFrame: One row per class with the four aggregate metrics,
        rounded to 4 decimals (``chi_pass_rate`` may be None).
    """
    labels = ["good", "poor", "standard"] if class_names is None else class_names

    rows = []
    for label, real_df, synth_df in zip(labels, real_list, synthetic_list):
        ks_rate, ks_mean, chi_rate, corr_gap = evaluate_synthetic_data(
            real_df=real_df,
            synthetic_df=synth_df,
            categorical_cols=categorical_cols,
            numeric_cols=numeric_cols,
        )
        rows.append({
            "class"         : label,
            "ks_pass_rate"  : round(ks_rate, 4),
            "mean_ks_stat"  : round(ks_mean, 4),
            "chi_pass_rate" : None if chi_rate is None else round(chi_rate, 4),
            "mean_corr_diff": round(corr_gap, 4),
        })

    return pd.DataFrame(rows).set_index("class")
visualization.py CHANGED
@@ -258,5 +258,49 @@ def plot_comparative_credit_score_distribution_by_actual_class(
258
  ax_right.set_ylabel('Frequency')
259
  ax_right.legend(title='Actual Class')
260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  plt.tight_layout()
262
  return fig
 
258
  ax_right.set_ylabel('Frequency')
259
  ax_right.legend(title='Actual Class')
260
 
261
+ plt.tight_layout()
262
+ return fig
263
+
264
+
265
def plot_evaluation_table(summary_df, title="Synthetic Data Evaluation Summary"):
    """Render an evaluation summary DataFrame as a dark-themed table figure.

    Values are rounded to 4 decimals; the header row is drawn in blue with
    bold white text, and all body cells (including the row labels) use a dark
    background with white text to match the dashboard theme.

    Args:
        summary_df (pd.DataFrame): Metrics table; the index supplies the row
            labels and the columns supply the header.
        title (str): Figure title drawn above the table.

    Returns:
        matplotlib.figure.Figure: The rendered table figure.
    """
    table_data = summary_df.copy().round(4)
    n_cols = len(table_data.columns)
    n_rows = len(table_data.index)
    bg = "#0B0F19"

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis("off")

    table = ax.table(
        cellText=table_data.values,
        rowLabels=table_data.index,
        colLabels=table_data.columns,
        cellLoc="center",
        loc="center",
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.9)

    # Header row (row 0): blue fill, bold white text, white grid lines.
    for col in range(n_cols):
        header_cell = table[(0, col)]
        header_cell.set_facecolor("#1F77B4")
        header_cell.set_text_props(color="white", weight="bold")
        header_cell.set_edgecolor("white")
        header_cell.set_linewidth(1)

    # Body rows: row-label cells live at column -1 in a matplotlib table.
    for row in range(1, n_rows + 1):
        label_cell = table[(row, -1)]
        label_cell.set_text_props(color="white", weight="bold")
        label_cell.set_facecolor(bg)
        label_cell.set_edgecolor("white")
        label_cell.set_linewidth(1)

        for col in range(n_cols):
            body_cell = table[(row, col)]
            body_cell.set_facecolor(bg)
            body_cell.set_text_props(color="white")
            body_cell.set_edgecolor("white")
            body_cell.set_linewidth(1)

    ax.set_title(title, color="white", fontsize=16, weight="bold", pad=12)
    fig.patch.set_facecolor("#0B0F19")

    plt.tight_layout()
    return fig