luisejdm committed on
Commit
b077775
·
verified ·
1 Parent(s): 7630c66

upload app

Browse files
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/interim/train_clean_type.csv filter=lfs diff=lfs merge=lfs -text
37
+ data/processed/v2/train_balanced_mixed.csv filter=lfs diff=lfs merge=lfs -text
38
+ data/processed/v2/train_balanced_synthetic.csv filter=lfs diff=lfs merge=lfs -text
39
+ data/processed/v2/train_balanced_synthetic2.csv filter=lfs diff=lfs merge=lfs -text
40
+ data/processed/v4/real_train_data.csv filter=lfs diff=lfs merge=lfs -text
41
+ data/processed/v4/synthetic_train_data.csv filter=lfs diff=lfs merge=lfs -text
42
+ data/raw/train_clean.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd

from data_generation import generate_synthetic_training_data
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
from credit_models import real_data_credit_model, synthetic_data_credit_model
from visualization import (
    plot_feature_distributions,
    plot_comparative_credit_score_distribution,
    plot_comparison_table,
    plot_comparative_confusion_matrices,
    plot_comparative_credit_score_distribution_by_actual_class,
    get_metrics_df,
)

# Fixed color per credit-score class so every chart uses the same palette.
COLOR_MAP = {
    'Good': '#28B463',
    'Standard': '#F1C40F',
    'Poor': '#E74C3C',
}

# Plotting/legend order for the three classes.
LABEL_ORDER = ['Good', 'Standard', 'Poor']
# Name of the target column in the CSV datasets.
TARGET = 'Credit_Score'

# Load and preprocess real data once at startup
# NOTE(review): these relative paths assume the process working directory is
# one level below the repo root; if app.py is launched from the repo root
# (where data/ lives) the prefix should be 'data/', not '../data/' — confirm
# against the deployment's launch directory.
real_train = pd.read_csv('../data/processed/v4/real_train_data.csv')
real_test = pd.read_csv('../data/processed/v4/real_test_data.csv')

X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
    real_train, real_test, TARGET
)

# Train real-data model once at startup; only the synthetic-data model is
# retrained on each button click.
real_scores, real_classification = real_data_credit_model(
    X_real_train, y_real_train, X_real_test
)
39
def run_analysis():
    """Generate fresh synthetic data, train the synthetic-data model, and
    build every comparison figure for the dashboard.

    Returns:
        tuple: Five Matplotlib figures — feature distributions, score
        distributions, score distribution by actual class, metrics table,
        and confusion matrices — one per Gradio output component wired to
        the button's click handler.
    """
    # One third of the real training size per category: the generator samples
    # n rows for each of the three classes, so totals stay comparable to the
    # real training set.
    synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train) / 3))
    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)

    fig_feature_dist = plot_feature_distributions(
        X_real_train, X_synth_train
    )

    synth_scores, synth_classification = synthetic_data_credit_model(
        X_synth_train, y_synth_train, X_real_test
    )

    fig_score_dist = plot_comparative_credit_score_distribution(
        real_scores, synth_scores
    )
    fig_score_by_class = plot_comparative_credit_score_distribution_by_actual_class(
        y_real_test, real_scores, synth_scores,
        color_map=COLOR_MAP,
        label_order=LABEL_ORDER,
    )
    fig_metrics = plot_comparison_table(
        y_real_test, real_classification, synth_classification
    )
    fig_cm = plot_comparative_confusion_matrices(
        y_real_test, real_classification, synth_classification,
        labels=LABEL_ORDER,
    )

    # Fix: return exactly the five figures the click handler maps to output
    # components. A sixth value (a metrics dataframe) was previously returned
    # with no matching output component, which breaks Gradio's check that the
    # number of returned values equals the number of outputs.
    return fig_feature_dist, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm
74
# ---------------------------------------------------------------------------
# Dashboard layout: one action button followed by a full-width plot per
# comparison view. All plots are refreshed by run_analysis on button click.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Credit Score Model Dashboard
        Compare a **Real-Data Model** vs a **Synthetic-Data Model** trained with CTGAN-generated data.
        Click the button to regenerate synthetic data and retrain the synthetic model.
        """
    )

    # Single entry point: regenerates synthetic data and retrains on click.
    run_btn = gr.Button(
        "Generate New Synthetic Data & Analyze", variant="primary", size="lg"
    )

    gr.Markdown(
        """
        ## Feature Distribution Comparison

        Below are the distributions of the features in the real vs synthetic training datasets.
        """
    )

    with gr.Row():
        plot_feature_dist = gr.Plot(label='')


    gr.Markdown(
        """
        ## Credit Models Metrics

        Below are the metrics for the real-data and synthetic-data models.
        """
    )

    with gr.Row():
        plot_metrics = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Score Distribution Comparison
        Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
        """
    )

    with gr.Row():
        plot_score_dist = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Score Distribution by Actual Class
        Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
        """
    )

    with gr.Row():
        plot_score_by_class = gr.Plot(label='')

    gr.Markdown(
        """
        ## Confusion Matrix Comparison
        Below are the confusion matrices for the real-data and synthetic-data models.
        """
    )

    with gr.Row():
        plot_cm = gr.Plot(label='')

    # NOTE(review): the length and order of this outputs list must match the
    # values returned by run_analysis — verify they agree before deploying.
    run_btn.click(
        fn=run_analysis,
        inputs=[],
        outputs=[plot_feature_dist, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
    )

demo.launch()
credit_models.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.linear_model import LogisticRegression
2
+ import pandas as pd
3
+
4
+
5
def compute_credit_score(
    coef_dict,
    num_credit_card, changed_credit_limit, delay_from_due_date, interest_rate,
    outstanding_debt, credit_mix_good, credit_mix_standard
):
    """Compute a linear credit score from logistic-regression coefficients.

    Args:
        coef_dict (dict): Feature name -> logistic regression coefficient.
        num_credit_card (float): Number of credit cards.
        changed_credit_limit (float): Change in credit limit.
        delay_from_due_date (float): Delay from the due date.
        interest_rate (float): Interest rate.
        outstanding_debt (float): Outstanding debt.
        credit_mix_good (float): Good credit-mix indicator.
        credit_mix_standard (float): Standard credit-mix indicator.
    Returns:
        float: The weighted sum of the features (the credit score).
    """
    # Pair each coefficient name with its value; summation order matches the
    # argument order above.
    feature_values = {
        'Num_Credit_Card': num_credit_card,
        'Changed_Credit_Limit': changed_credit_limit,
        'Delay_from_due_date': delay_from_due_date,
        'Interest_Rate': interest_rate,
        'Outstanding_Debt': outstanding_debt,
        'Credit_Mix_Good': credit_mix_good,
        'Credit_Mix_Standard': credit_mix_standard,
    }
    return sum(coef_dict[name] * value for name, value in feature_values.items())
34
+
35
+
36
def real_data_credit_model(X_train, y_train, X_test):
    """Train a logistic regression on real data; score and classify the test set.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): Test features with the same columns as X_train.
    Returns:
        tuple: (pd.Series of linear credit scores for X_test,
        ndarray of predicted class labels for X_test).
    """
    model = LogisticRegression(
        max_iter=1000,
        class_weight='balanced'  # compensate class imbalance in the real data
    )
    model.fit(X_train, y_train)

    # Linear "credit score": dot product of each test row with the first
    # class's coefficient vector — a vectorized replacement for the original
    # per-row iterrows() loop. Assumes X_test's columns are exactly the
    # trained features (true for preprocess_real_data output).
    # NOTE(review): coef_[0] is only the first class's weights in a
    # multiclass fit — confirm this is the intended score definition.
    coef = pd.Series(model.coef_[0], index=X_train.columns)
    score = X_test[list(coef.index)].dot(coef).reset_index(drop=True)

    classification = model.predict(X_test)

    return score, classification
66
+
67
+
68
def synthetic_data_credit_model(X_train, y_train, X_test):
    """Train a logistic regression on synthetic data; score and classify the
    real test set.

    Args:
        X_train (pd.DataFrame): Synthetic training features.
        y_train (pd.Series): Synthetic training target.
        X_test (pd.DataFrame): Real test features with the same columns.
    Returns:
        tuple: (pd.Series of linear credit scores for X_test,
        ndarray of predicted class labels for X_test).
    """
    # No class_weight here: the synthetic set is sampled per class.
    # NOTE(review): confirm the asymmetry with real_data_credit_model
    # (which uses class_weight='balanced') is intentional.
    model = LogisticRegression(
        max_iter=1_000,
    )
    model.fit(X_train, y_train)

    # Vectorized linear score (replaces the original per-row iterrows()
    # loop); assumes X_test's columns are exactly the trained features.
    coef = pd.Series(model.coef_[0], index=X_train.columns)
    score = X_test[list(coef.index)].dot(coef).reset_index(drop=True)

    classification = model.predict(X_test)

    return score, classification
data/external/.gitkeep ADDED
File without changes
data/interim/.gitkeep ADDED
File without changes
data/interim/modifying_data.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd

# One-off script: expand the multi-label 'Type_of_Loan' column of the raw
# training data into one binary indicator column per loan type.
# NOTE(review): this path is suspect — from this script's directory
# (data/interim/) the raw file lives at '../raw/train_clean.csv', and from
# the repo root at 'data/raw/train_clean.csv'; confirm the intended CWD.
df_original = pd.read_csv("../data/raw/train_clean.csv")
5
+
6
def clean_loans(text):
    """Split a 'Type_of_Loan' cell into a deduplicated list of loan names.

    Args:
        text (str | float): Raw cell value; may be NaN for missing entries.
    Returns:
        list[str]: Unique loan names, in order of first appearance.
    """
    if pd.isna(text):
        return []

    # Entries are comma-separated with the final one joined by " and "
    # (or ", and"); normalising " and " to ", " makes a single split work.
    text = text.replace(" and ", ", ")
    loans = [loan.strip() for loan in text.split(",")]
    loans = [loan for loan in loans if loan != ""]

    # dict.fromkeys dedupes while preserving first-seen order; the original
    # list(set(...)) produced a nondeterministic order between runs.
    return list(dict.fromkeys(loans))
15
+
16
+
17
df_original["Loan_List"] = df_original["Type_of_Loan"].apply(clean_loans)


# Collect every loan type seen anywhere, then fix a deterministic order:
# iterating a raw set would make the output column order (and thus the
# saved CSV schema) vary between runs.
all_loans = set()
for row in df_original["Loan_List"]:
    all_loans.update(row)
all_loans = sorted(all_loans)

print(all_loans)

# Create one binary indicator column per loan type.
for loan in all_loans:
    df_original[loan] = df_original["Loan_List"].apply(lambda x: int(loan in x))

# Drop the raw multi-label column and the intermediate list column.
df_original = df_original.drop(columns=["Type_of_Loan", "Loan_List"])

# Save the expanded dataset next to this script.
output_path = "./train_clean_type.csv"
df_original.to_csv(output_path, index=False)

print(f" File saved to: {output_path}")
print(f"shape: {df_original.shape}")
print("New columns addeeeeeddd:", list(all_loans))
data/interim/train_clean_type.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:537dc0040723321f7b968d65fd49d5ddf666a01a2cfc935d76ca00bc26731d47
3
+ size 16601166
data/processed/.gitkeep ADDED
File without changes
data/processed/v2/train_balanced_mixed.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b741acf7cc7512f40e0013e32b4e77911763ba07a53650bd27236615ed30df1f
3
+ size 27855735
data/processed/v2/train_balanced_synthetic.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ac8bd3b577f29b3752a7cc64a6f9c960280e565bdf31fbb225df33947811084
3
+ size 18773470
data/processed/v2/train_balanced_synthetic2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:282f24dc0094bc5545000890516662cd3bba2c25c134612d8104aa5762ba718d
3
+ size 18774085
data/processed/v4/real_test_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/v4/real_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17e48e95a445d0829bd6884db9a39472792e93e0e5130fcd7040bb6d95daccc
3
+ size 12081755
data/processed/v4/synthetic_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0addf8d57b4ef9b0b32db4b53382a29f787e47cfb5cbac0359127cfb1b8ca66b
3
+ size 18017035
data/raw/.gitkeep ADDED
File without changes
data/raw/train_clean.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afe6a49d21938d60d482b326b821cea6dcfa41f55f7f0cf15a3a517bf590403c
3
+ size 20561019
data_generation.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sdv.single_table import CTGANSynthesizer
3
+
4
+
5
def generate_synthetic_training_data(n=30_000, model_dir="../models/v4"):
    """Generate synthetic training data with pre-trained per-class CTGAN models.

    Args:
        n (int, optional): Number of samples to generate for each credit-score
            category. Defaults to 30_000.
        model_dir (str, optional): Directory holding the pre-trained CTGAN
            synthesizers (synth_good.pkl, synth_poor.pkl, synth_standard.pkl).
            Defaults to "../models/v4".
    Returns:
        pd.DataFrame: The shuffled synthetic training data (3 * n rows).
    """
    # NOTE(review): the default relative path assumes the process CWD is one
    # level below the repo root — confirm against how the app is launched.
    frames = []
    for category in ("good", "poor", "standard"):
        synthesizer = CTGANSynthesizer.load(f"{model_dir}/synth_{category}.pkl")
        frames.append(synthesizer.sample(n))

    full_data = pd.concat(frames, ignore_index=True)
    # Shuffle so the three classes are interleaved rather than blocked;
    # intentionally unseeded so each call yields a fresh dataset.
    shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
    return shuffled_data
data_preprocessing.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def preprocess_real_data(train, test, target):
    """Select the modeling features from the real train/test sets and
    one-hot-encode 'Credit_Mix'.

    Args:
        train (pd.DataFrame): The real training dataset (not modified).
        test (pd.DataFrame): The real testing dataset (not modified).
        target (str): The name of the target column.
    Returns:
        tuple: (X_train, y_train, X_test, y_test) with 'Outstanding_Debt'
        rescaled to thousands and 'Credit_Mix' dummy-encoded with the first
        category dropped.
    """
    cols = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target
    ]

    # Copy so the callers' dataframes are not mutated: the original divided
    # Outstanding_Debt in place, corrupting the data if called twice on the
    # same frame.
    train = train[cols].copy()
    test = test[cols].copy()

    # Rescale debt to thousands to keep it on a comparable magnitude.
    train['Outstanding_Debt'] = train['Outstanding_Debt'] / 1000
    test['Outstanding_Debt'] = test['Outstanding_Debt'] / 1000

    # Encode with the training set's category list so train and test always
    # produce identical dummy columns (same names, order, and dropped
    # baseline) even if the test set is missing a category.
    categories = sorted(train['Credit_Mix'].dropna().unique())
    train['Credit_Mix'] = pd.Categorical(train['Credit_Mix'], categories=categories)
    test['Credit_Mix'] = pd.Categorical(test['Credit_Mix'], categories=categories)
    train = pd.get_dummies(train, columns=['Credit_Mix'], drop_first=True)
    test = pd.get_dummies(test, columns=['Credit_Mix'], drop_first=True)

    X_real_train = train.drop(columns=[target])
    y_real_train = train[target]

    X_real_test = test.drop(columns=[target])
    y_real_test = test[target]

    return X_real_train, y_real_train, X_real_test, y_real_test
40
+
41
+
42
def preprocess_synthetic_data(synthetic_data, target):
    """Select the modeling features from a synthetic dataset and one-hot-encode
    'Credit_Mix'.

    Args:
        synthetic_data (pd.DataFrame): The synthetic dataset (not modified).
        target (str): The name of the target column.
    Returns:
        tuple: (X, y) with 'Outstanding_Debt' rescaled to thousands and
        'Credit_Mix' dummy-encoded with the first category dropped.
    """
    cols = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target
    ]

    # Copy so the caller's dataframe is not mutated: the original rescaled
    # Outstanding_Debt in place on the caller's frame.
    synthetic_data = synthetic_data[cols].copy()

    # Keep the debt feature on the same scale used for the real data.
    synthetic_data['Outstanding_Debt'] = synthetic_data['Outstanding_Debt'] / 1000

    synthetic_data = pd.get_dummies(
        synthetic_data, columns=['Credit_Mix'], drop_first=True
    )

    X_synthetic_train = synthetic_data.drop(columns=[target])
    y_synthetic_train = synthetic_data[target]

    return X_synthetic_train, y_synthetic_train
models/.gitkeep ADDED
File without changes
models/v2/model_good.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:709c2649ce4180a137e34382ae3239a8f6b69e7d1e3371434865bd6879eb7ed9
3
+ size 4194460
models/v2/model_poor.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af85d29197a3d4845b8534605c4ccd848b9e27de189b003889e361fc4a03a902
3
+ size 5667880
models/v2/model_standard.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af53dccff9f076a49ef6b2fcf0dc91fdab2815ed8d8fffcb693fdd6eab250bd2
3
+ size 8651019
models/v4/synth_good.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9beb0dc022c97a97f3815a8038258b8269c1c2882bbc851d280ddbf2d3e0dca
3
+ size 2458679
models/v4/synth_poor.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30af7c40ed01cf22acf77a33bce931dbe166e7ee25890e38e3feb27019140467
3
+ size 2951615
models/v4/synth_standard.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d5e4bb29162a02f59f25bd8f4db5ebd286b9844eef20e0067641a20a911d6a
3
+ size 3941298
visualization.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import pandas as pd
4
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
5
+
6
# Global dark Matplotlib theme: white text and ticks on near-black axes so
# figures blend into the dashboard's dark background.
plt.rcParams['figure.facecolor'] = '#1F2937'
plt.rcParams['axes.facecolor'] = '#0B0F19'
plt.rcParams['text.color'] = 'white'
plt.rcParams['axes.labelcolor'] = 'white'
plt.rcParams['xtick.color'] = 'white'
plt.rcParams['ytick.color'] = 'white'
12
+
13
+
14
def plot_feature_distributions(real_data, synthetic_data):
    """Plot overlaid real-vs-synthetic histograms, one subplot per feature.

    Args:
        real_data (pd.DataFrame): Real feature matrix.
        synthetic_data (pd.DataFrame): Synthetic feature matrix with the
            same columns.
    Returns:
        matplotlib.figure.Figure: Grid figure (3 columns) of histograms.
    """
    features = real_data.columns.to_list()

    n_cols = 3
    n_rows = (len(features) + n_cols - 1) // n_cols  # ceil division

    # squeeze=False keeps `axes` 2-D even for a single row, so the
    # axes[row, col] indexing below cannot fail when there are <= 3 features.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)

    for i, feature in enumerate(features):
        ax = axes[i // n_cols, i % n_cols]

        sns.histplot(
            real_data[feature],
            bins=30,
            color='skyblue',
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            ax=ax
        )

        sns.histplot(
            synthetic_data[feature],
            bins=30,
            color='indianred',
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            ax=ax
        )

        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        # Legend order matches plotting order: real first, synthetic second.
        ax.legend(['Real Data', 'Synthetic Data'])

    # Remove unused trailing axes in the last row. Using len(features) as the
    # start also handles an empty feature list (the original used i + 1,
    # which raised NameError when the loop never ran).
    for j in range(len(features), n_rows * n_cols):
        fig.delaxes(axes[j // n_cols, j % n_cols])

    plt.tight_layout()
    return fig
59
+
60
+
61
def get_metrics_df(y_true, y_real_pred, y_synth_pred):
    """Build a dataframe of accuracy/precision/recall/F1 for both models.

    Args:
        y_true: Ground-truth labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
    Returns:
        pd.DataFrame: One row per model, weighted-average metrics as columns.
    """
    def _metric_row(y_pred):
        # Weighted averages to account for class imbalance.
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='weighted'),
            recall_score(y_true, y_pred, average='weighted'),
            f1_score(y_true, y_pred, average='weighted'),
        )

    real_row = _metric_row(y_real_pred)
    synth_row = _metric_row(y_synth_pred)

    return pd.DataFrame({
        'Model': ['Real Data Model', 'Synthetic Data Model'],
        'Accuracy': [real_row[0], synth_row[0]],
        'Precision': [real_row[1], synth_row[1]],
        'Recall': [real_row[2], synth_row[2]],
        'F1-Score': [real_row[3], synth_row[3]],
    })
82
+
83
+
84
def plot_comparative_credit_score_distribution(
    real_scores,
    synth_scores,
    bins=50,
    title='Comparative Credit Score Distribution: Real vs Synthetic Models'
):
    """Plot the two models' predicted-score histograms side by side.

    Args:
        real_scores (pd.Series): Scores from the real-data model.
        synth_scores (pd.Series): Scores from the synthetic-data model.
        bins (int, optional): Histogram bin count. Defaults to 50.
        title (str, optional): Overall figure title.
    Returns:
        matplotlib.figure.Figure
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
    # Fix: the title parameter was previously accepted but never applied.
    fig.suptitle(title)

    panels = (
        (real_scores, 'skyblue', 'Real-Data Model Score Distribution'),
        # indianred for the synthetic model, matching the real/synthetic
        # palette used in plot_feature_distributions (both panels were
        # previously skyblue and indistinguishable at a glance).
        (synth_scores, 'indianred', 'Synthetic-Data Model Score Distribution'),
    )
    for ax, (scores, color, panel_title) in zip(axes, panels):
        sns.histplot(
            scores,
            bins=bins,
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            color=color,
            ax=ax
        )
        ax.set_title(panel_title)
        ax.set_xlabel('Predicted Credit Score')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    return fig
122
+
123
+
124
def plot_comparison_table(
    y_true, y_real_pred, y_synth_pred,
    title='Model Comparison: Real Data vs Synthetic Data'
):
    """Render the two models' metrics as a styled Matplotlib table figure.

    Args:
        y_true: Ground-truth labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
        title (str, optional): Figure title.
    Returns:
        matplotlib.figure.Figure
    """
    metrics_df = get_metrics_df(y_true, y_real_pred, y_synth_pred)
    display_df = metrics_df.copy().round(4).set_index('Model')

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis('off')
    # Fix: the title parameter was previously accepted but never applied.
    fig.suptitle(title)

    table = ax.table(
        cellText=display_df.values,
        rowLabels=display_df.index,
        colLabels=display_df.columns,
        cellLoc='center',
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.9)

    # Header row: blue background, bold white text, white grid lines.
    for j in range(len(display_df.columns)):
        header = table[(0, j)]
        header.set_facecolor('#1F77B4')
        header.set_text_props(color='white', weight='bold')
        header.set_edgecolor('white')
        header.set_linewidth(1)

    # Body rows: alternate two dark shades. The original ternary used the
    # same color in both branches, so rows never actually alternated.
    for i in range(1, len(display_df.index) + 1):
        bg = '#1F2937' if i % 2 else '#0B0F19'

        row_label = table[(i, -1)]
        row_label.set_text_props(color='white', weight='bold')
        row_label.set_facecolor(bg)
        row_label.set_edgecolor('white')
        row_label.set_linewidth(1)

        for j in range(len(display_df.columns)):
            cell = table[(i, j)]
            cell.set_facecolor(bg)
            cell.set_text_props(color='white')
            cell.set_edgecolor('white')
            cell.set_linewidth(1)

    plt.tight_layout()
    return fig
168
+
169
+
170
def plot_comparative_confusion_matrices(
    y_true,
    y_pred_real,
    y_pred_synth,
    labels=None,
    normalize=False,
    cmap='Blues'
):
    """Draw side-by-side confusion matrices for the two models.

    Args:
        y_true: Ground-truth labels.
        y_pred_real: Real-data model predictions.
        y_pred_synth: Synthetic-data model predictions.
        labels (list, optional): Label order for both matrix axes.
        normalize (bool, optional): If True, normalize each row to sum to 1.
        cmap (str, optional): Heatmap colormap. Defaults to 'Blues'.
    Returns:
        matplotlib.figure.Figure
    """
    cm_real = confusion_matrix(y_true, y_pred_real, labels=labels)
    cm_synth = confusion_matrix(y_true, y_pred_synth, labels=labels)

    def _as_plot_matrix(cm):
        # Row-normalize when requested, guarding the denominator so a class
        # absent from y_true yields zeros instead of 0/0 -> NaN.
        if not normalize:
            return cm
        row_sums = cm.sum(axis=1, keepdims=True).astype(float)
        row_sums[row_sums == 0] = 1.0
        return cm.astype(float) / row_sums

    fmt = '.2f' if normalize else 'd'

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for ax, cm, model_name in zip(
        axes, (cm_real, cm_synth), ("Real Data", "Synthetic Data")
    ):
        sns.heatmap(
            _as_plot_matrix(cm), annot=True, fmt=fmt, cmap=cmap,
            xticklabels=labels, yticklabels=labels, ax=ax
        )
        ax.set_title(f"{model_name} Confusion Matrix")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

    plt.tight_layout()
    return fig
210
+
211
+
212
def plot_comparative_credit_score_distribution_by_actual_class(
    y_true,
    real_scores,
    synth_scores,
    color_map,
    label_order=None,
    bins=50,
):
    """Overlay per-actual-class score histograms for both models, side by side.

    Args:
        y_true: Actual class labels, positionally aligned with both score
            series.
        real_scores (pd.Series): Real-data model scores.
        synth_scores (pd.Series): Synthetic-data model scores.
        color_map (dict): Class label -> plot color.
        label_order (list, optional): Class plotting order. Defaults to the
            order of first appearance in y_true (the original raised
            TypeError when this was left as None).
        bins (int, optional): Histogram bin count. Defaults to 50.
    Returns:
        matplotlib.figure.Figure
    """
    fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16, 5), sharey=True)

    y_true_arr = pd.Series(y_true).values
    if label_order is None:
        # Fix: derive a sensible default instead of iterating over None.
        label_order = list(pd.unique(y_true_arr))

    for label in label_order:
        # Boolean mask applied positionally to both score series.
        # NOTE(review): assumes scores are positionally aligned with y_true —
        # confirm upstream score computation preserves row order.
        mask = (y_true_arr == label)

        for scores, ax in ((real_scores, ax_left), (synth_scores, ax_right)):
            sns.histplot(
                scores[mask],
                bins=bins,
                stat='count',
                element='step',
                fill=True,
                alpha=0.2,
                color=color_map.get(label, None),
                label=label,
                ax=ax
            )

    ax_left.set_title('Real-Data Model: Actual Class Distribution')
    ax_left.set_xlabel('Predicted Credit Score')
    ax_left.set_ylabel('Frequency')
    ax_left.legend(title='Actual Class')

    ax_right.set_title('Synthetic-Data Model: Actual Class Distribution')
    ax_right.set_xlabel('Predicted Credit Score')
    ax_right.set_ylabel('Frequency')
    ax_right.legend(title='Actual Class')

    plt.tight_layout()
    return fig