luisejdm committed on
Commit
b077775
·
verified ·
1 Parent(s): 7630c66

upload app

Browse files
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/interim/train_clean_type.csv filter=lfs diff=lfs merge=lfs -text
37
+ data/processed/v2/train_balanced_mixed.csv filter=lfs diff=lfs merge=lfs -text
38
+ data/processed/v2/train_balanced_synthetic.csv filter=lfs diff=lfs merge=lfs -text
39
+ data/processed/v2/train_balanced_synthetic2.csv filter=lfs diff=lfs merge=lfs -text
40
+ data/processed/v4/real_train_data.csv filter=lfs diff=lfs merge=lfs -text
41
+ data/processed/v4/synthetic_train_data.csv filter=lfs diff=lfs merge=lfs -text
42
+ data/raw/train_clean.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd

from data_generation import generate_synthetic_training_data
from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
from credit_models import real_data_credit_model, synthetic_data_credit_model
from visualization import (
    plot_feature_distributions,
    plot_comparative_credit_score_distribution,
    plot_comparison_table,
    plot_comparative_confusion_matrices,
    plot_comparative_credit_score_distribution_by_actual_class,
    get_metrics_df,
)

# Fixed color per credit-score class so every chart uses the same palette.
COLOR_MAP = {
    'Good': '#28B463',
    'Standard': '#F1C40F',
    'Poor': '#E74C3C',
}

# Plotting/legend order for the three classes.
LABEL_ORDER = ['Good', 'Standard', 'Poor']
# Name of the target column in the CSV datasets.
TARGET = 'Credit_Score'

# Load and preprocess real data once at startup
# NOTE(review): these relative paths assume the process working directory is
# one level below the repo root; if app.py is launched from the repo root
# (where data/ lives) the prefix should be 'data/', not '../data/' — confirm
# against the deployment's launch directory.
real_train = pd.read_csv('../data/processed/v4/real_train_data.csv')
real_test = pd.read_csv('../data/processed/v4/real_test_data.csv')

X_real_train, y_real_train, X_real_test, y_real_test = preprocess_real_data(
    real_train, real_test, TARGET
)

# Train real-data model once at startup; only the synthetic-data model is
# retrained on each button click.
real_scores, real_classification = real_data_credit_model(
    X_real_train, y_real_train, X_real_test
)
39
def run_analysis():
    """Generate fresh synthetic data, train the synthetic-data model, and
    build every comparison figure for the dashboard.

    Returns:
        tuple: Five Matplotlib figures — feature distributions, score
        distributions, score distribution by actual class, metrics table,
        and confusion matrices — one per Gradio output component wired to
        the button's click handler.
    """
    # One third of the real training size per category: the generator samples
    # n rows for each of the three classes, so totals stay comparable to the
    # real training set.
    synthetic_data = generate_synthetic_training_data(n=int(len(X_real_train) / 3))
    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_data, TARGET)

    fig_feature_dist = plot_feature_distributions(
        X_real_train, X_synth_train
    )

    synth_scores, synth_classification = synthetic_data_credit_model(
        X_synth_train, y_synth_train, X_real_test
    )

    fig_score_dist = plot_comparative_credit_score_distribution(
        real_scores, synth_scores
    )
    fig_score_by_class = plot_comparative_credit_score_distribution_by_actual_class(
        y_real_test, real_scores, synth_scores,
        color_map=COLOR_MAP,
        label_order=LABEL_ORDER,
    )
    fig_metrics = plot_comparison_table(
        y_real_test, real_classification, synth_classification
    )
    fig_cm = plot_comparative_confusion_matrices(
        y_real_test, real_classification, synth_classification,
        labels=LABEL_ORDER,
    )

    # Fix: return exactly the five figures the click handler maps to output
    # components. A sixth value (a metrics dataframe) was previously returned
    # with no matching output component, which breaks Gradio's check that the
    # number of returned values equals the number of outputs.
    return fig_feature_dist, fig_score_dist, fig_score_by_class, fig_metrics, fig_cm
74
# ---------------------------------------------------------------------------
# Dashboard layout: one action button followed by a full-width plot per
# comparison view. All plots are refreshed by run_analysis on button click.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # Credit Score Model Dashboard
        Compare a **Real-Data Model** vs a **Synthetic-Data Model** trained with CTGAN-generated data.
        Click the button to regenerate synthetic data and retrain the synthetic model.
        """
    )

    # Single entry point: regenerates synthetic data and retrains on click.
    run_btn = gr.Button(
        "Generate New Synthetic Data & Analyze", variant="primary", size="lg"
    )

    gr.Markdown(
        """
        ## Feature Distribution Comparison

        Below are the distributions of the features in the real vs synthetic training datasets.
        """
    )

    with gr.Row():
        plot_feature_dist = gr.Plot(label='')


    gr.Markdown(
        """
        ## Credit Models Metrics

        Below are the metrics for the real-data and synthetic-data models.
        """
    )

    with gr.Row():
        plot_metrics = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Score Distribution Comparison
        Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
        """
    )

    with gr.Row():
        plot_score_dist = gr.Plot(label='')

    gr.Markdown(
        """
        ## Credit Score Distribution by Actual Class
        Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
        """
    )

    with gr.Row():
        plot_score_by_class = gr.Plot(label='')

    gr.Markdown(
        """
        ## Confusion Matrix Comparison
        Below are the confusion matrices for the real-data and synthetic-data models.
        """
    )

    with gr.Row():
        plot_cm = gr.Plot(label='')

    # NOTE(review): the length and order of this outputs list must match the
    # values returned by run_analysis — verify they agree before deploying.
    run_btn.click(
        fn=run_analysis,
        inputs=[],
        outputs=[plot_feature_dist, plot_score_dist, plot_score_by_class, plot_metrics, plot_cm],
    )

demo.launch()
credit_models.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.linear_model import LogisticRegression
2
+ import pandas as pd
3
+
4
+
5
def compute_credit_score(
    coef_dict,
    num_credit_card, changed_credit_limit, delay_from_due_date, interest_rate,
    outstanding_debt, credit_mix_good, credit_mix_standard
):
    """Compute a linear credit score from logistic-regression coefficients.

    Args:
        coef_dict (dict): Feature name -> logistic regression coefficient.
        num_credit_card (float): Number of credit cards.
        changed_credit_limit (float): Change in credit limit.
        delay_from_due_date (float): Delay from the due date.
        interest_rate (float): Interest rate.
        outstanding_debt (float): Outstanding debt.
        credit_mix_good (float): Good credit-mix indicator.
        credit_mix_standard (float): Standard credit-mix indicator.
    Returns:
        float: The weighted sum of the features (the credit score).
    """
    # Pair each coefficient name with its value; summation order matches the
    # argument order above.
    feature_values = {
        'Num_Credit_Card': num_credit_card,
        'Changed_Credit_Limit': changed_credit_limit,
        'Delay_from_due_date': delay_from_due_date,
        'Interest_Rate': interest_rate,
        'Outstanding_Debt': outstanding_debt,
        'Credit_Mix_Good': credit_mix_good,
        'Credit_Mix_Standard': credit_mix_standard,
    }
    return sum(coef_dict[name] * value for name, value in feature_values.items())
34
+
35
+
36
def real_data_credit_model(X_train, y_train, X_test):
    """Train a logistic regression on real data; score and classify the test set.

    Args:
        X_train (pd.DataFrame): Training features.
        y_train (pd.Series): Training target.
        X_test (pd.DataFrame): Test features with the same columns as X_train.
    Returns:
        tuple: (pd.Series of linear credit scores for X_test,
        ndarray of predicted class labels for X_test).
    """
    model = LogisticRegression(
        max_iter=1000,
        class_weight='balanced'  # compensate class imbalance in the real data
    )
    model.fit(X_train, y_train)

    # Linear "credit score": dot product of each test row with the first
    # class's coefficient vector — a vectorized replacement for the original
    # per-row iterrows() loop. Assumes X_test's columns are exactly the
    # trained features (true for preprocess_real_data output).
    # NOTE(review): coef_[0] is only the first class's weights in a
    # multiclass fit — confirm this is the intended score definition.
    coef = pd.Series(model.coef_[0], index=X_train.columns)
    score = X_test[list(coef.index)].dot(coef).reset_index(drop=True)

    classification = model.predict(X_test)

    return score, classification
66
+
67
+
68
def synthetic_data_credit_model(X_train, y_train, X_test):
    """Train a logistic regression on synthetic data; score and classify the
    real test set.

    Args:
        X_train (pd.DataFrame): Synthetic training features.
        y_train (pd.Series): Synthetic training target.
        X_test (pd.DataFrame): Real test features with the same columns.
    Returns:
        tuple: (pd.Series of linear credit scores for X_test,
        ndarray of predicted class labels for X_test).
    """
    # No class_weight here: the synthetic set is sampled per class.
    # NOTE(review): confirm the asymmetry with real_data_credit_model
    # (which uses class_weight='balanced') is intentional.
    model = LogisticRegression(
        max_iter=1_000,
    )
    model.fit(X_train, y_train)

    # Vectorized linear score (replaces the original per-row iterrows()
    # loop); assumes X_test's columns are exactly the trained features.
    coef = pd.Series(model.coef_[0], index=X_train.columns)
    score = X_test[list(coef.index)].dot(coef).reset_index(drop=True)

    classification = model.predict(X_test)

    return score, classification
data/external/.gitkeep ADDED
File without changes
data/interim/.gitkeep ADDED
File without changes
data/interim/modifying_data.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd

# One-off script: expand the multi-label 'Type_of_Loan' column of the raw
# training data into one binary indicator column per loan type.
# NOTE(review): this path is suspect — from this script's directory
# (data/interim/) the raw file lives at '../raw/train_clean.csv', and from
# the repo root at 'data/raw/train_clean.csv'; confirm the intended CWD.
df_original = pd.read_csv("../data/raw/train_clean.csv")
5
+
6
def clean_loans(text):
    """Split a 'Type_of_Loan' cell into a deduplicated list of loan names.

    Args:
        text (str | float): Raw cell value; may be NaN for missing entries.
    Returns:
        list[str]: Unique loan names, in order of first appearance.
    """
    if pd.isna(text):
        return []

    # Entries are comma-separated with the final one joined by " and "
    # (or ", and"); normalising " and " to ", " makes a single split work.
    text = text.replace(" and ", ", ")
    loans = [loan.strip() for loan in text.split(",")]
    loans = [loan for loan in loans if loan != ""]

    # dict.fromkeys dedupes while preserving first-seen order; the original
    # list(set(...)) produced a nondeterministic order between runs.
    return list(dict.fromkeys(loans))
15
+
16
+
17
df_original["Loan_List"] = df_original["Type_of_Loan"].apply(clean_loans)


# Collect every loan type seen anywhere, then fix a deterministic order:
# iterating a raw set would make the output column order (and thus the
# saved CSV schema) vary between runs.
all_loans = set()
for row in df_original["Loan_List"]:
    all_loans.update(row)
all_loans = sorted(all_loans)

print(all_loans)

# Create one binary indicator column per loan type.
for loan in all_loans:
    df_original[loan] = df_original["Loan_List"].apply(lambda x: int(loan in x))

# Drop the raw multi-label column and the intermediate list column.
df_original = df_original.drop(columns=["Type_of_Loan", "Loan_List"])

# Save the expanded dataset next to this script.
output_path = "./train_clean_type.csv"
df_original.to_csv(output_path, index=False)

print(f" File saved to: {output_path}")
print(f"shape: {df_original.shape}")
print("New columns addeeeeeddd:", list(all_loans))
data/interim/train_clean_type.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:537dc0040723321f7b968d65fd49d5ddf666a01a2cfc935d76ca00bc26731d47
3
+ size 16601166
data/processed/.gitkeep ADDED
File without changes
data/processed/v2/train_balanced_mixed.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b741acf7cc7512f40e0013e32b4e77911763ba07a53650bd27236615ed30df1f
3
+ size 27855735
data/processed/v2/train_balanced_synthetic.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ac8bd3b577f29b3752a7cc64a6f9c960280e565bdf31fbb225df33947811084
3
+ size 18773470
data/processed/v2/train_balanced_synthetic2.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:282f24dc0094bc5545000890516662cd3bba2c25c134612d8104aa5762ba718d
3
+ size 18774085
data/processed/v4/real_test_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/processed/v4/real_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d17e48e95a445d0829bd6884db9a39472792e93e0e5130fcd7040bb6d95daccc
3
+ size 12081755
data/processed/v4/synthetic_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0addf8d57b4ef9b0b32db4b53382a29f787e47cfb5cbac0359127cfb1b8ca66b
3
+ size 18017035
data/raw/.gitkeep ADDED
File without changes
data/raw/train_clean.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afe6a49d21938d60d482b326b821cea6dcfa41f55f7f0cf15a3a517bf590403c
3
+ size 20561019
data_generation.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sdv.single_table import CTGANSynthesizer
3
+
4
+
5
def generate_synthetic_training_data(n=30_000, model_dir="../models/v4"):
    """Generate synthetic training data with pre-trained per-class CTGAN models.

    Args:
        n (int, optional): Number of samples to generate for each credit-score
            category. Defaults to 30_000.
        model_dir (str, optional): Directory holding the pre-trained CTGAN
            synthesizers (synth_good.pkl, synth_poor.pkl, synth_standard.pkl).
            Defaults to "../models/v4".
    Returns:
        pd.DataFrame: The shuffled synthetic training data (3 * n rows).
    """
    # NOTE(review): the default relative path assumes the process CWD is one
    # level below the repo root — confirm against how the app is launched.
    frames = []
    for category in ("good", "poor", "standard"):
        synthesizer = CTGANSynthesizer.load(f"{model_dir}/synth_{category}.pkl")
        frames.append(synthesizer.sample(n))

    full_data = pd.concat(frames, ignore_index=True)
    # Shuffle so the three classes are interleaved rather than blocked;
    # intentionally unseeded so each call yields a fresh dataset.
    shuffled_data = full_data.sample(frac=1).reset_index(drop=True)
    return shuffled_data
data_preprocessing.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def preprocess_real_data(train, test, target):
    """Select the modeling features from the real train/test sets and
    one-hot-encode 'Credit_Mix'.

    Args:
        train (pd.DataFrame): The real training dataset (not modified).
        test (pd.DataFrame): The real testing dataset (not modified).
        target (str): The name of the target column.
    Returns:
        tuple: (X_train, y_train, X_test, y_test) with 'Outstanding_Debt'
        rescaled to thousands and 'Credit_Mix' dummy-encoded with the first
        category dropped.
    """
    cols = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target
    ]

    # Copy so the callers' dataframes are not mutated: the original divided
    # Outstanding_Debt in place, corrupting the data if called twice on the
    # same frame.
    train = train[cols].copy()
    test = test[cols].copy()

    # Rescale debt to thousands to keep it on a comparable magnitude.
    train['Outstanding_Debt'] = train['Outstanding_Debt'] / 1000
    test['Outstanding_Debt'] = test['Outstanding_Debt'] / 1000

    # Encode with the training set's category list so train and test always
    # produce identical dummy columns (same names, order, and dropped
    # baseline) even if the test set is missing a category.
    categories = sorted(train['Credit_Mix'].dropna().unique())
    train['Credit_Mix'] = pd.Categorical(train['Credit_Mix'], categories=categories)
    test['Credit_Mix'] = pd.Categorical(test['Credit_Mix'], categories=categories)
    train = pd.get_dummies(train, columns=['Credit_Mix'], drop_first=True)
    test = pd.get_dummies(test, columns=['Credit_Mix'], drop_first=True)

    X_real_train = train.drop(columns=[target])
    y_real_train = train[target]

    X_real_test = test.drop(columns=[target])
    y_real_test = test[target]

    return X_real_train, y_real_train, X_real_test, y_real_test
40
+
41
+
42
def preprocess_synthetic_data(synthetic_data, target):
    """Select the modeling features from a synthetic dataset and one-hot-encode
    'Credit_Mix'.

    Args:
        synthetic_data (pd.DataFrame): The synthetic dataset (not modified).
        target (str): The name of the target column.
    Returns:
        tuple: (X, y) with 'Outstanding_Debt' rescaled to thousands and
        'Credit_Mix' dummy-encoded with the first category dropped.
    """
    cols = [
        'Num_Credit_Card',
        'Changed_Credit_Limit',
        'Delay_from_due_date',
        'Interest_Rate',
        'Credit_Mix',
        'Outstanding_Debt',
        target
    ]

    # Copy so the caller's dataframe is not mutated: the original rescaled
    # Outstanding_Debt in place on the caller's frame.
    synthetic_data = synthetic_data[cols].copy()

    # Keep the debt feature on the same scale used for the real data.
    synthetic_data['Outstanding_Debt'] = synthetic_data['Outstanding_Debt'] / 1000

    synthetic_data = pd.get_dummies(
        synthetic_data, columns=['Credit_Mix'], drop_first=True
    )

    X_synthetic_train = synthetic_data.drop(columns=[target])
    y_synthetic_train = synthetic_data[target]

    return X_synthetic_train, y_synthetic_train
models/.gitkeep ADDED
File without changes
models/v2/model_good.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:709c2649ce4180a137e34382ae3239a8f6b69e7d1e3371434865bd6879eb7ed9
3
+ size 4194460
models/v2/model_poor.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af85d29197a3d4845b8534605c4ccd848b9e27de189b003889e361fc4a03a902
3
+ size 5667880
models/v2/model_standard.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af53dccff9f076a49ef6b2fcf0dc91fdab2815ed8d8fffcb693fdd6eab250bd2
3
+ size 8651019
models/v4/synth_good.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9beb0dc022c97a97f3815a8038258b8269c1c2882bbc851d280ddbf2d3e0dca
3
+ size 2458679
models/v4/synth_poor.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30af7c40ed01cf22acf77a33bce931dbe166e7ee25890e38e3feb27019140467
3
+ size 2951615
models/v4/synth_standard.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6d5e4bb29162a02f59f25bd8f4db5ebd286b9844eef20e0067641a20a911d6a
3
+ size 3941298
visualization.py ADDED
@@ -0,0 +1,262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import pandas as pd
4
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
5
+
6
# Global dark Matplotlib theme: white text and ticks on near-black axes so
# figures blend into the dashboard's dark background.
plt.rcParams['figure.facecolor'] = '#1F2937'
plt.rcParams['axes.facecolor'] = '#0B0F19'
plt.rcParams['text.color'] = 'white'
plt.rcParams['axes.labelcolor'] = 'white'
plt.rcParams['xtick.color'] = 'white'
plt.rcParams['ytick.color'] = 'white'
12
+
13
+
14
def plot_feature_distributions(real_data, synthetic_data):
    """Plot overlaid real-vs-synthetic histograms, one subplot per feature.

    Args:
        real_data (pd.DataFrame): Real feature matrix.
        synthetic_data (pd.DataFrame): Synthetic feature matrix with the
            same columns.
    Returns:
        matplotlib.figure.Figure: Grid figure (3 columns) of histograms.
    """
    features = real_data.columns.to_list()

    n_cols = 3
    n_rows = (len(features) + n_cols - 1) // n_cols  # ceil division

    # squeeze=False keeps `axes` 2-D even for a single row, so the
    # axes[row, col] indexing below cannot fail when there are <= 3 features.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)

    for i, feature in enumerate(features):
        ax = axes[i // n_cols, i % n_cols]

        sns.histplot(
            real_data[feature],
            bins=30,
            color='skyblue',
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            ax=ax
        )

        sns.histplot(
            synthetic_data[feature],
            bins=30,
            color='indianred',
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            ax=ax
        )

        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        # Legend order matches plotting order: real first, synthetic second.
        ax.legend(['Real Data', 'Synthetic Data'])

    # Remove unused trailing axes in the last row. Using len(features) as the
    # start also handles an empty feature list (the original used i + 1,
    # which raised NameError when the loop never ran).
    for j in range(len(features), n_rows * n_cols):
        fig.delaxes(axes[j // n_cols, j % n_cols])

    plt.tight_layout()
    return fig
59
+
60
+
61
def get_metrics_df(y_true, y_real_pred, y_synth_pred):
    """Build a dataframe of accuracy/precision/recall/F1 for both models.

    Args:
        y_true: Ground-truth labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
    Returns:
        pd.DataFrame: One row per model, weighted-average metrics as columns.
    """
    def _metric_row(y_pred):
        # Weighted averages to account for class imbalance.
        return (
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='weighted'),
            recall_score(y_true, y_pred, average='weighted'),
            f1_score(y_true, y_pred, average='weighted'),
        )

    real_row = _metric_row(y_real_pred)
    synth_row = _metric_row(y_synth_pred)

    return pd.DataFrame({
        'Model': ['Real Data Model', 'Synthetic Data Model'],
        'Accuracy': [real_row[0], synth_row[0]],
        'Precision': [real_row[1], synth_row[1]],
        'Recall': [real_row[2], synth_row[2]],
        'F1-Score': [real_row[3], synth_row[3]],
    })
82
+
83
+
84
def plot_comparative_credit_score_distribution(
    real_scores,
    synth_scores,
    bins=50,
    title='Comparative Credit Score Distribution: Real vs Synthetic Models'
):
    """Plot the two models' predicted-score histograms side by side.

    Args:
        real_scores (pd.Series): Scores from the real-data model.
        synth_scores (pd.Series): Scores from the synthetic-data model.
        bins (int, optional): Histogram bin count. Defaults to 50.
        title (str, optional): Overall figure title.
    Returns:
        matplotlib.figure.Figure
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
    # Fix: the title parameter was previously accepted but never applied.
    fig.suptitle(title)

    panels = (
        (real_scores, 'skyblue', 'Real-Data Model Score Distribution'),
        # indianred for the synthetic model, matching the real/synthetic
        # palette used in plot_feature_distributions (both panels were
        # previously skyblue and indistinguishable at a glance).
        (synth_scores, 'indianred', 'Synthetic-Data Model Score Distribution'),
    )
    for ax, (scores, color, panel_title) in zip(axes, panels):
        sns.histplot(
            scores,
            bins=bins,
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            color=color,
            ax=ax
        )
        ax.set_title(panel_title)
        ax.set_xlabel('Predicted Credit Score')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    return fig
122
+
123
+
124
def plot_comparison_table(
    y_true, y_real_pred, y_synth_pred,
    title='Model Comparison: Real Data vs Synthetic Data'
):
    """Render the two models' metrics as a styled Matplotlib table figure.

    Args:
        y_true: Ground-truth labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
        title (str, optional): Figure title.
    Returns:
        matplotlib.figure.Figure
    """
    metrics_df = get_metrics_df(y_true, y_real_pred, y_synth_pred)
    display_df = metrics_df.copy().round(4).set_index('Model')

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis('off')
    # Fix: the title parameter was previously accepted but never applied.
    fig.suptitle(title)

    table = ax.table(
        cellText=display_df.values,
        rowLabels=display_df.index,
        colLabels=display_df.columns,
        cellLoc='center',
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.9)

    # Header row: blue background, bold white text, white grid lines.
    for j in range(len(display_df.columns)):
        header = table[(0, j)]
        header.set_facecolor('#1F77B4')
        header.set_text_props(color='white', weight='bold')
        header.set_edgecolor('white')
        header.set_linewidth(1)

    # Body rows: alternate two dark shades. The original ternary used the
    # same color in both branches, so rows never actually alternated.
    for i in range(1, len(display_df.index) + 1):
        bg = '#1F2937' if i % 2 else '#0B0F19'

        row_label = table[(i, -1)]
        row_label.set_text_props(color='white', weight='bold')
        row_label.set_facecolor(bg)
        row_label.set_edgecolor('white')
        row_label.set_linewidth(1)

        for j in range(len(display_df.columns)):
            cell = table[(i, j)]
            cell.set_facecolor(bg)
            cell.set_text_props(color='white')
            cell.set_edgecolor('white')
            cell.set_linewidth(1)

    plt.tight_layout()
    return fig
168
+
169
+
170
def plot_comparative_confusion_matrices(
    y_true,
    y_pred_real,
    y_pred_synth,
    labels=None,
    normalize=False,
    cmap='Blues'
):
    """Draw side-by-side confusion matrices for the two models.

    Args:
        y_true: Ground-truth labels.
        y_pred_real: Real-data model predictions.
        y_pred_synth: Synthetic-data model predictions.
        labels (list, optional): Label order for both matrix axes.
        normalize (bool, optional): If True, normalize each row to sum to 1.
        cmap (str, optional): Heatmap colormap. Defaults to 'Blues'.
    Returns:
        matplotlib.figure.Figure
    """
    cm_real = confusion_matrix(y_true, y_pred_real, labels=labels)
    cm_synth = confusion_matrix(y_true, y_pred_synth, labels=labels)

    def _as_plot_matrix(cm):
        # Row-normalize when requested, guarding the denominator so a class
        # absent from y_true yields zeros instead of 0/0 -> NaN.
        if not normalize:
            return cm
        row_sums = cm.sum(axis=1, keepdims=True).astype(float)
        row_sums[row_sums == 0] = 1.0
        return cm.astype(float) / row_sums

    fmt = '.2f' if normalize else 'd'

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for ax, cm, model_name in zip(
        axes, (cm_real, cm_synth), ("Real Data", "Synthetic Data")
    ):
        sns.heatmap(
            _as_plot_matrix(cm), annot=True, fmt=fmt, cmap=cmap,
            xticklabels=labels, yticklabels=labels, ax=ax
        )
        ax.set_title(f"{model_name} Confusion Matrix")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

    plt.tight_layout()
    return fig
210
+
211
+
212
def plot_comparative_credit_score_distribution_by_actual_class(
    y_true,
    real_scores,
    synth_scores,
    color_map,
    label_order=None,
    bins=50,
):
    """Overlay per-actual-class score histograms for both models, side by side.

    Args:
        y_true: Actual class labels, positionally aligned with both score
            series.
        real_scores (pd.Series): Real-data model scores.
        synth_scores (pd.Series): Synthetic-data model scores.
        color_map (dict): Class label -> plot color.
        label_order (list, optional): Class plotting order. Defaults to the
            order of first appearance in y_true (the original raised
            TypeError when this was left as None).
        bins (int, optional): Histogram bin count. Defaults to 50.
    Returns:
        matplotlib.figure.Figure
    """
    fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16, 5), sharey=True)

    y_true_arr = pd.Series(y_true).values
    if label_order is None:
        # Fix: derive a sensible default instead of iterating over None.
        label_order = list(pd.unique(y_true_arr))

    for label in label_order:
        # Boolean mask applied positionally to both score series.
        # NOTE(review): assumes scores are positionally aligned with y_true —
        # confirm upstream score computation preserves row order.
        mask = (y_true_arr == label)

        for scores, ax in ((real_scores, ax_left), (synth_scores, ax_right)):
            sns.histplot(
                scores[mask],
                bins=bins,
                stat='count',
                element='step',
                fill=True,
                alpha=0.2,
                color=color_map.get(label, None),
                label=label,
                ax=ax
            )

    ax_left.set_title('Real-Data Model: Actual Class Distribution')
    ax_left.set_xlabel('Predicted Credit Score')
    ax_left.set_ylabel('Frequency')
    ax_left.legend(title='Actual Class')

    ax_right.set_title('Synthetic-Data Model: Actual Class Distribution')
    ax_right.set_xlabel('Predicted Credit Score')
    ax_right.set_ylabel('Frequency')
    ax_right.legend(title='Actual Class')

    plt.tight_layout()
    return fig