Spaces:
Sleeping
Sleeping
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import pandas as pd | |
| from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix | |
# Dark theme applied to every figure created after import: dark figure and
# axes backgrounds with white text, axis labels and tick marks.
plt.rcParams.update({
    'figure.facecolor': '#1F2937',
    'axes.facecolor': '#0B0F19',
    'text.color': 'white',
    'axes.labelcolor': 'white',
    'xtick.color': 'white',
    'ytick.color': 'white',
})
def plot_feature_distributions(real_data, synthetic_data):
    """Overlay real and synthetic histograms for every column of ``real_data``.

    One subplot is drawn per column, three subplots per row. Each subplot
    shows the real values in sky blue and the synthetic values in red, both
    as 30-bin step histograms of counts.

    Args:
        real_data: DataFrame whose columns define the features to plot.
        synthetic_data: Object indexed by the same column names as
            ``real_data`` (presumably a DataFrame with matching columns --
            confirm against the caller).

    Returns:
        The matplotlib Figure containing the grid of histograms.
    """
    features = real_data.columns.to_list()
    n_cols = 3
    n_rows = (len(features) + n_cols - 1) // n_cols

    # squeeze=False keeps ``axes`` two-dimensional even for a single row, so
    # frames with three or fewer columns no longer break the indexing.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False
    )
    # Row-major flattening matches the (i // n_cols, i % n_cols) placement.
    flat_axes = axes.ravel()

    series_styles = (
        (real_data, 'skyblue'),
        (synthetic_data, 'indianred'),
    )
    for ax, feature in zip(flat_axes, features):
        for frame, color in series_styles:
            sns.histplot(
                frame[feature],
                bins=30,
                color=color,
                stat='count',
                element='step',
                fill=True,
                alpha=0.2,
                ax=ax,
            )
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        ax.legend(['Real Data', 'Synthetic Data'])

    # Remove the unused trailing cells of the grid. Slicing by the feature
    # count avoids depending on the loop variable after the loop has ended.
    for unused_ax in flat_axes[len(features):]:
        fig.delaxes(unused_ax)

    fig.tight_layout()
    return fig
def get_metrics_df(y_true, y_real_pred, y_synth_pred):
    """Tabulate classification metrics for the real- and synthetic-data models.

    Both prediction sets are scored against the same ``y_true``. Precision,
    recall and F1 use ``average='weighted'``.

    Args:
        y_true: Ground-truth labels.
        y_real_pred: Predictions from the model labelled 'Real Data Model'.
        y_synth_pred: Predictions from the model labelled
            'Synthetic Data Model'.

    Returns:
        A DataFrame with one row per model and the columns 'Model',
        'Accuracy', 'Precision', 'Recall' and 'F1-Score'.
    """
    candidate_preds = (y_real_pred, y_synth_pred)
    weighted_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )

    columns = {
        'Model': ['Real Data Model', 'Synthetic Data Model'],
        'Accuracy': [accuracy_score(y_true, preds) for preds in candidate_preds],
    }
    for column_name, scorer in weighted_scorers:
        columns[column_name] = [
            scorer(y_true, preds, average='weighted')
            for preds in candidate_preds
        ]
    return pd.DataFrame(columns)
def plot_comparative_credit_score_distribution(
    real_scores,
    synth_scores,
    bins=50,
    title='Comparative Credit Score Distribution: Real vs Synthetic Models'
):
    """Draw side-by-side score histograms for the two models.

    The left panel shows ``real_scores`` and the right panel shows
    ``synth_scores``. Both panels share the y-axis so the counts can be
    compared directly.

    Args:
        real_scores: Scores produced by the real-data model.
        synth_scores: Scores produced by the synthetic-data model.
        bins: Number of histogram bins used in both panels.
        title: Figure-level title. Pass ``None`` or an empty string to
            omit it.

    Returns:
        The matplotlib Figure containing both panels.
    """
    fig, axes = plt.subplots(1, 2, figsize=(16, 5), sharey=True)

    panels = (
        (axes[0], real_scores, 'Real-Data Model Score Distribution'),
        (axes[1], synth_scores, 'Synthetic-Data Model Score Distribution'),
    )
    for ax, scores, panel_title in panels:
        sns.histplot(
            scores,
            bins=bins,
            stat='count',
            element='step',
            fill=True,
            alpha=0.2,
            color='skyblue',
            ax=ax,
        )
        ax.set_title(panel_title)
        ax.set_xlabel('Predicted Credit Score')
        ax.set_ylabel('Frequency')

    # The ``title`` argument was previously accepted but never drawn.
    if title:
        fig.suptitle(title, color='white', fontsize=16, weight='bold')

    fig.tight_layout()
    return fig
def plot_comparison_table(
    y_true, y_real_pred, y_synth_pred,
    title='Model Comparison: Real Data vs Synthetic Data'
):
    """Render the model-comparison metrics as a styled matplotlib table.

    The metrics come from ``get_metrics_df``. They are rounded to four
    decimals and shown with one row per model and one column per metric.

    Args:
        y_true: Ground-truth labels.
        y_real_pred: Predictions from the real-data model.
        y_synth_pred: Predictions from the synthetic-data model.
        title: Title drawn above the table. Pass ``None`` or an empty
            string to omit it.

    Returns:
        The matplotlib Figure containing the table.
    """
    header_fill = '#1F77B4'
    body_fill = '#0B0F19'

    def style_cell(cell, fill, **text_props):
        # Every cell gets white text and a thin white border.
        cell.set_facecolor(fill)
        cell.set_text_props(color='white', **text_props)
        cell.set_edgecolor('white')
        cell.set_linewidth(1)

    metrics_df = get_metrics_df(y_true, y_real_pred, y_synth_pred)
    display_df = metrics_df.copy().round(4).set_index('Model')
    n_body_rows = len(display_df.index)
    n_columns = len(display_df.columns)

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis('off')
    table = ax.table(
        cellText=display_df.values,
        rowLabels=display_df.index,
        colLabels=display_df.columns,
        cellLoc='center',
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)
    table.scale(1.2, 1.9)

    # Row 0 holds the column headers.
    for col in range(n_columns):
        style_cell(table[(0, col)], header_fill, weight='bold')

    # Column -1 holds the row labels (the model names).
    for row in range(1, n_body_rows + 1):
        style_cell(table[(row, -1)], body_fill, weight='bold')
        for col in range(n_columns):
            style_cell(table[(row, col)], body_fill)

    # The ``title`` argument was previously accepted but never drawn.
    if title:
        ax.set_title(title, color='white', fontsize=16, weight='bold', pad=12)

    fig.tight_layout()
    return fig
def plot_comparative_confusion_matrices(
    y_true,
    y_pred_real,
    y_pred_synth,
    labels=None,
    normalize=False,
    cmap='Blues'
):
    """Draw the confusion matrices of both models side by side.

    Args:
        y_true: Ground-truth labels.
        y_pred_real: Predictions from the real-data model.
        y_pred_synth: Predictions from the synthetic-data model.
        labels: Class labels, in display order. When ``None`` the sorted
            union of the values in ``y_true`` and both prediction sets is
            used, so both matrices share the same axes.
        normalize: When true, each row is divided by its total so cells
            show the fraction of that actual class; otherwise raw counts
            are shown.
        cmap: Colormap passed to the heatmaps.

    Returns:
        The matplotlib Figure containing both heatmaps.
    """
    if labels is None:
        # Use one label set for both matrices so they are directly
        # comparable and the tick labels show the actual class values.
        labels = sorted(set(y_true) | set(y_pred_real) | set(y_pred_synth))

    def row_normalized(cm):
        totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row total. Dividing by 1
        # leaves that row all zeros instead of producing NaN.
        totals[totals == 0] = 1
        return cm / totals

    cm_real = confusion_matrix(y_true, y_pred_real, labels=labels)
    cm_synth = confusion_matrix(y_true, y_pred_synth, labels=labels)
    if normalize:
        cm_real, cm_synth = row_normalized(cm_real), row_normalized(cm_synth)
        fmt = '.2f'
    else:
        fmt = 'd'

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    panels = (
        (axes[0], cm_real, "Real Data Confusion Matrix"),
        (axes[1], cm_synth, "Synthetic Data Confusion Matrix"),
    )
    for ax, matrix, panel_title in panels:
        sns.heatmap(
            matrix, annot=True, fmt=fmt, cmap=cmap,
            xticklabels=labels, yticklabels=labels, ax=ax
        )
        ax.set_title(panel_title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

    fig.tight_layout()
    return fig
def plot_comparative_credit_score_distribution_by_actual_class(
    y_true,
    real_scores,
    synth_scores,
    color_map,
    label_order=None,
    bins=50,
):
    """Plot each model's score distribution, split by actual class.

    For every class label, the scores of the samples whose true label
    equals it are drawn as a histogram: real-data model on the left,
    synthetic-data model on the right. The panels share the y-axis.

    Args:
        y_true: Ground-truth labels, aligned with both score collections.
        real_scores: Scores from the real-data model. Must support
            boolean-mask indexing (assumed to be a NumPy array or similar
            -- confirm against the caller).
        synth_scores: Scores from the synthetic-data model, same
            requirements as ``real_scores``.
        color_map: Mapping from class label to histogram colour. Labels
            missing from the mapping fall back to the default colour.
        label_order: Class labels in drawing order. When ``None``, the
            distinct values of ``y_true`` are used in order of first
            appearance.
        bins: Number of histogram bins.

    Returns:
        The matplotlib Figure containing both panels.
    """
    fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(16, 5), sharey=True)
    y_true_arr = pd.Series(y_true).values

    if label_order is None:
        # The default used to be iterated directly and raised TypeError.
        label_order = pd.unique(y_true_arr)

    for label in label_order:
        mask = (y_true_arr == label)
        for ax, scores in ((ax_left, real_scores), (ax_right, synth_scores)):
            sns.histplot(
                scores[mask],
                bins=bins,
                stat='count',
                element='step',
                fill=True,
                alpha=0.2,
                color=color_map.get(label, None),
                label=label,
                ax=ax,
            )

    panels = (
        (ax_left, 'Real-Data Model: Actual Class Distribution'),
        (ax_right, 'Synthetic-Data Model: Actual Class Distribution'),
    )
    for ax, panel_title in panels:
        ax.set_title(panel_title)
        ax.set_xlabel('Predicted Credit Score')
        ax.set_ylabel('Frequency')
        ax.legend(title='Actual Class')

    fig.tight_layout()
    return fig
def plot_evaluation_table(summary_df, title="Synthetic Data Evaluation Summary"):
    """Render ``summary_df`` as a styled matplotlib table with a title.

    Values are rounded to four decimals. Header cells are blue, body and
    row-label cells are dark, and all text and borders are white.

    Args:
        summary_df: DataFrame to display. Its index supplies the row labels
            and its columns supply the header.
        title: Title drawn above the table.

    Returns:
        The matplotlib Figure containing the table.
    """
    header_fill = "#1F77B4"
    body_fill = "#0B0F19"

    def paint(cell, fill, **text_props):
        # Shared look for every cell: white text and a thin white border.
        cell.set_facecolor(fill)
        cell.set_text_props(color="white", **text_props)
        cell.set_edgecolor("white")
        cell.set_linewidth(1)

    rounded = summary_df.copy().round(4)
    row_count = len(rounded.index)
    column_count = len(rounded.columns)

    fig, ax = plt.subplots(figsize=(18, 2))
    ax.axis("off")
    grid = ax.table(
        cellText=rounded.values,
        rowLabels=rounded.index,
        colLabels=rounded.columns,
        cellLoc="center",
        loc="center",
    )
    grid.auto_set_font_size(False)
    grid.set_fontsize(16)
    grid.scale(1.2, 1.9)

    # Row 0 is the header row.
    for col in range(column_count):
        paint(grid[(0, col)], header_fill, weight="bold")

    # Column -1 holds the row labels; the remaining cells are body cells.
    for row in range(1, row_count + 1):
        paint(grid[(row, -1)], body_fill, weight="bold")
        for col in range(column_count):
            paint(grid[(row, col)], body_fill)

    ax.set_title(title, color="white", fontsize=16, weight="bold", pad=12)
    plt.tight_layout()
    return fig