Spaces:

luisejdm
/

Proyecto2_Deep_Learning

Sleeping

App Files Files Community

Proyecto2_Deep_Learning / app.py

luisejdm

Update app.py

9301d5d verified about 1 month ago

raw

history blame contribute delete

5.61 kB

	import gradio as gr
	import pandas as pd

	from data_generation import generate_synthetic_training_data, evaluate_synthetic_data
	from data_preprocessing import preprocess_real_data, preprocess_synthetic_data
	from credit_models import real_data_credit_model, synthetic_data_credit_model
	from visualization import (
	plot_feature_distributions,
	plot_comparative_credit_score_distribution,
	plot_comparison_table,
	plot_comparative_confusion_matrices,
	plot_comparative_credit_score_distribution_by_actual_class,
	plot_evaluation_table,
	get_metrics_df,
	)

	# Target name handed to the preprocessing helpers (presumably the label column).
	TARGET = "Credit_Score"

	# Credit-score classes, in the order used for the summary index and plot labels.
	LABEL_ORDER = ["Good", "Standard", "Poor"]

	# Hex colour assigned to each class in the per-class score distribution plot.
	COLOR_MAP = {
	    "Good": "#28B463",  # green
	    "Standard": "#F1C40F",  # yellow
	    "Poor": "#E74C3C",  # red
	}

	# Startup work, executed once when the module is loaded: read the real
	# train/test CSVs and split them with the project's preprocessing helper.
	real_train = pd.read_csv("data/processed/v4/real_train_data.csv")
	real_test = pd.read_csv("data/processed/v4/real_test_data.csv")

	(
	    X_real_train,
	    y_real_train,
	    X_real_test,
	    y_real_test,
	) = preprocess_real_data(real_train, real_test, TARGET)

	# The real-data model is also fitted only once here; its outputs on the real
	# test set are reused by every later run of the analysis.
	(
	    real_scores,
	    real_classification,
	) = real_data_credit_model(X_real_train, y_real_train, X_real_test)

	def _quality_summary_row(real_subset, synth_subset, categorical_cols, numeric_cols):
	    """Compare one class's real and synthetic rows and format the summary entry.

	    Returns a dict with the keys ``ks_passed``, ``mean_ks_stat``, ``chi_passed``
	    and ``mean_corr_diff``. Each ``*_passed`` flag is true only when the
	    corresponding pass rate equals 1.0; ``chi_passed`` stays ``None`` when
	    ``evaluate_synthetic_data`` reports no chi pass rate.
	    """
	    ks_rate, ks_mean, chi_rate, corr_diff = evaluate_synthetic_data(
	        real_subset, synth_subset, categorical_cols, numeric_cols
	    )
	    return {
	        "ks_passed": ks_rate == 1.0,
	        "mean_ks_stat": round(ks_mean, 4),
	        "chi_passed": None if chi_rate is None else chi_rate == 1.0,
	        "mean_corr_diff": round(corr_diff, 4),
	    }


	def run_analysis():
	    """Regenerate synthetic data, retrain the synthetic model and build all outputs.

	    Relies on the module-level real train/test splits and on the real-data
	    model outputs computed at startup.

	    Returns:
	        A 7-tuple, in this order: the results of ``plot_feature_distributions``,
	        ``plot_evaluation_table``, ``plot_comparative_credit_score_distribution``,
	        ``plot_comparative_credit_score_distribution_by_actual_class``,
	        ``plot_comparison_table`` and ``plot_comparative_confusion_matrices``,
	        followed by the result of ``get_metrics_df`` rounded to 4 decimals.
	    """
	    # Request one third of the real training row count.
	    # NOTE(review): the previous comment described this as matching the real
	    # training size — presumably ``n`` is applied per class (three classes);
	    # confirm against generate_synthetic_training_data.
	    n_synthetic = len(X_real_train) // 3
	    synthetic_frame = generate_synthetic_training_data(n=n_synthetic)
	    X_synth_train, y_synth_train = preprocess_synthetic_data(synthetic_frame, TARGET)

	    # Split the real feature columns into categorical and numeric groups by
	    # dtype; columns whose dtype is in neither list are left out of both.
	    column_dtypes = {col: X_real_train[col].dtype for col in X_real_train.columns}
	    categorical_cols = [
	        col
	        for col, dtype in column_dtypes.items()
	        if dtype in ("object", "bool", "uint8")
	    ]
	    numeric_cols = [
	        col for col, dtype in column_dtypes.items() if dtype in ("int64", "float64")
	    ]

	    feature_fig = plot_feature_distributions(X_real_train, X_synth_train)

	    # One summary row per class, comparing only the rows that carry that label.
	    per_class_rows = [
	        _quality_summary_row(
	            X_real_train[y_real_train == label],
	            X_synth_train[y_synth_train == label],
	            categorical_cols,
	            numeric_cols,
	        )
	        for label in LABEL_ORDER
	    ]
	    summary_fig = plot_evaluation_table(
	        pd.DataFrame(per_class_rows, index=LABEL_ORDER)
	    )

	    # Train on synthetic data only, but predict on the same real test set as
	    # the real-data model so both sets of outputs are directly comparable.
	    synth_scores, synth_classification = synthetic_data_credit_model(
	        X_synth_train, y_synth_train, X_real_test
	    )

	    score_fig = plot_comparative_credit_score_distribution(real_scores, synth_scores)
	    score_by_class_fig = plot_comparative_credit_score_distribution_by_actual_class(
	        y_real_test,
	        real_scores,
	        synth_scores,
	        color_map=COLOR_MAP,
	        label_order=LABEL_ORDER,
	    )
	    metrics_fig = plot_comparison_table(
	        y_real_test, real_classification, synth_classification
	    )
	    confusion_fig = plot_comparative_confusion_matrices(
	        y_real_test,
	        real_classification,
	        synth_classification,
	        labels=LABEL_ORDER,
	    )

	    metrics_df = get_metrics_df(
	        y_real_test, real_classification, synth_classification
	    ).round(4)

	    return (
	        feature_fig,
	        summary_fig,
	        score_fig,
	        score_by_class_fig,
	        metrics_fig,
	        confusion_fig,
	        metrics_df,
	    )


	def _plot_section(description):
	    """Add a Markdown description followed by a row holding one unlabeled plot.

	    Intended to be called inside the ``with gr.Blocks(...)`` context below.
	    Returns the created ``gr.Plot`` so it can be used as an event output.
	    """
	    gr.Markdown(description)
	    with gr.Row():
	        return gr.Plot(label='')


	# Dashboard layout: a header, one trigger button, then six description + plot
	# sections that are filled in when the button is clicked.
	with gr.Blocks(title="Credit Score Model Dashboard", theme=gr.themes.Soft()) as demo:
	    gr.Markdown(
	        """
	        # Credit Score Model Dashboard
	        Compare a Real-Data Model vs a Synthetic-Data Model trained with CTGAN-generated data.
	        Click the button to regenerate synthetic data and retrain the synthetic model.
	        """
	    )

	    run_btn = gr.Button(
	        "Generate New Synthetic Data & Analyze",
	        variant="primary",
	        size="lg",
	    )

	    plot_feature_dist = _plot_section(
	        """
	        ## Feature Distribution Comparison

	        Below are the distributions of the features in the real vs synthetic training datasets.
	        """
	    )

	    plot_summary = _plot_section(
	        """
	        ## Generated Data Quality Summary

	        Below is a summary of the data quality evaluation comparing the synthetic training data to the real training data across multiple metrics.
	        """
	    )

	    plot_metrics = _plot_section(
	        """
	        ## Credit Models Metrics

	        Below are the metrics for the real-data and synthetic-data models.
	        """
	    )

	    plot_score_dist = _plot_section(
	        """
	        ## Credit Score Distribution Comparison
	        Below are the distributions of the predicted credit scores for the real-data and synthetic-data models.
	        """
	    )

	    plot_score_by_class = _plot_section(
	        """
	        ## Credit Score Distribution by Actual Class
	        Below are the distributions of the predicted credit scores for each actual class (Good, Standard, Poor) for both models.
	        """
	    )

	    plot_cm = _plot_section(
	        """
	        ## Confusion Matrix Comparison
	        Below are the confusion matrices for the real-data and synthetic-data models.
	        """
	    )

	    # The outputs follow the order of run_analysis' return values, which is
	    # not the on-screen order (the metrics section is shown third).
	    # NOTE(review): run_analysis also returns a seventh value (the rounded
	    # metrics table) that has no output component here — confirm whether it
	    # should be displayed or dropped.
	    run_btn.click(
	        run_analysis,
	        inputs=[],
	        outputs=[
	            plot_feature_dist,
	            plot_summary,
	            plot_score_dist,
	            plot_score_by_class,
	            plot_metrics,
	            plot_cm,
	        ],
	    )

	# Start the Gradio server only when this file is executed as a script, so
	# that merely importing the module (e.g. from tooling or tests) does not
	# launch a server as a side effect.
	if __name__ == "__main__":
	    demo.launch()