# Gradio app: predicts company bankruptcy risk with a pre-trained
# random-forest model, either for a single hand-entered record or for
# a batch of records uploaded as a Pickle file.
import gradio as gr
import pandas as pd
import joblib
import numpy as np
import shap
import matplotlib.pyplot as plt
# Path of the desired scaler to be tested, can be changed
PATH_SCALER = "standard_scaler_1.sav"
# Path of the desired model to be tested, can be changed
PATH_MODEL = "random_forest_1.sav"
# Select the type of interface, "Single" for manually inserting a record
# "Multiple" for automatically inserting multiple records from a pickle file
type_interface = "Single"
# Deserialize the fitted scaler and classifier saved at training time.
# NOTE(review): joblib.load executes pickle data — only load trusted files.
loaded_scaler = joblib.load(PATH_SCALER)
loaded_model = joblib.load(PATH_MODEL)
# Feature names attached to the model object at training time; used both to
# build the form fields and to select columns from uploaded files.
feature_names = loaded_model.feature_names
# Define what to do with data given from the user form
def classify_company(*feature_values):
    """Classify a single company record entered through the web form.

    feature_values: one numeric value per model feature, in the order of
    ``loaded_model.feature_names``.
    Returns a (class-probability dict, matplotlib figure) pair for Gradio.
    """
    # Wrap the form values into a one-row frame with the training columns.
    record = pd.DataFrame([feature_values], columns=loaded_model.feature_names, dtype=float)
    # Apply the exact scaling that was used during the training phase.
    record_scaled = loaded_scaler.transform(record)
    # Class probabilities for the single row.
    proba = loaded_model.predict_proba(record_scaled)[0]
    labelled_proba = {"Active": float(proba[0]), "Bankruptcy": float(proba[1])}
    # Explain the prediction with SHAP values for the tree model.
    tree_explainer = shap.TreeExplainer(loaded_model)
    record_shap = tree_explainer.shap_values(record_scaled)
    # Render the force plot onto the current matplotlib figure (deferred show);
    # note the plot displays the *unscaled* feature values alongside the
    # SHAP values computed on the scaled input.
    shap.force_plot(
        base_value=tree_explainer.expected_value[0],
        shap_values=record_shap[0],
        features=record,
        feature_names=loaded_model.feature_names,
        out_names=["Active", "Bankruptcy"],
        matplotlib=True,
        figsize=(30,7),
        show=False,
    )
    # Grab the figure SHAP drew on, then close it so state does not leak
    # into the next request.
    plt.tight_layout()
    figure = plt.gcf()
    plt.close()
    return labelled_proba, figure
# Define what to do with data given from the Pickle file
def classify_companies(file):
    """Predict bankruptcy probabilities for every record in a Pickle file.

    file: uploaded file object whose ``.name`` points at a pickled
    DataFrame containing (at least) every model feature as a column.
    Returns a (histogram figure, bucket-fraction dict) pair for Gradio.
    Raises ValueError when no valid records remain after cleaning.
    """
    # Read file as Pickle and keep only the columns the model was trained on.
    input_dataset = pd.read_pickle(file.name)
    input_dataset = input_dataset[feature_names]
    # Remove unused index
    input_dataset.reset_index(drop=True, inplace=True)
    # Force all the features to be numeric and drop rows with invalid values.
    for column in input_dataset.columns:
        input_dataset[column] = pd.to_numeric(input_dataset[column], errors='coerce')
    input_dataset.dropna(inplace=True)
    if input_dataset.empty:
        # Guard: without this, the bucket fractions below divide by zero.
        raise ValueError("No valid records found in the uploaded file.")
    # Scale it accordingly with the used scaler during the training phase.
    x = loaded_scaler.transform(input_dataset)
    # Probability of the positive ("Bankruptcy") class for each record.
    predictions_bankruptcy = loaded_model.predict_proba(x)[:, 1]
    # Histogram of the predicted bankruptcy probabilities.
    # (The original plt.legend() call was removed: no artist has a label,
    # so it only emitted a "no handles" warning.)
    fig = plt.figure(figsize=(15,7))
    plt.hist(predictions_bankruptcy, bins=50)
    plt.xlabel('Probability of bankruptcy', fontsize=25)
    plt.ylabel('Number of records', fontsize=25)
    plt.tick_params(axis='both', labelsize=25, pad=5)
    export_predictions_dict = _bucket_fractions(predictions_bankruptcy)
    # Return prediction plot and per-bucket fractions
    return fig, export_predictions_dict


def _bucket_fractions(probabilities):
    """Fraction of *probabilities* falling in each 10%-wide bucket.

    The last bucket is closed on the right so a probability of exactly
    1.0 is counted (the original open upper bound silently dropped it).
    Keys match the original hand-written labels byte-for-byte.
    """
    total = len(probabilities)
    fractions = {}
    for decile in range(10):
        low = decile / 10
        high = (decile + 1) / 10
        if decile == 9:
            # Closed on the right: include p == 1.0 in the final bucket.
            count = sum(low <= p <= 1.0 for p in probabilities)
        else:
            count = sum(low <= p < high for p in probabilities)
        label = f"Bankruptcy probability {decile * 10}-{decile * 10 + 10}%"
        fractions[label] = float(count / total)
    return fractions
# Single record interface
if type_interface == "Single":
    # Input file components
    # For each feature create an optional numeric field
    # NOTE(review): gr.inputs.* / gr.outputs.* is the legacy Gradio 2.x API;
    # it was removed in Gradio 4 — presumably this Space pins an old version.
    gradio_inputs = []
    for feature_name in feature_names:
        gradio_inputs.append(gr.inputs.Number(default=0.0, label=feature_name, optional=False))
    # Create output components: class probabilities plus the SHAP force plot.
    gradio_outputs = [gr.outputs.Label(num_top_classes = 2, label="Prediction probability"), gr.outputs.Plot(type="auto", label="SHAP values")]
    # Create the web app interface
    demo = gr.Interface(
        fn=classify_company,
        inputs=gradio_inputs,
        outputs=gradio_outputs,
        theme="dark"
    )
# Multiple records interface
else:
    # Input file component
    # (Italian: "The Pickle file must contain every field expected by the model.")
    gradio_description = "Il file in formato Pickle deve contenere tutti i campi previsti dal modello."
    gradio_input = gr.inputs.File(file_count="single", type="file", label="Pickle file", optional=False)
    # Output file component: probability histogram plus per-bucket fractions.
    gradio_output = [gr.outputs.Plot(type="auto", label="Prediction probabilities"), "label"]
    # Create the web app interface (live=True re-runs on every input change).
    demo = gr.Interface(
        fn=classify_companies,
        inputs=gradio_input,
        outputs=gradio_output,
        description=gradio_description,
        theme="dark",
        live=True
    )
# Start the web server; show_error surfaces Python exceptions in the UI.
demo.launch(show_error=True, inline=False)