File size: 5,791 Bytes
8b2fb3f
 
 
 
 
 
 
 
0d7b3b3
8b2fb3f
 
0d7b3b3
8b2fb3f
 
 
 
 
 
 
 
 
 
 
 
 
 
755a039
a3b9ac3
755a039
8b2fb3f
 
c8e2d7a
8b2fb3f
 
 
 
 
 
f5d5b92
8b2fb3f
 
 
f5d5b92
8b2fb3f
 
 
 
 
 
 
 
 
 
 
f5d5b92
8b2fb3f
 
 
f5d5b92
8b2fb3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2c255f
8b2fb3f
f2c255f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import gradio as gr
import pandas as pd
import joblib
import numpy as np
import shap
import matplotlib.pyplot as plt

# Path of the desired scaler to be tested, can be changed
PATH_SCALER = "standard_scaler_1.sav"

# Path of the desired model to be tested, can be changed
PATH_MODEL = "random_forest_1.sav"

# Select the type of interface, "Single" for manually inserting a record
# "Multiple" for automatically inserting multiple records from a pickle file
type_interface = "Single"

# Deserialize the scaler and the model saved during the training phase.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# from a trusted source.
loaded_scaler = joblib.load(PATH_SCALER)
loaded_model = joblib.load(PATH_MODEL)

# Ordered list of feature names the model was trained on.
# Presumably this attribute was attached to the model object at training
# time (it is not a standard scikit-learn attribute) — TODO confirm.
feature_names = loaded_model.feature_names

# Handler for the "Single" interface: one manually-entered record.
def classify_company(*feature_values):
    """Predict bankruptcy probability for a single record and explain it.

    Parameters
    ----------
    *feature_values : float-convertible values, one per model feature,
        in the order of ``loaded_model.feature_names``.

    Returns
    -------
    tuple(dict, matplotlib.figure.Figure)
        Class probabilities (``"Active"``/``"Bankruptcy"``) and a SHAP
        force plot of the per-feature contributions.
    """
    # Build a one-row DataFrame in the feature order expected by the model.
    record = pd.DataFrame([feature_values], columns=loaded_model.feature_names, dtype=float)

    # Apply the same scaling that was used during the training phase.
    record_scaled = loaded_scaler.transform(record)

    # Class probabilities for the single record (assumes exactly 2 classes).
    proba_active, proba_bankruptcy = loaded_model.predict_proba(record_scaled)[0]
    prediction_dict = {"Active": float(proba_active), "Bankruptcy": float(proba_bankruptcy)}

    # Explain the prediction with a tree-model SHAP explainer.
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(record_scaled)

    # Draw the force plot onto the current matplotlib figure; the raw
    # (unscaled) feature values are shown for readability.
    shap.force_plot(
        base_value=explainer.expected_value[0],
        shap_values=shap_values[0],
        features=record,
        feature_names=loaded_model.feature_names,
        out_names=["Active", "Bankruptcy"],
        matplotlib=True,
        figsize=(30, 7),
        show=False,
    )

    # Grab the figure SHAP drew on, then close it so Gradio owns rendering.
    plt.tight_layout()
    fig = plt.gcf()
    plt.close()

    return prediction_dict, fig

# Handler for the "Multiple" interface: a batch of records from a Pickle file.
def classify_companies(file):
    """Classify every record of an uploaded Pickle file.

    Parameters
    ----------
    file : uploaded-file wrapper exposing the path as ``file.name``.

    Returns
    -------
    tuple(matplotlib.figure.Figure, dict)
        Histogram of the predicted bankruptcy probabilities, and the
        fraction of records falling into each 10% probability bucket.
    """
    # Read file as Pickle and keep only the model's features, in order.
    # NOTE(review): unpickling runs arbitrary code — the file must come
    # from a trusted source.
    input_dataset = pd.read_pickle(file.name)
    input_dataset = input_dataset[feature_names]

    # Remove unused index
    input_dataset.reset_index(drop=True, inplace=True)

    # Force every feature to be numeric and drop rows with invalid values.
    for column in input_dataset.columns:
        input_dataset[column] = pd.to_numeric(input_dataset[column], errors='coerce')
    input_dataset.dropna(inplace=True)

    # Scale with the same scaler used during the training phase.
    x = loaded_scaler.transform(input_dataset)

    # Probability of the positive class ("Bankruptcy") for every record.
    predictions = loaded_model.predict_proba(x)
    predictions_bankruptcy = predictions[:, 1]

    # Generate the histogram of the predicted probabilities.
    # The hist is labelled so that plt.legend() has a handle to show
    # (an unlabelled legend call warns and renders nothing).
    fig = plt.figure(figsize=(15, 7))
    plt.hist(predictions_bankruptcy, bins=50, label='Bankruptcy probability')
    plt.xlabel('Probability of bankruptcy', fontsize=25)
    plt.ylabel('Number of records', fontsize=25)
    plt.legend(fontsize=15)
    plt.tick_params(axis='both', labelsize=25, pad=5)

    # Fraction of records per 10% probability bucket. Buckets are
    # half-open [low, high) except the last, which is closed so that a
    # probability of exactly 1.0 is counted (the original 0.90 <= p < 1
    # test silently dropped such records).
    # Guard against an empty dataset (everything dropped by dropna).
    total = max(len(predictions_bankruptcy), 1)
    export_predictions_dict = {}
    for i in range(10):
        low, high = i / 10, (i + 1) / 10
        if i == 9:
            count = sum(low <= p <= high for p in predictions_bankruptcy)
        else:
            count = sum(low <= p < high for p in predictions_bankruptcy)
        export_predictions_dict[f"Bankruptcy probability {i * 10}-{(i + 1) * 10}%"] = float(count / total)

    # Return prediction plot and the bucket summary.
    return fig, export_predictions_dict
    
    
 
 # Single record interface
if type_interface == "Single":
    # Input components: one optional-looking numeric field per model feature.
    # NOTE(review): gr.inputs.* / gr.outputs.* belong to the legacy Gradio
    # 2.x/early-3.x API (removed in later releases) — confirm the pinned
    # gradio version before upgrading.
    gradio_inputs = []
    for feature_name in feature_names:
        gradio_inputs.append(gr.inputs.Number(default=0.0, label=feature_name, optional=False))

    # Output components: class-probability label plus the SHAP force plot.
    gradio_outputs = [gr.outputs.Label(num_top_classes = 2, label="Prediction probability"), gr.outputs.Plot(type="auto", label="SHAP values")]

    # Create the web app interface for single-record classification.
    demo = gr.Interface(
        fn=classify_company,
        inputs=gradio_inputs,
        outputs=gradio_outputs,
        theme="dark"
    )

# Multiple records interface
else:
    # Input file component. The description is an Italian user-facing
    # string ("the Pickle file must contain all fields expected by the
    # model") and is rendered verbatim in the UI.
    gradio_description = "Il file in formato Pickle deve contenere tutti i campi previsti dal modello."
    gradio_input = gr.inputs.File(file_count="single", type="file", label="Pickle file", optional=False)

    # Output components: probability histogram plot plus a label summary
    # (the "label" shorthand maps to a default Label component).
    gradio_output = [gr.outputs.Plot(type="auto", label="Prediction probabilities"), "label"]


    # Create the web app interface for batch classification.
    # live=True re-runs classification as soon as a file is provided.
    demo = gr.Interface(
        fn=classify_companies,
        inputs=gradio_input,
        outputs=gradio_output,
        description=gradio_description,
        theme="dark",
        live=True
    )

    
# Start the web server; show_error surfaces handler exceptions in the UI.
demo.launch(show_error=True, inline=False)