File size: 5,791 Bytes
8b2fb3f
 
 
 
 
 
 
 
0d7b3b3
8b2fb3f
 
0d7b3b3
8b2fb3f
 
 
 
 
 
 
 
 
 
 
 
 
 
755a039
a3b9ac3
755a039
8b2fb3f
 
c8e2d7a
8b2fb3f
 
 
 
 
 
f5d5b92
8b2fb3f
 
 
f5d5b92
8b2fb3f
 
 
 
 
 
 
 
 
 
 
f5d5b92
8b2fb3f
 
 
f5d5b92
8b2fb3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f2c255f
8b2fb3f
f2c255f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import gradio as gr
import pandas as pd
import joblib
import numpy as np
import shap
import matplotlib.pyplot as plt

# Path of the desired scaler to be tested, can be changed
PATH_SCALER = "standard_scaler_1.sav"

# Path of the desired model to be tested, can be changed
PATH_MODEL = "random_forest_1.sav"

# Select the type of interface, "Single" for manually inserting a record
# "Multiple" for automatically inserting multiple records from a pickle file
type_interface = "Single"

# Deserialize the scaler and the model saved during the training phase.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# from a trusted source.
loaded_scaler = joblib.load(PATH_SCALER)
loaded_model = joblib.load(PATH_MODEL)

# Ordered list of feature names the model was trained on.
# Presumably this attribute was attached to the model object at training
# time (it is not a standard scikit-learn attribute) — TODO confirm.
feature_names = loaded_model.feature_names

# Handler for the "Single" interface: one manually-entered record.
def classify_company(*feature_values):
    """Predict bankruptcy probability for a single record and explain it.

    Parameters
    ----------
    *feature_values : float-convertible values, one per model feature,
        in the order of ``loaded_model.feature_names``.

    Returns
    -------
    tuple(dict, matplotlib.figure.Figure)
        Class probabilities (``"Active"``/``"Bankruptcy"``) and a SHAP
        force plot of the per-feature contributions.
    """
    # Build a one-row DataFrame in the feature order expected by the model.
    record = pd.DataFrame([feature_values], columns=loaded_model.feature_names, dtype=float)

    # Apply the same scaling that was used during the training phase.
    record_scaled = loaded_scaler.transform(record)

    # Class probabilities for the single record (assumes exactly 2 classes).
    proba_active, proba_bankruptcy = loaded_model.predict_proba(record_scaled)[0]
    prediction_dict = {"Active": float(proba_active), "Bankruptcy": float(proba_bankruptcy)}

    # Explain the prediction with a tree-model SHAP explainer.
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(record_scaled)

    # Draw the force plot onto the current matplotlib figure; the raw
    # (unscaled) feature values are shown for readability.
    shap.force_plot(
        base_value=explainer.expected_value[0],
        shap_values=shap_values[0],
        features=record,
        feature_names=loaded_model.feature_names,
        out_names=["Active", "Bankruptcy"],
        matplotlib=True,
        figsize=(30, 7),
        show=False,
    )

    # Grab the figure SHAP drew on, then close it so Gradio owns rendering.
    plt.tight_layout()
    fig = plt.gcf()
    plt.close()

    return prediction_dict, fig

# Handler for the "Multiple" interface: a batch of records from a Pickle file.
def classify_companies(file):
    """Classify every record of an uploaded Pickle file.

    Parameters
    ----------
    file : uploaded-file wrapper exposing the path as ``file.name``.

    Returns
    -------
    tuple(matplotlib.figure.Figure, dict)
        Histogram of the predicted bankruptcy probabilities, and the
        fraction of records falling into each 10% probability bucket.
    """
    # Read file as Pickle and keep only the model's features, in order.
    # NOTE(review): unpickling runs arbitrary code — the file must come
    # from a trusted source.
    input_dataset = pd.read_pickle(file.name)
    input_dataset = input_dataset[feature_names]

    # Remove unused index
    input_dataset.reset_index(drop=True, inplace=True)

    # Force every feature to be numeric and drop rows with invalid values.
    for column in input_dataset.columns:
        input_dataset[column] = pd.to_numeric(input_dataset[column], errors='coerce')
    input_dataset.dropna(inplace=True)

    # Scale with the same scaler used during the training phase.
    x = loaded_scaler.transform(input_dataset)

    # Probability of the positive class ("Bankruptcy") for every record.
    predictions = loaded_model.predict_proba(x)
    predictions_bankruptcy = predictions[:, 1]

    # Generate the histogram of the predicted probabilities.
    # The hist is labelled so that plt.legend() has a handle to show
    # (an unlabelled legend call warns and renders nothing).
    fig = plt.figure(figsize=(15, 7))
    plt.hist(predictions_bankruptcy, bins=50, label='Bankruptcy probability')
    plt.xlabel('Probability of bankruptcy', fontsize=25)
    plt.ylabel('Number of records', fontsize=25)
    plt.legend(fontsize=15)
    plt.tick_params(axis='both', labelsize=25, pad=5)

    # Fraction of records per 10% probability bucket. Buckets are
    # half-open [low, high) except the last, which is closed so that a
    # probability of exactly 1.0 is counted (the original 0.90 <= p < 1
    # test silently dropped such records).
    # Guard against an empty dataset (everything dropped by dropna).
    total = max(len(predictions_bankruptcy), 1)
    export_predictions_dict = {}
    for i in range(10):
        low, high = i / 10, (i + 1) / 10
        if i == 9:
            count = sum(low <= p <= high for p in predictions_bankruptcy)
        else:
            count = sum(low <= p < high for p in predictions_bankruptcy)
        export_predictions_dict[f"Bankruptcy probability {i * 10}-{(i + 1) * 10}%"] = float(count / total)

    # Return prediction plot and the bucket summary.
    return fig, export_predictions_dict
    
    
 
 # Single record interface
if type_interface == "Single":
    # Input components: one optional-looking numeric field per model feature.
    # NOTE(review): gr.inputs.* / gr.outputs.* belong to the legacy Gradio
    # 2.x/early-3.x API (removed in later releases) — confirm the pinned
    # gradio version before upgrading.
    gradio_inputs = []
    for feature_name in feature_names:
        gradio_inputs.append(gr.inputs.Number(default=0.0, label=feature_name, optional=False))

    # Output components: class-probability label plus the SHAP force plot.
    gradio_outputs = [gr.outputs.Label(num_top_classes = 2, label="Prediction probability"), gr.outputs.Plot(type="auto", label="SHAP values")]

    # Create the web app interface for single-record classification.
    demo = gr.Interface(
        fn=classify_company,
        inputs=gradio_inputs,
        outputs=gradio_outputs,
        theme="dark"
    )

# Multiple records interface
else:
    # Input file component. The description is an Italian user-facing
    # string ("the Pickle file must contain all fields expected by the
    # model") and is rendered verbatim in the UI.
    gradio_description = "Il file in formato Pickle deve contenere tutti i campi previsti dal modello."
    gradio_input = gr.inputs.File(file_count="single", type="file", label="Pickle file", optional=False)

    # Output components: probability histogram plot plus a label summary
    # (the "label" shorthand maps to a default Label component).
    gradio_output = [gr.outputs.Plot(type="auto", label="Prediction probabilities"), "label"]


    # Create the web app interface for batch classification.
    # live=True re-runs classification as soon as a file is provided.
    demo = gr.Interface(
        fn=classify_companies,
        inputs=gradio_input,
        outputs=gradio_output,
        description=gradio_description,
        theme="dark",
        live=True
    )

    
# Start the web server; show_error surfaces handler exceptions in the UI.
demo.launch(show_error=True, inline=False)