# Gradio app: predicts company bankruptcy risk with a pre-trained
# random-forest model, either for a single hand-entered record or for
# a batch of records uploaded as a Pickle file.
import gradio as gr
import pandas as pd
import joblib
import numpy as np
import shap
import matplotlib.pyplot as plt
# Path of the desired scaler to be tested, can be changed
PATH_SCALER = "standard_scaler_1.sav"
# Path of the desired model to be tested, can be changed
PATH_MODEL = "random_forest_1.sav"
# Select the type of interface, "Single" for manually inserting a record
# "Multiple" for automatically inserting multiple records from a pickle file
type_interface = "Single"
# Deserialize the fitted scaler and classifier saved at training time.
# NOTE(review): joblib.load executes pickle data — only load trusted files.
loaded_scaler = joblib.load(PATH_SCALER)
loaded_model = joblib.load(PATH_MODEL)
# Feature names attached to the model object at training time; used both to
# build the form fields and to select columns from uploaded files.
feature_names = loaded_model.feature_names
# Define what to do with data given from the user form
def classify_company(*feature_values):
    """Classify a single company record entered through the web form.

    feature_values: one numeric value per model feature, in the order of
    ``loaded_model.feature_names``.
    Returns a (class-probability dict, matplotlib figure) pair for Gradio.
    """
    # Wrap the form values into a one-row frame with the training columns.
    record = pd.DataFrame([feature_values], columns=loaded_model.feature_names, dtype=float)
    # Apply the exact scaling that was used during the training phase.
    record_scaled = loaded_scaler.transform(record)
    # Class probabilities for the single row.
    proba = loaded_model.predict_proba(record_scaled)[0]
    labelled_proba = {"Active": float(proba[0]), "Bankruptcy": float(proba[1])}
    # Explain the prediction with SHAP values for the tree model.
    tree_explainer = shap.TreeExplainer(loaded_model)
    record_shap = tree_explainer.shap_values(record_scaled)
    # Render the force plot onto the current matplotlib figure (deferred show);
    # note the plot displays the *unscaled* feature values alongside the
    # SHAP values computed on the scaled input.
    shap.force_plot(
        base_value=tree_explainer.expected_value[0],
        shap_values=record_shap[0],
        features=record,
        feature_names=loaded_model.feature_names,
        out_names=["Active", "Bankruptcy"],
        matplotlib=True,
        figsize=(30,7),
        show=False,
    )
    # Grab the figure SHAP drew on, then close it so state does not leak
    # into the next request.
    plt.tight_layout()
    figure = plt.gcf()
    plt.close()
    return labelled_proba, figure
# Define what to do with data given from the Pickle file
def classify_companies(file):
    """Predict bankruptcy probabilities for every record in a Pickle file.

    file: uploaded file object whose ``.name`` points at a pickled
    DataFrame containing (at least) every model feature as a column.
    Returns a (histogram figure, bucket-fraction dict) pair for Gradio.
    Raises ValueError when no valid records remain after cleaning.
    """
    # Read file as Pickle and keep only the columns the model was trained on.
    input_dataset = pd.read_pickle(file.name)
    input_dataset = input_dataset[feature_names]
    # Remove unused index
    input_dataset.reset_index(drop=True, inplace=True)
    # Force all the features to be numeric and drop rows with invalid values.
    for column in input_dataset.columns:
        input_dataset[column] = pd.to_numeric(input_dataset[column], errors='coerce')
    input_dataset.dropna(inplace=True)
    if input_dataset.empty:
        # Guard: without this, the bucket fractions below divide by zero.
        raise ValueError("No valid records found in the uploaded file.")
    # Scale it accordingly with the used scaler during the training phase.
    x = loaded_scaler.transform(input_dataset)
    # Probability of the positive ("Bankruptcy") class for each record.
    predictions_bankruptcy = loaded_model.predict_proba(x)[:, 1]
    # Histogram of the predicted bankruptcy probabilities.
    # (The original plt.legend() call was removed: no artist has a label,
    # so it only emitted a "no handles" warning.)
    fig = plt.figure(figsize=(15,7))
    plt.hist(predictions_bankruptcy, bins=50)
    plt.xlabel('Probability of bankruptcy', fontsize=25)
    plt.ylabel('Number of records', fontsize=25)
    plt.tick_params(axis='both', labelsize=25, pad=5)
    export_predictions_dict = _bucket_fractions(predictions_bankruptcy)
    # Return prediction plot and per-bucket fractions
    return fig, export_predictions_dict


def _bucket_fractions(probabilities):
    """Fraction of *probabilities* falling in each 10%-wide bucket.

    The last bucket is closed on the right so a probability of exactly
    1.0 is counted (the original open upper bound silently dropped it).
    Keys match the original hand-written labels byte-for-byte.
    """
    total = len(probabilities)
    fractions = {}
    for decile in range(10):
        low = decile / 10
        high = (decile + 1) / 10
        if decile == 9:
            # Closed on the right: include p == 1.0 in the final bucket.
            count = sum(low <= p <= 1.0 for p in probabilities)
        else:
            count = sum(low <= p < high for p in probabilities)
        label = f"Bankruptcy probability {decile * 10}-{decile * 10 + 10}%"
        fractions[label] = float(count / total)
    return fractions
# Single record interface
if type_interface == "Single":
    # Input file components
    # For each feature create an optional numeric field
    # NOTE(review): gr.inputs.* / gr.outputs.* is the legacy Gradio 2.x API;
    # it was removed in Gradio 4 — presumably this Space pins an old version.
    gradio_inputs = []
    for feature_name in feature_names:
        gradio_inputs.append(gr.inputs.Number(default=0.0, label=feature_name, optional=False))
    # Create output components: class probabilities plus the SHAP force plot.
    gradio_outputs = [gr.outputs.Label(num_top_classes = 2, label="Prediction probability"), gr.outputs.Plot(type="auto", label="SHAP values")]
    # Create the web app interface
    demo = gr.Interface(
        fn=classify_company,
        inputs=gradio_inputs,
        outputs=gradio_outputs,
        theme="dark"
    )
# Multiple records interface
else:
    # Input file component
    # (Italian: "The Pickle file must contain every field expected by the model.")
    gradio_description = "Il file in formato Pickle deve contenere tutti i campi previsti dal modello."
    gradio_input = gr.inputs.File(file_count="single", type="file", label="Pickle file", optional=False)
    # Output file component: probability histogram plus per-bucket fractions.
    gradio_output = [gr.outputs.Plot(type="auto", label="Prediction probabilities"), "label"]
    # Create the web app interface (live=True re-runs on every input change).
    demo = gr.Interface(
        fn=classify_companies,
        inputs=gradio_input,
        outputs=gradio_output,
        description=gradio_description,
        theme="dark",
        live=True
    )
# Start the web server; show_error surfaces Python exceptions in the UI.
demo.launch(show_error=True, inline=False)