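"""Gradio demo comparing Random Forest feature importances computed with
Mean Decrease in Impurity (MDI) against permutation importances on the
Titanic dataset, following the scikit-learn permutation-importance example."""
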
import numpy as np
import gradio as gr
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.datasets import fetch_openml
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import utils  # local helper module with the plotting functions used below


def app_fn(seed: int, n_cat: int, n_estimators: int, min_samples_leaf: int):
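    """Fit a Random Forest on the Titanic data augmented with two random
    features, then return the MDI importance plot and the permutation
    importance plots on the train and test sets."""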
    X, y = fetch_openml(
        "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
    )
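
    # Add two uninformative random features: a categorical one with `n_cat`
    # levels and a numerical one drawn from a standard normal distribution.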
    rng = np.random.RandomState(seed=seed)
    X["random_cat"] = rng.randint(n_cat, size=X.shape[0])
    X["random_num"] = rng.randn(X.shape[0])

    categorical_columns = ["pclass", "sex", "embarked", "random_cat"]
    numerical_columns = ["age", "sibsp", "parch", "fare", "random_num"]
    X = X[categorical_columns + numerical_columns]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=seed
    )
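    # Ordinal-encode the categorical columns (unknown/missing -> -1) and
    # mean-impute missing values in the numerical columns.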
    categorical_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1, encoded_missing_value=-1
    )
    numerical_pipe = SimpleImputer(strategy="mean")
    preprocessing = ColumnTransformer(
        [
            ("cat", categorical_encoder, categorical_columns),
            ("num", numerical_pipe, numerical_columns),
        ],
        verbose_feature_names_out=False,
    )
    clf = Pipeline(
        [
            ("preprocess", preprocessing),
            (
                "classifier",
                RandomForestClassifier(
                    random_state=seed,
                    n_estimators=n_estimators,
                    min_samples_leaf=min_samples_leaf,
                ),
            ),
        ]
    )
    clf.fit(X_train, y_train)
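
    # MDI importances come from the fitted forest itself; permutation
    # importances are computed on both splits so train/test differences show.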
    fig_mdi = utils.plot_rf_importance(clf)
    fig_perm_train = utils.plot_permutation_boxplot(
        clf, X_train, y_train, set_="train set"
    )
    fig_perm_test = utils.plot_permutation_boxplot(
        clf, X_test, y_test, set_="test set"
    )

    return fig_mdi, fig_perm_train, fig_perm_test
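
# NOTE: `utils` is a project-local module not shown in this file. As a rough
# sketch only (an assumption, not the actual implementation), its
# `plot_permutation_boxplot` helper could be built on
# `sklearn.inspection.permutation_importance` along these lines:
#
#     from sklearn.inspection import permutation_importance
#     import matplotlib.pyplot as plt
#
#     def plot_permutation_boxplot(clf, X, y, set_="test set"):
#         # Shuffle each feature column repeatedly and record the score drop.
#         result = permutation_importance(clf, X, y, n_repeats=10, random_state=42)
#         order = result.importances_mean.argsort()
#         fig, ax = plt.subplots()
#         ax.boxplot(result.importances[order].T, vert=False, labels=X.columns[order])
#         ax.set_title(f"Permutation Importances ({set_})")
#         fig.tight_layout()
#         return fig
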
title = "Permutation Importance vs Random Forest Feature Importance (MDI)"
with gr.Blocks(title=title) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        """
This demo compares the feature importances of a Random Forest classifier
computed with the Mean Decrease in Impurity (MDI) method and with the
Permutation Importance method. To showcase the difference between the two
methods, two random features are added to the Titanic dataset: a categorical
one, whose number of categories can be adjusted, and a numerical one sampled
from a standard normal distribution. The Random Forest hyperparameters can
also be changed to explore how model complexity affects the importances.
See the original scikit-learn example
[here](https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance.html#sphx-glr-auto-examples-inspection-plot-permutation-importance-py).
"""
    )
    with gr.Row():
        seed = gr.Slider(0, 42, step=1, value=42, label="Seed")
        n_cat = gr.Slider(2, 30, step=1, value=3, label="Number of categories in random_cat")
        n_estimators = gr.Slider(5, 150, step=5, value=100, label="Number of Trees")
        min_samples_leaf = gr.Slider(
            1, 30, step=5, value=1, label="Minimum number of samples to create a leaf"
        )
    fig_mdi = gr.Plot(label="Mean Decrease in Impurity (MDI)")
    with gr.Row():
        fig_perm_train = gr.Plot(label="Permutation Importance (Train)")
        fig_perm_test = gr.Plot(label="Permutation Importance (Test)")
    inputs = [seed, n_cat, n_estimators, min_samples_leaf]
    outputs = [fig_mdi, fig_perm_train, fig_perm_test]

    # Recompute all three plots whenever any control changes, and once on load.
    for component in inputs:
        component.change(fn=app_fn, inputs=inputs, outputs=outputs)
    demo.load(fn=app_fn, inputs=inputs, outputs=outputs)
demo.launch()