# vumichien's picture  (residue from the page this file was copied from; not part of the program)
import gradio as gr
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
# NOTE(review): in this copy of the file the call ended at the opening
# parenthesis, so its original keyword arguments are unknown. The theme is
# built with its defaults here; restore the intended arguments if available.
theme = gr.themes.Monochrome()

# Markdown text describing the demo. It is a plain string (no ``f`` prefix)
# because it contains no placeholders.
model_card = """
## Description
**Univariate feature selection** can be used to improve classification accuracy on a noisy dataset.
In **univariate feature selection**, each feature is evaluated independently, and a statistical test is used to determine its strength of association with the target variable.
The most important features are then selected based on their statistical significance, typically using a threshold p-value or a pre-defined number of top features to select.
In this demo, some noisy (non informative) features are added to the iris dataset then use **Support vector machine (SVM)** to classify the Iris dataset both before and after applying univariate feature selection.
The results of the feature selection are presented through p-values and weights of SVMs, which are plotted for comparison.
The objective of this demo is to evaluate the accuracy of the models and assess the impact of univariate feature selection on the model weights.
You can play around with different ``number of top features`` and ``random seed``.
## Dataset
Iris dataset
"""

# The iris dataset
X, y = load_iris(return_X_y=True)

# Some noisy data not correlated: 20 extra columns drawn uniformly from
# [0, 0.1) with a fixed seed, so the noise is identical on every start-up.
E = np.random.RandomState(42).uniform(0, 0.1, size=(X.shape[0], 20))

# Add the noisy data to the informative features (noise columns come last)
X = np.hstack((X, E))
def do_train(k_features, random_state):
    """Train SVMs with and without univariate feature selection and plot both.

    The module-level ``X`` / ``y`` arrays are split into stratified train and
    test parts. A ``SelectKBest(f_classif)`` selector and two ``LinearSVC``
    pipelines (one on all features, one on the selected features) are fitted
    on the training part.

    Args:
        k_features: Number of top-scoring features kept by ``SelectKBest``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        A tuple ``(fig1, fig2, text)`` where ``fig1`` is the bar chart of the
        normalized univariate scores, ``fig2`` compares those scores with the
        normalized SVM weights before and after selection, and ``text``
        reports the test accuracy of both classifiers.
    """
    # NOTE(review): UI sliders may hand over floats; the scikit-learn
    # parameters used below expect integers, so coerce defensively.
    k_features = int(k_features)
    random_state = int(random_state)

    # Split dataset to select feature and evaluate the classifier
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=random_state
    )

    # Univariate scores: -log10(p-value), scaled so the best feature is 1.
    selector = SelectKBest(f_classif, k=k_features)
    selector.fit(X_train, y_train)
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()

    X_indices = np.arange(X.shape[-1])

    fig1, axes1 = plt.subplots()
    axes1.bar(X_indices - 0.05, scores, width=0.2)
    axes1.set_title("Feature univariate score")
    axes1.set_xlabel("Feature number")
    axes1.set_ylabel(r"Univariate score ($-Log(p_{value})$)")

    # Baseline: SVM on every feature (informative + noise).
    clf = make_pipeline(MinMaxScaler(), LinearSVC())
    clf.fit(X_train, y_train)
    svm_weights = np.abs(clf[-1].coef_).sum(axis=0)
    svm_weights /= svm_weights.sum()

    # Same SVM, but fed only the k best features.
    clf_selected = make_pipeline(
        SelectKBest(f_classif, k=k_features), MinMaxScaler(), LinearSVC()
    )
    clf_selected.fit(X_train, y_train)
    svm_weights_selected = np.abs(clf_selected[-1].coef_).sum(axis=0)
    svm_weights_selected /= svm_weights_selected.sum()

    fig2, axes2 = plt.subplots()
    axes2.bar(
        X_indices - 0.45, scores, width=0.2, label=r"Univariate score ($-Log(p_{value})$)"
    )
    axes2.bar(X_indices - 0.25, svm_weights, width=0.2, label="SVM weight")
    # The selected-feature weights only exist for the kept columns, so they
    # are drawn at the original indices of those columns.
    axes2.bar(
        X_indices[selector.get_support()] - 0.05,
        svm_weights_selected,
        width=0.2,
        label="SVM weights after selection",
    )
    axes2.set_title("Comparing feature selection")
    axes2.set_xlabel("Feature number")
    axes2.legend(loc="upper right")

    text = f"Classification accuracy without selecting features: {clf.score(X_test, y_test)*100:.2f}%. Classification accuracy after univariate feature selection: {clf_selected.score(X_test, y_test)*100:.2f}%"

    # pyplot keeps a reference to every figure it creates until it is closed;
    # without this, figures pile up each time the callback runs. The Figure
    # objects themselves stay valid for the caller after being closed.
    plt.close(fig1)
    plt.close(fig2)

    return fig1, fig2, text
# Gradio UI: a title, the description, two sliders, two plots and a results box.
with gr.Blocks(theme=theme) as demo:
    # The heading is raw HTML, so it has to be passed through a Markdown
    # component rather than sit in the file as a bare line.
    gr.Markdown("<h1 style='text-align: center'>Univariate Feature Selection</h1>")
    # NOTE(review): model_card was defined but never displayed in this copy of
    # the file; rendering it here is the presumed intent - confirm.
    gr.Markdown(model_card)
    gr.Markdown("Author: <a href=\"\">Vu Minh Chien</a>. Based on the example from <a href=\"\">scikit-learn</a>")

    k_features = gr.Slider(minimum=2, maximum=10, step=1, value=2, label="Number of top features to select")
    random_state = gr.Slider(minimum=0, maximum=2000, step=1, value=0, label="Random seed")

    with gr.Row():
        with gr.Column():
            plot_1 = gr.Plot(label="Univariate score")
        with gr.Column():
            plot_2 = gr.Plot(label="Comparing feature selection")
    with gr.Row():
        # The spelling "resutls" is kept because code outside this view may
        # refer to the name.
        resutls = gr.Textbox(label="Results")

    # Either slider re-runs the training with the current value of both.
    for slider in (k_features, random_state):
        slider.change(
            fn=do_train,
            inputs=[k_features, random_state],
            outputs=[plot_1, plot_2, resutls],
        )