import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression


# Data preparation
def make_data():
    # Two correlated features drawn from an anisotropic Gaussian: most of the variance
    # lies along one direction, much less along the other.
    rng = np.random.RandomState(0)
    n_samples = 500
    cov = [[3, 3], [3, 4]]
    X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
    return X, rng, n_samples


def plot_scatter_pca(alpha):
    plt.figure()  # start a fresh figure so repeated slider updates do not stack on top of each other
    plt.scatter(X[:, 0], X[:, 1], alpha=alpha, label="samples")
    for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
        comp = comp * var  # scale component by its variance explanation power
        plt.plot(
            [0, comp[0]],
            [0, comp[1]],
            label=f"Component {i}",
            linewidth=5,
            color=f"C{i + 2}",
        )
    plt.gca().set(
        aspect="equal",
        title="2-dimensional dataset with principal components",
        xlabel="first feature",
        ylabel="second feature",
    )
    plt.legend()
    return plt


def datagen_y():
    # The target mostly depends on the second (low-variance) principal component, plus noise.
    y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2
    return y


def data_projections():
    # Uses the module-level y so every tab works with the same target values.
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3)
    axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y")
    axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3)
    axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y")
    plt.tight_layout()
    return plt


def plot_pca_ls():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pcr.fit(X_train, y_train)
    pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(pca.transform(X_test), y_test, alpha=0.3, label="ground truth")
    axes[0].scatter(
        pca.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions"
    )
    axes[0].set(
        xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
    )
    axes[0].legend()
    axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth")
    axes[1].scatter(
        pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions"
    )
    axes[1].set(
        xlabel="Projected data onto first PLS component", ylabel="y", title="PLS"
    )
    axes[1].legend()
    plt.tight_layout()
    return plt


def get_components():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pls = PLSRegression(n_components=1)
    return X_train, X_test, y_train, y_test, pcr, pls


def print_results():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pcr.fit(X_train, y_train)
    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)
    result1 = f"PCR r-squared {pcr.score(X_test, y_test):.3f}"
    result2 = f"PLS r-squared {pls.score(X_test, y_test):.3f}"
    return result1 + "\n" + result2


def calc_pcr_r2():
    X_train, X_test, y_train, y_test, pcr, pls = get_components()
    pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
    pca_2.fit(X_train, y_train)
    r2 = f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}"
    return r2


X, rng, n_samples = make_data()
pca = PCA(n_components=2).fit(X)
y = datagen_y()

title = "Principal Component Regression vs Partial Least Squares Regression"
with gr.Blocks(title=title, theme=gr.themes.Default(font=[gr.themes.GoogleFont("Inconsolata"), "Arial", "sans-serif"])) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        """
This example compares Principal Component Regression (PCR) and Partial Least Squares Regression (PLS) on a toy dataset.
Our goal is to illustrate how PLS can outperform PCR when the target is strongly correlated with some directions in the
data that have a low variance.

PCR is a regressor composed of two steps: first, PCA is applied to the training data, possibly performing dimensionality
reduction; then, a regressor (e.g. a linear regressor) is trained on the transformed samples. In PCA, the transformation
is purely unsupervised, meaning that no information about the targets is used. As a result, PCR may perform poorly on
datasets where the target is strongly correlated with directions that have low variance. Indeed, the dimensionality
reduction of PCA projects the data into a lower-dimensional space where the variance of the projected data is greedily
maximized along each axis. The directions with lower variance will be dropped even though they may be the most
predictive of the target, so the final regressor will not be able to leverage them.

PLS is both a transformer and a regressor, and it is quite similar to PCR: it also applies a dimensionality reduction to
the samples before applying a linear regressor to the transformed data. The main difference with PCR is that the PLS
transformation is supervised. Therefore, as we will see in this example, it does not suffer from the issue we just
mentioned.
""")
gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/cross_decomposition/plot_pcr_vs_pls.html#sphx-glr-auto-examples-cross-decomposition-plot-pcr-vs-pls-py).") | |
with gr.Tab("Visualize Input dataset"): | |
with gr.Row(equal_height=True): | |
slider1 = gr.Slider(label="alpha", minimum=0.0, maximum=1.0) | |
slider1.change(plot_scatter_pca, slider1, outputs= gr.Plot(label='Visualizing input dataset') ) | |
with gr.Tab("PCA data projections"): | |
btn_decision = gr.Button(value="PCA data projections") | |
btn_decision.click(data_projections, outputs= gr.Plot(label='PCA data projections') ) | |
with gr.Tab("predictive power"): | |
btn_power = gr.Button(value="Predictive power") | |
btn_power.click(plot_pca_ls, outputs= gr.Plot(label='Predictive power') ) | |
with gr.Tab("Results tab"): | |
gr.Markdown( | |
""" | |
As a final remark, | |
we note that PCR with 2 components performs as well as PLS: this is because in this case, | |
PCR was able to leverage the second component which has the most preditive power on the target. | |
""") | |
btn_power = gr.Button(value="Results") | |
out = gr.Textbox(label="r2 score of both estimators") | |
btn_power.click(print_results, outputs= out ) | |
with gr.Tab("r2_score of predictors comparison"): | |
with gr.Row(equal_height=True): | |
gr.Markdown( | |
""" | |
We also print the R-squared scores of both estimators, which further confirms that PLS is a better alternative than PCR in this case. | |
A negative R-squared indicates that PCR performs worse than a regressor that would simply predict the mean of the target. | |
""") | |
        btn_1 = gr.Button(value="r2_score of predictors")
        out1 = gr.Textbox(label="r2_score of predictors")
        btn_1.click(calc_pcr_r2, outputs=out1)
gr.Markdown( f"## End of page") | |
demo.launch() |