NTaylor committed on
Commit
e1527f1
1 Parent(s): 057098a

Initial commit - uploading app and requirements

Files changed (2)
  1. app.py +367 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,367 @@
+ from sklearn.pipeline import make_pipeline
+ from sklearn.preprocessing import PolynomialFeatures, StandardScaler
+ import numpy as np
+ from sklearn.datasets import make_regression
+ import pandas as pd
+ from sklearn.linear_model import ARDRegression, LinearRegression, BayesianRidge
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ from matplotlib.colors import SymLogNorm
+ import gradio as gr
+
+
+ # def make_regression_data(n_samples=100,
+ #                          n_features=100,
+ #                          n_informative=10,
+ #                          noise=8,
+ #                          coef=True,
+ #                          random_state=42,):
+ #     X, y, true_weights = make_regression(
+ #         n_samples=n_samples,
+ #         n_features=n_features,
+ #         n_informative=n_informative,
+ #         noise=noise,
+ #         coef=coef,
+ #         random_state=random_state,
+ #     )
+ #     return X, y, true_weights
+
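+ # The regression problem below deliberately uses n_samples == n_features == 100
+ # with only 10 informative features, which makes plain OLS prone to very large
+ # weights (see the app description further down).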
+ X, y, true_weights = make_regression(
+     n_samples=100,
+     n_features=100,
+     n_informative=10,
+     noise=8,
+     coef=True,
+     random_state=42,
+ )
+
+ # Fit the regressors
+ # ------------------
+ #
+ # We now fit both Bayesian models and the OLS to later compare the models'
+ # coefficients.
+
+ # olr = LinearRegression().fit(X, y)
+ # brr = BayesianRidge(compute_score=True, n_iter=30).fit(X, y)
+ # ard = ARDRegression(compute_score=True, n_iter=30).fit(X, y)
+ # df = pd.DataFrame(
+ #     {
+ #         "Weights of true generative process": true_weights,
+ #         "ARDRegression": ard.coef_,
+ #         "BayesianRidge": brr.coef_,
+ #         "LinearRegression": olr.coef_,
+ #     }
+ # )
+
+ def fit_regression_models(n_iter=30, X=X, y=y, true_weights=true_weights):
+     # Fit OLS and the two Bayesian regressors, then collect their coefficients
+     # alongside the true generative weights for plotting.
+     olr = LinearRegression().fit(X, y)
+     brr = BayesianRidge(compute_score=True, n_iter=n_iter).fit(X, y)
+     ard = ARDRegression(compute_score=True, n_iter=n_iter).fit(X, y)
+     df = pd.DataFrame(
+         {
+             "Weights of true generative process": true_weights,
+             "ARDRegression": ard.coef_,
+             "BayesianRidge": brr.coef_,
+             "LinearRegression": olr.coef_,
+         }
+     )
+     return df, olr, brr, ard
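+
+ # Note: these estimators take `n_iter` in the scikit-learn version pinned in
+ # requirements.txt (1.2.2); later releases rename this argument to `max_iter`.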
+
+
+ # %%
+ # Plot the true and estimated coefficients
+ # ----------------------------------------
+ #
+ # Now we compare the coefficients of each model with the weights of
+ # the true generative model.
+
+ # plt.figure(figsize=(10, 6))
+ # ax = sns.heatmap(
+ #     df.T,
+ #     norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),
+ #     cbar_kws={"label": "coefficients' values"},
+ #     cmap="seismic_r",
+ # )
+ # plt.ylabel("linear model")
+ # plt.xlabel("coefficients")
+ # plt.tight_layout(rect=(0, 0, 1, 0.95))
+ # _ = plt.title("Models' coefficients")
+
+ def visualize_coefficients(df=None):
+     fig = plt.figure(figsize=(10, 6))
+     ax = sns.heatmap(
+         df.T,
+         norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),
+         cbar_kws={"label": "coefficients' values"},
+         cmap="seismic_r",
+     )
+     plt.ylabel("linear model")
+     plt.xlabel("coefficients")
+     plt.tight_layout(rect=(0, 0, 1, 0.95))
+     _ = plt.title("Models' coefficients")
+
+     return fig
+
+ # %%
+ # Due to the added noise, none of the models recover the true weights. Indeed,
+ # all models always have more than 10 non-zero coefficients. Compared to the OLS
+ # estimator, the coefficients using a Bayesian Ridge regression are slightly
+ # shifted toward zero, which stabilises them. The ARD regression provides a
+ # sparser solution: some of the non-informative coefficients are set exactly to
+ # zero, while shifting others closer to zero. Some non-informative coefficients
+ # are still present and retain large values.
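+
+ # Illustrative helper (not wired into the UI): one way to make the sparsity
+ # comparison above concrete is to count, per model, how many coefficients in
+ # the DataFrame returned by fit_regression_models exceed a small tolerance.
+ # The 1e-6 tolerance is an arbitrary choice for this sketch.
+ def count_nonzero_coefficients(coef_df, tol=1e-6):
+     return (coef_df.abs() > tol).sum(axis=0)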
+
+ # %%
+ # Plot the marginal log-likelihood
+ # --------------------------------
+
+ # ard_scores = -np.array(ard.scores_)
+ # brr_scores = -np.array(brr.scores_)
+ # plt.plot(ard_scores, color="navy", label="ARD")
+ # plt.plot(brr_scores, color="red", label="BayesianRidge")
+ # plt.ylabel("Log-likelihood")
+ # plt.xlabel("Iterations")
+ # plt.xlim(1, 30)
+ # plt.legend()
+ # _ = plt.title("Models log-likelihood")
+
+ def plot_marginal_log_likelihood(ard=None, brr=None, n_iter=30):
+
+     fig = plt.figure(figsize=(10, 6))
+     ard_scores = -np.array(ard.scores_)
+     brr_scores = -np.array(brr.scores_)
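+     # `scores_` records the value of the log marginal likelihood objective at
+     # each fitting iteration when `compute_score=True`; the sign is flipped so
+     # the curves below read as a quantity the models drive down, matching the
+     # description in the app's tabs.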
+     plt.plot(ard_scores, color="navy", label="ARD")
+     plt.plot(brr_scores, color="red", label="BayesianRidge")
+     plt.ylabel("Log-likelihood")
+     plt.xlabel("Iterations")
+     plt.xlim(1, n_iter)
+     plt.legend()
+     _ = plt.title("Models log-likelihood")
+
+     return fig
+
+ def make_regression_comparison_plot(n_iter=30):
+
+     # fit models
+     df, olr, brr, ard = fit_regression_models(n_iter=n_iter, X=X, y=y, true_weights=true_weights)
+     # get figure
+     fig = visualize_coefficients(df=df)
+
+     return fig
+
+ def make_log_likelihood_plot(n_iter=30):
+
+     # fit models
+     df, olr, brr, ard = fit_regression_models(n_iter=n_iter, X=X, y=y, true_weights=true_weights)
+     # get figure
+     fig = plot_marginal_log_likelihood(ard=ard, brr=brr, n_iter=n_iter)
+
+     return fig
+
+ # visualize coefficients
+
+ # # %%
+ # # Indeed, both models minimize the log-likelihood up to an arbitrary cutoff
+ # # defined by the `n_iter` parameter.
+ # #
+ # # Bayesian regressions with polynomial feature expansion
+ # # ======================================================
+ # Generate synthetic dataset
+ # --------------------------
+ # We create a target that is a non-linear function of the input feature.
+ # Gaussian noise is added.
+
+
+ rng = np.random.RandomState(0)
+ n_samples = 110
+
+ # sort the data to make plotting easier later
+ g_X = np.sort(-10 * rng.rand(n_samples) + 10)
+ noise = rng.normal(0, 1, n_samples) * 1.35
+ g_y = np.sqrt(g_X) * np.sin(g_X) + noise
+ full_data = pd.DataFrame({"input_feature": g_X, "target": g_y})
+ g_X = g_X.reshape((-1, 1))
+
+ # extrapolation: extend the evaluation grid to 10-10.4, just beyond the
+ # training range, and keep the noise-free targets for plotting
+ X_plot = np.linspace(10, 10.4, 10)
+ y_plot = np.sqrt(X_plot) * np.sin(X_plot)
+ X_plot = np.concatenate((g_X, X_plot.reshape((-1, 1))))
+ y_plot = np.concatenate((g_y - noise, y_plot))
+
+ # %%
+ # Fit the regressors
+ # ------------------
+ #
+ # Here we try a degree 10 polynomial to potentially overfit, though the bayesian
+ # linear models regularize the size of the polynomial coefficients. As
+ # `fit_intercept=True` by default for
+ # :class:`~sklearn.linear_model.ARDRegression` and
+ # :class:`~sklearn.linear_model.BayesianRidge`, then
+ # :class:`~sklearn.preprocessing.PolynomialFeatures` should not introduce an
+ # additional bias feature. By setting `return_std=True`, the bayesian regressors
+ # return the standard deviation of the posterior distribution for the model
+ # parameters.
+
+ # TODO: make this function adaptable via the gr.Slider
+
+ def generate_polynomial_dataset(degree=10):
+     # Fit ARD and Bayesian Ridge pipelines with polynomial features of the
+     # requested degree, and return their predictions (with posterior std)
+     # over the plotting grid.
+     ard_poly = make_pipeline(
+         PolynomialFeatures(degree=degree, include_bias=False),
+         StandardScaler(),
+         ARDRegression(),
+     ).fit(g_X, g_y)
+     brr_poly = make_pipeline(
+         PolynomialFeatures(degree=degree, include_bias=False),
+         StandardScaler(),
+         BayesianRidge(),
+     ).fit(g_X, g_y)
+
+     y_ard, y_ard_std = ard_poly.predict(X_plot, return_std=True)
+     y_brr, y_brr_std = brr_poly.predict(X_plot, return_std=True)
+
+     return y_ard, y_ard_std, y_brr, y_brr_std
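+
+ # Illustrative sketch (not used by the UI): a simple numeric companion to the
+ # plot below is the mean absolute error of each model against the noise-free
+ # target on the plotting grid. Both the helper name and the error metric are
+ # just one possible choice.
+ def polynomial_fit_errors(degree=10):
+     y_ard, _, y_brr, _ = generate_polynomial_dataset(degree)
+     return {
+         "ARDRegression": float(np.mean(np.abs(y_ard - y_plot))),
+         "BayesianRidge": float(np.mean(np.abs(y_brr - y_plot))),
+     }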
+
+ # %%
+ # Plotting polynomial regressions with std errors of the scores
+ # -------------------------------------------------------------
+
+ # ax = sns.scatterplot(
+ #     data=full_data, x="input_feature", y="target", color="black", alpha=0.75
+ # )
+ # ax.plot(X_plot, y_plot, color="black", label="Ground Truth")
+ # ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")
+ # ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")
+ # ax.fill_between(
+ #     X_plot.ravel(),
+ #     y_ard - y_ard_std,
+ #     y_ard + y_ard_std,
+ #     color="navy",
+ #     alpha=0.3,
+ # )
+ # ax.fill_between(
+ #     X_plot.ravel(),
+ #     y_brr - y_brr_std,
+ #     y_brr + y_brr_std,
+ #     color="red",
+ #     alpha=0.3,
+ # )
+ # ax.legend()
+ # _ = ax.set_title("Polynomial fit of a non-linear feature")
+
+
+ def visualize_bayes_regressions_polynomial_features(degree=10):
+
+     # TODO: get data dynamically from the gr.Slider
+     y_ard, y_ard_std, y_brr, y_brr_std = generate_polynomial_dataset(degree)
+
+     fig = plt.figure(figsize=(10, 6))
+     ax = sns.scatterplot(
+         data=full_data, x="input_feature", y="target", color="black", alpha=0.75)
+     ax.plot(X_plot, y_plot, color="black", label="Ground Truth")
+     ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")
+     ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")
+     ax.fill_between(
+         X_plot.ravel(),
+         y_ard - y_ard_std,
+         y_ard + y_ard_std,
+         color="navy",
+         alpha=0.3,
+     )
+     ax.fill_between(
+         X_plot.ravel(),
+         y_brr - y_brr_std,
+         y_brr + y_brr_std,
+         color="red",
+         alpha=0.3,
+     )
+     ax.legend()
+     _ = ax.set_title("Polynomial fit of a non-linear feature")
+     return fig
+
+
+ # def make_polynomial_comparison_plot():
+
+ #     return fig
+
+
+ title = "Illustration of Comparing Linear Bayesian Regressors with synthetic data"
+ with gr.Blocks(title=title) as demo:
+     gr.Markdown(f"# {title}")
+     gr.Markdown("""This example shows a comparison of two different Bayesian regressors:
+ Automatic Relevance Determination (ARD) - see the [sklearn docs](https://scikit-learn.org/stable/modules/linear_model.html#automatic-relevance-determination)
+ Bayesian Ridge Regression - see the [sklearn docs](https://scikit-learn.org/stable/modules/linear_model.html#bayesian-ridge-regression)
+ The tutorial is split into sections. The first compares the model coefficients produced by Ordinary Least Squares (OLS), Bayesian Ridge Regression, and ARD with the known true coefficients.
+ For this we generate a dataset where X and y are linearly linked: 10 of the features of X are used to generate y, while the other features are not useful for predicting y.
+ In addition, we generate a dataset where n_samples == n_features. Such a setting is challenging for an OLS model and can lead to arbitrarily large weights.
+ Having a prior on the weights and a penalty alleviates the problem. Finally, Gaussian noise is added.
+
+ For the final tab, we investigate Bayesian regressors with polynomial features and generate an additional dataset where the target is a non-linear function of the input feature, with added Gaussian noise.
+
+ For further details please see the sklearn docs:
+ """)
+
+     gr.Markdown(" **[Demo is based on sklearn docs found here](https://scikit-learn.org/stable/auto_examples/linear_model/plot_ard.html#sphx-glr-auto-examples-linear-model-plot-ard-py)** <br>")
+
+     with gr.Tab("# Plot true and estimated coefficients"):
+
+         with gr.Row():
+             n_iter = gr.Slider(value=5, minimum=5, maximum=50, step=1, label="n_iterations")
+             btn = gr.Button(value="Plot true and estimated coefficients")
+         btn.click(make_regression_comparison_plot, inputs=[n_iter], outputs=gr.Plot(label='Plot true and estimated coefficients'))
+         gr.Markdown(
+             """
+ # Details
+
+ One can observe that with the added noise, none of the models can perfectly recover the coefficients of the original model. All models have more than 10 non-zero coefficients,
+ where only 10 are useful. The Bayesian Ridge Regression manages to recover most of the coefficients, while the ARD is more conservative.
+ """)
+     with gr.Tab("# Plot marginal log likelihoods"):
+         with gr.Row():
+             n_iter = gr.Slider(value=5, minimum=5, maximum=50, step=1, label="n_iterations")
+             btn = gr.Button(value="Plot marginal log likelihoods")
+         btn.click(make_log_likelihood_plot, inputs=[n_iter], outputs=gr.Plot(label='Plot marginal log likelihoods'))
+         gr.Markdown(
+             """
+ # Confirm with marginal log likelihoods
+ Both ARD and Bayesian Ridge minimize the log-likelihood up to an arbitrary cutoff defined by the n_iter parameter.
+ """
+         )
+     with gr.Tab("# Plot bayesian regression with polynomial features"):
+         with gr.Row():
+             degree = gr.Slider(value=5, minimum=5, maximum=50, step=1, label="n_degrees")
+             btn = gr.Button(value="Plot bayesian regression with polynomial features")
+         btn.click(visualize_bayes_regressions_polynomial_features, inputs=[degree], outputs=gr.Plot(label='Plot bayesian regression with polynomial features'))
+         gr.Markdown(
+             """
+ # Details
+ Here we fit a high-degree polynomial (set by the slider above), which could potentially overfit, though the Bayesian linear models regularize the size of the polynomial coefficients.
+ As fit_intercept=True by default for ARDRegression and BayesianRidge, PolynomialFeatures should not introduce an additional bias feature. By setting return_std=True,
+ the Bayesian regressors return the standard deviation of the posterior distribution for the model parameters.
+ """)
+
+
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ scikit-learn==1.2.2
+ matplotlib==3.5.1
+ numpy==1.21.6