import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
plt.rcParams['figure.dpi'] = 100
plt.style.use('ggplot')
from sklearn.linear_model import HuberRegressor, Ridge
import gradio as gr
C1, C2, C3 = '#ff0000', '#09bd00', '#0000ff'
#=====================================================
def create_plot(outlier_ratio=0.1, epsilon=1.35):
    num_samples = 100

    # Inlier data: a noisy line y = 2x + 2.
    x = np.linspace(-15, 15, num_samples)
    y = 2*x + 2 + np.random.normal(loc=0, scale=2.5, size=x.shape[0])

    # Add two clusters of outliers, one in each corner of the plot.
    num_outliers = int(num_samples * outlier_ratio) // 2
    outliers_x = np.random.normal(loc=11, scale=1, size=num_outliers)
    outliers_y = np.random.normal(loc=-30, scale=4, size=num_outliers)
    x = np.concatenate([x, outliers_x])
    y = np.concatenate([y, outliers_y])

    outliers_x = np.random.normal(loc=-11, scale=1, size=num_outliers)
    outliers_y = np.random.normal(loc=30, scale=4, size=num_outliers)
    x = np.concatenate([x, outliers_x])
    y = np.concatenate([y, outliers_y])

    X = x[..., None]
    # Fit both regressors. alpha=0 disables Ridge's penalty, so it is plain
    # least squares; epsilon controls where the Huber loss switches from
    # quadratic to linear.
    ridge = Ridge(alpha=0)
    ridge.fit(X, y)
    huber = HuberRegressor(epsilon=epsilon)
    huber.fit(X, y)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x, y, c=C1, edgecolor='k', s=40)

    # Plot the two fitted lines over the data range.
    line_x = np.linspace(-15, 15, 10)
    ax.plot(line_x, ridge.coef_*line_x + ridge.intercept_, c=C2, label='Ridge')
    ax.plot(line_x, huber.coef_*line_x + huber.intercept_, c=C3, label='Huber')

    ax.set_xlabel('X'); ax.set_ylabel('Y')
    ax.legend()
    ax.set_title('Huber Regressor vs Ridge Regressor with Outliers')
    return fig
info = '''
# Robustness Against Outliers: Huber vs Ridge Regression
This example demonstrates a simple linear regression problem in the presence of outliers and compares the robustness of Huber regression with that of Ridge regression.

[Ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) is essentially ordinary L2 linear regression with a regularization term (the regularization is disabled here). It suffers from outliers because the outlying points contribute heavily to the squared loss, pulling the best-fit line towards them in order to reduce that loss.

[Huber regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html) uses the Huber loss instead of the L2 loss. The Huber loss behaves quadratically when the error is small and linearly when the error is large, so the loss contributed by outlying points is weighed less heavily than it would be under a purely quadratic loss.

The epsilon parameter controls the cut-off point between the quadratic and linear regions of the Huber loss. Use the sliders to increase the outlier ratio and observe when the Huber regressor breaks down, and how the value of epsilon affects that.

Created by [huabdul](https://huggingface.co/huabdul) based on the [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/linear_model/plot_huber_vs_ridge.html#sphx-glr-auto-examples-linear-model-plot-huber-vs-ridge-py).
'''
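
# The Huber loss described in the Markdown above can be written out directly:
# it is quadratic for residuals with |r| <= epsilon and linear beyond that point.
# The helper below is only an illustrative sketch and is not used by the demo;
# HuberRegressor applies the same loss internally (to scaled residuals, together
# with a fitted scale parameter and an L2 penalty on the weights).
def huber_loss(residual, epsilon=1.35):
    r = np.abs(residual)
    return np.where(r <= epsilon, 0.5 * r**2, epsilon * r - 0.5 * epsilon**2)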
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(info)
            s_outlier_ratio = gr.Slider(0.01, 0.5, value=0.15, step=0.01, label='Outlier Ratio')
            s_epsilon = gr.Slider(1, 2, 1.35, step=0.005, label='Epsilon')
        with gr.Column():
            plot = gr.Plot(label='Comparison')

    s_outlier_ratio.change(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
    s_epsilon.change(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
    demo.load(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])

demo.launch()
#=====================================================