import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
plt.rcParams['figure.dpi'] = 100
plt.style.use('ggplot')
from sklearn.linear_model import HuberRegressor, Ridge
import gradio as gr
C1, C2, C3 = '#ff0000', '#09bd00', '#0000ff'
#=====================================================
def create_plot(outlier_ratio=0.1, epsilon=1.35):
    num_samples = 100

    # Inlier data: points along y = 2x + 2 with Gaussian noise.
    x = np.linspace(-15, 15, num_samples)
    y = 2*x + 2 + np.random.normal(loc=0, scale=2.5, size=x.shape[0])

    # Inject outliers: half below the line on the right, half above it on the left.
    num_outliers = int(num_samples * outlier_ratio) // 2
    outliers_x = np.random.normal(loc=11, scale=1, size=num_outliers)
    outliers_y = np.random.normal(loc=-30, scale=4, size=num_outliers)
    x = np.concatenate([x, outliers_x])
    y = np.concatenate([y, outliers_y])

    outliers_x = np.random.normal(loc=-11, scale=1, size=num_outliers)
    outliers_y = np.random.normal(loc=30, scale=4, size=num_outliers)
    x = np.concatenate([x, outliers_x])
    y = np.concatenate([y, outliers_y])

    X = x[..., None]
    # Fit an unregularized Ridge model (plain least squares) and a Huber model.
    ridge = Ridge(alpha=0)
    ridge.fit(X, y)
    huber = HuberRegressor(epsilon=epsilon)
    huber.fit(X, y)

    # Plot the data together with both fitted lines.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(x, y, c=C1, edgecolor='k', s=40)

    line_x = np.linspace(-15, 15, 10)
    ax.plot(line_x, ridge.coef_*line_x + ridge.intercept_, c=C2, label='Ridge')
    ax.plot(line_x, huber.coef_*line_x + huber.intercept_, c=C3, label='Huber')

    ax.set_xlabel('X'); ax.set_ylabel('Y')
    ax.legend()
    ax.set_title('Huber Regressor vs Ridge Regressor with Outliers')
    return fig
info = '''
# Robustness Against Outliers: Huber vs Ridge Regression

This example demonstrates a simple linear regression problem in the presence of outliers and compares the robustness of Huber regression with that of Ridge regression.

[Ridge regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html), which is ordinary least-squares linear regression with an L2 penalty on the coefficients (the penalty is disabled here by setting alpha=0), suffers from outliers: the squared loss grows quadratically with the residual, so outlying points dominate the total loss and pull the best-fit line towards them.

[Huber regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html) uses the Huber loss instead of the squared (L2) loss. The Huber loss is quadratic when the residual is small and linear when it is large, so outlying points contribute far less to the total loss than they would under a purely quadratic loss.

The epsilon parameter controls the cut-off between the quadratic and linear regions of the Huber loss. Use the sliders to increase the outlier ratio and see when the Huber regressor breaks down, and how the value of epsilon affects that.

Created by [huabdul](https://huggingface.co/huabdul) based on the [scikit-learn docs](https://scikit-learn.org/stable/auto_examples/linear_model/plot_huber_vs_ridge.html#sphx-glr-auto-examples-linear-model-plot-huber-vs-ridge-py).
'''
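# Illustrative sketch (not used by the demo's logic): the Huber loss described in the
# text above, in its standard textbook form. Note that sklearn's HuberRegressor also
# estimates a scale parameter sigma, which this simplified version omits.
def huber_loss(residual, epsilon=1.35):
    """Quadratic for |residual| <= epsilon, linear beyond it."""
    r = np.abs(residual)
    return np.where(r <= epsilon, 0.5 * r**2, epsilon * (r - 0.5 * epsilon))

# For example, a residual of 10 costs huber_loss(10) = 1.35 * (10 - 0.675) ~= 12.59,
# versus 0.5 * 10**2 = 50 under the squared loss, so outliers are weighted less heavily.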
with gr.Blocks(analytics_enabled=False) as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown(info)
            s_outlier_ratio = gr.Slider(0.01, 0.5, value=0.15, step=0.01, label='Outlier Ratio')
            s_epsilon = gr.Slider(1, 2, 1.35, step=0.005, label='Epsilon')
        with gr.Column():
            plot = gr.Plot(label='Comparison')

    # Re-draw the plot whenever a slider changes, and once on page load.
    s_outlier_ratio.change(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
    s_epsilon.change(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
    demo.load(create_plot, inputs=[s_outlier_ratio, s_epsilon], outputs=[plot])
demo.launch()
#=====================================================