feature-engineering-guide / feature_cleaning /rare_values.py

Upload 52 files

0ab7b0c verified over 1 year ago

7.76 kB

	import pandas as pd
	# import numpy as np
	# from warnings import warn

	# 2018.11.07 Created by Eamon.Zhang
	# 2018.11.12 change into fit() transform() format

	class GroupingRareValues():
	    """
	    Grouping the observations that show rare labels into a unique category ('rare')

	    A label counts as rare when its relative frequency in the data passed to
	    fit() (occurrences divided by the total number of rows) is below
	    `threshold`.

	    Parameters
	    ----------
	    mapping : list of dict, optional
	        Previously learned mapping, one dict per column with at least the
	        keys 'col' (column name) and 'mapping' (label -> replacement).
	        When supplied, fit() keeps it instead of learning a new one.
	    cols : list, optional
	        Names of the columns to process. None means every column of the
	        frame passed to fit().
	    threshold : float, default 0.01
	        Minimum relative frequency a label needs in order to be kept.
	    """

	    # Replacement label shared by the fit and the transform paths.
	    _RARE_LABEL = 'rare'

	    def __init__(self, mapping=None, cols=None, threshold=0.01):
	        self.cols = cols
	        self.mapping = mapping
	        self._dim = None  # number of columns seen in fit(); None until fitted
	        self.threshold = threshold

	    def fit(self, X, y=None, **kwargs):
	        """Fit encoder according to X and y.

	        Parameters
	        ----------
	        X : pandas.DataFrame, shape = [n_samples, n_features]
	            Training data from which the label frequencies are computed.
	        y : ignored
	            Accepted for API compatibility only.

	        Returns
	        -------
	        self : encoder
	            Returns self.
	        """
	        self._dim = X.shape[1]

	        _, categories = self.grouping(
	            X,
	            mapping=self.mapping,
	            cols=self.cols,
	            threshold=self.threshold,
	        )
	        self.mapping = categories
	        return self

	    def transform(self, X):
	        """Perform the transformation to new categorical data.

	        Applies the mapping learned in fit() (or given to the constructor)
	        to a copy of X.

	        Parameters
	        ----------
	        X : pandas.DataFrame, shape = [n_samples, n_features]

	        Returns
	        -------
	        X : pandas.DataFrame
	            Copy of the input with rare labels replaced by 'rare'.

	        Raises
	        ------
	        ValueError
	            If fit() has not been called, or if X does not have the same
	            number of columns as the frame used in fit().
	        """
	        if self._dim is None:
	            raise ValueError('Must train encoder before it can be used to transform data.')

	        # make sure that it is the right size
	        if X.shape[1] != self._dim:
	            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

	        X, _ = self.grouping(
	            X,
	            mapping=self.mapping,
	            cols=self.cols,
	            threshold=self.threshold,
	        )
	        return X

	    @staticmethod
	    def _fill_unseen(source, encoded, fill_value):
	        """Put `fill_value` where mapping lost a non-missing label of `source`."""
	        unseen = encoded.isna() & source.notna()
	        if not unseen.any():
	            return encoded
	        # Go through object dtype so the replacement can be stored whatever
	        # dtype the mapped column ended up with.
	        patched = encoded.astype(object)
	        patched[unseen] = fill_value
	        return patched.infer_objects()

	    def grouping(self, X_in, threshold, mapping=None, cols=None):
	        """
	        Grouping the observations that show rare labels into a unique category ('rare')

	        Works on a deep copy of `X_in`. When `mapping` is given it is applied
	        to the copy (transform); otherwise a new mapping is learned from the
	        data and the copy is returned unmodified (fit).

	        Parameters
	        ----------
	        X_in : pandas.DataFrame
	            Input data; never modified.
	        threshold : float
	            Minimum relative frequency for a label to be kept (fit only).
	        mapping : list of dict, optional
	            Mapping to apply; each entry needs the keys 'col' and 'mapping'.
	        cols : list, optional
	            Columns to learn a mapping for (fit only). None means all
	            columns of `X_in`.

	        Returns
	        -------
	        X : pandas.DataFrame
	            The (possibly transformed) copy of `X_in`.
	        mapping_out : list of dict
	            The mapping that was applied or learned; each learned entry has
	            the keys 'col', 'mapping' and 'data_type'.
	        """
	        X = X_in.copy(deep=True)

	        if mapping is not None:  # transform
	            for entry in mapping:
	                column = entry.get('col')  # get the column name
	                source = X[column]
	                # A label absent from the mapping was never seen in fit(),
	                # so it is rare by definition; missing values stay missing.
	                X[column] = self._fill_unseen(
	                    source, source.map(entry['mapping']), self._RARE_LABEL
	                )
	            return X, mapping

	        # fit
	        if cols is None:
	            cols = X.columns
	        n_rows = len(X)
	        mapping_out = []
	        for col in cols:
	            frequency = X[col].value_counts() / n_rows
	            # Decide once per column which labels are frequent enough.
	            is_frequent = frequency >= threshold
	            label_map = pd.Series({
	                label: (label if keep else self._RARE_LABEL)
	                for label, keep in is_frequent.items()
	            })
	            mapping_out.append({'col': col, 'mapping': label_map, 'data_type': X[col].dtype})

	        return X, mapping_out



	#==============================================================================
	# def rare_imputation(X_train, X_test, variable):
	#
	# # find the most frequent category
	# frequent_cat = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
	#
	# # find rare labels
	# temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
	# rare_cat = [x for x in temp.loc[temp<0.05].index.values]
	#
	# # create new variables, with Rare labels imputed
	#
	# # by the most frequent category
	# X_train[variable+'_freq_imp'] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])
	# X_test[variable+'_freq_imp'] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])
	#
	# # by adding a new label 'Rare'
	# X_train[variable+'_rare_imp'] = np.where(X_train[variable].isin(rare_cat), 'Rare', X_train[variable])
	# X_test[variable+'_rare_imp'] = np.where(X_test[variable].isin(rare_cat), 'Rare', X_test[variable])
	#==============================================================================

	# 2018.11.26 created by Eamon.Zhang
	class ModeImputation():
	    """
	    Replacing the rare label by most frequent label

	    A label counts as rare when its relative frequency in the data passed to
	    fit() (occurrences divided by the total number of rows) is below
	    `threshold`. Rare labels are replaced by the column's mode.

	    Parameters
	    ----------
	    mapping : list of dict, optional
	        Previously learned mapping, one dict per column with at least the
	        keys 'col' (column name) and 'mapping' (label -> replacement).
	        When supplied, fit() keeps it instead of learning a new one.
	    cols : list, optional
	        Names of the columns to process. None means every column of the
	        frame passed to fit().
	    threshold : float, default 0.01
	        Minimum relative frequency a label needs in order to be kept.
	    """

	    def __init__(self, mapping=None, cols=None, threshold=0.01):
	        self.cols = cols
	        self.mapping = mapping
	        self._dim = None  # number of columns seen in fit(); None until fitted
	        self.threshold = threshold

	    def fit(self, X, y=None, **kwargs):
	        """Fit encoder according to X and y.

	        Parameters
	        ----------
	        X : pandas.DataFrame, shape = [n_samples, n_features]
	            Training data from which label frequencies and modes are
	            computed.
	        y : ignored
	            Accepted for API compatibility only.

	        Returns
	        -------
	        self : encoder
	            Returns self.
	        """
	        self._dim = X.shape[1]

	        _, categories = self.impute_with_mode(
	            X,
	            mapping=self.mapping,
	            cols=self.cols,
	            threshold=self.threshold,
	        )
	        self.mapping = categories
	        return self

	    def transform(self, X):
	        """Perform the transformation to new categorical data.

	        Applies the mapping learned in fit() (or given to the constructor)
	        to a copy of X.

	        Parameters
	        ----------
	        X : pandas.DataFrame, shape = [n_samples, n_features]

	        Returns
	        -------
	        X : pandas.DataFrame
	            Copy of the input with rare labels replaced by the mode.

	        Raises
	        ------
	        ValueError
	            If fit() has not been called, or if X does not have the same
	            number of columns as the frame used in fit().
	        """
	        if self._dim is None:
	            raise ValueError('Must train encoder before it can be used to transform data.')

	        # make sure that it is the right size
	        if X.shape[1] != self._dim:
	            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

	        X, _ = self.impute_with_mode(
	            X,
	            mapping=self.mapping,
	            cols=self.cols,
	            threshold=self.threshold,
	        )
	        return X

	    @staticmethod
	    def _fill_unseen(source, encoded, fill_value):
	        """Put `fill_value` where mapping lost a non-missing label of `source`."""
	        unseen = encoded.isna() & source.notna()
	        if not unseen.any():
	            return encoded
	        # Go through object dtype so the replacement can be stored whatever
	        # dtype the mapped column ended up with.
	        patched = encoded.astype(object)
	        patched[unseen] = fill_value
	        return patched.infer_objects()

	    def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
	        """
	        Replace the labels that are rare in a column by that column's mode.

	        Works on a deep copy of `X_in`. When `mapping` is given it is applied
	        to the copy (transform); otherwise a new mapping is learned from the
	        data and the copy is returned unmodified (fit).

	        Parameters
	        ----------
	        X_in : pandas.DataFrame
	            Input data; never modified.
	        threshold : float
	            Minimum relative frequency for a label to be kept (fit only).
	        mapping : list of dict, optional
	            Mapping to apply; each entry needs the keys 'col' and 'mapping'.
	            Entries that also carry a 'mode' key use it for labels that are
	            absent from 'mapping'; without it such labels become missing.
	        cols : list, optional
	            Columns to learn a mapping for (fit only). None means all
	            columns of `X_in`.

	        Returns
	        -------
	        X : pandas.DataFrame
	            The (possibly transformed) copy of `X_in`.
	        mapping_out : list of dict
	            The mapping that was applied or learned; each learned entry has
	            the keys 'col', 'mapping', 'data_type' and, when the column has
	            at least one non-missing value, 'mode'.
	        """
	        X = X_in.copy(deep=True)

	        if mapping is not None:  # transform
	            for entry in mapping:
	                column = entry.get('col')  # get the column name
	                source = X[column]
	                encoded = source.map(entry['mapping'])
	                if 'mode' in entry:
	                    # A label never seen in fit() is rare by definition, so
	                    # it gets the mode too; missing values stay missing.
	                    encoded = self._fill_unseen(source, encoded, entry['mode'])
	                X[column] = encoded
	            return X, mapping

	        # fit
	        if cols is None:
	            cols = X.columns
	        n_rows = len(X)
	        mapping_out = []
	        for col in cols:
	            frequency = X[col].value_counts() / n_rows
	            # Decide once per column which labels are frequent enough.
	            is_frequent = frequency >= threshold
	            entry = {'col': col, 'data_type': X[col].dtype}

	            modes = X[col].mode()
	            if modes.empty:
	                # Column without any non-missing value: nothing to map.
	                label_map = {}
	            else:
	                most_frequent = modes.iloc[0]
	                label_map = {
	                    label: (label if keep else most_frequent)
	                    for label, keep in is_frequent.items()
	                }
	                entry['mode'] = most_frequent

	            entry['mapping'] = pd.Series(label_map)
	            mapping_out.append(entry)

	        return X, mapping_out