|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GroupingRareValues():
    """
    Grouping the observations that show rare labels into a unique category ('rare')

    Parameters
    ----------
    mapping : list of dict, optional
        Pre-computed mapping, one entry per column, each of the form
        ``{'col': <column name>, 'mapping': <pd.Series label -> new label>,
        'data_type': <dtype>}``. When given, ``fit`` keeps it as-is instead
        of learning a new one.
    cols : list, optional
        Columns to process. ``None`` (the default) means every column of the
        frame passed to ``fit``.
    threshold : float, default 0.01
        Labels whose relative frequency (count divided by the total number
        of rows) is below this value are replaced by ``'rare'``.
    """

    def __init__(self, mapping=None, cols=None, threshold=0.01):
        self.cols = cols
        self.mapping = mapping
        self._dim = None  # number of columns seen by fit(); None until fitted
        self.threshold = threshold

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values. Not used.

        Returns
        -------
        self : encoder
            Returns self.
        """
        self._dim = X.shape[1]

        # NOTE: if self.mapping is already set (passed to the constructor or
        # learned by a previous fit) it is returned unchanged by grouping().
        _, categories = self.grouping(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Will use the mapping (if available) and the column list to encode the
        data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : Transformed values with encoding applied.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if X does not have the
            same number of columns as the data used in ``fit``.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.grouping(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )

        return X

    def grouping(self, X_in, threshold, mapping=None, cols=None):
        """
        Grouping the observations that show rare labels into a unique category ('rare')

        With ``mapping`` given, the mapping is applied to a copy of ``X_in``
        (transform mode). Without it, a mapping is learned from ``X_in`` for
        every column in ``cols`` (fit mode) and the copy is returned
        unmodified.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; never modified in place.
        threshold : float
            Minimum relative frequency for a label to be kept.
        mapping : list of dict, optional
            Previously learned mapping to apply.
        cols : list, optional
            Columns to learn a mapping for; ``None`` means all columns.

        Returns
        -------
        X : pandas.DataFrame
            Copy of ``X_in``, with the mapping applied in transform mode.
        mapping_out : list of dict
            The mapping that was applied or learned.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:  # transform mode
            mapping_out = mapping
            for entry in mapping:
                column = entry.get('col')
                # Labels that are absent from the mapping (not seen during
                # fit) come out of Series.map as NaN.
                X[column] = X[column].map(entry['mapping'])
        else:  # fit mode
            if cols is None:
                # The constructor default is None: fall back to all columns
                # instead of failing on iteration over None.
                cols = X.columns

            mapping_out = []
            for col in cols:
                # Relative frequency of each label; missing values are not
                # counted as a label but do count towards len(X).
                temp_df = pd.Series(X[col].value_counts() / len(X))
                # Computed once per column rather than once per label.
                frequent_labels = temp_df[temp_df >= threshold].index
                col_mapping = {
                    k: ('rare' if k not in frequent_labels else k)
                    for k in temp_df.index
                }

                col_mapping = pd.Series(col_mapping)
                mapping_out.append({'col': col, 'mapping': col_mapping, 'data_type': X[col].dtype})

        return X, mapping_out
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ModeImputation():
    """
    Replacing the rare label by most frequent label

    Parameters
    ----------
    mapping : list of dict, optional
        Pre-computed mapping, one entry per column, each of the form
        ``{'col': <column name>, 'mapping': <pd.Series label -> new label>,
        'data_type': <dtype>}``. When given, ``fit`` keeps it as-is instead
        of learning a new one.
    cols : list, optional
        Columns to process. ``None`` (the default) means every column of the
        frame passed to ``fit``.
    threshold : float, default 0.01
        Labels whose relative frequency (count divided by the total number
        of rows) is below this value are replaced by the column's most
        frequent label.
    """

    def __init__(self, mapping=None, cols=None, threshold=0.01):
        self.cols = cols
        self.mapping = mapping
        self._dim = None  # number of columns seen by fit(); None until fitted
        self.threshold = threshold

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values. Not used.

        Returns
        -------
        self : encoder
            Returns self.
        """
        self._dim = X.shape[1]

        # NOTE: if self.mapping is already set (passed to the constructor or
        # learned by a previous fit) it is returned unchanged by
        # impute_with_mode().
        _, categories = self.impute_with_mode(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Will use the mapping (if available) and the column list to encode the
        data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : Transformed values with encoding applied.

        Raises
        ------
        ValueError
            If the encoder has not been fitted, or if X does not have the
            same number of columns as the data used in ``fit``.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.impute_with_mode(
            X,
            mapping=self.mapping,
            cols=self.cols,
            threshold=self.threshold
        )

        return X

    def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
        """
        Replacing the observations that show rare labels by the most frequent label

        With ``mapping`` given, the mapping is applied to a copy of ``X_in``
        (transform mode). Without it, a mapping is learned from ``X_in`` for
        every column in ``cols`` (fit mode) and the copy is returned
        unmodified.

        Parameters
        ----------
        X_in : pandas.DataFrame
            Input data; never modified in place.
        threshold : float
            Minimum relative frequency for a label to be kept.
        mapping : list of dict, optional
            Previously learned mapping to apply.
        cols : list, optional
            Columns to learn a mapping for; ``None`` means all columns.

        Returns
        -------
        X : pandas.DataFrame
            Copy of ``X_in``, with the mapping applied in transform mode.
        mapping_out : list of dict
            The mapping that was applied or learned.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:  # transform mode
            mapping_out = mapping
            for entry in mapping:
                column = entry.get('col')
                # Labels that are absent from the mapping (not seen during
                # fit) come out of Series.map as NaN.
                X[column] = X[column].map(entry['mapping'])
        else:  # fit mode
            if cols is None:
                # The constructor default is None: fall back to all columns
                # instead of failing on iteration over None.
                cols = X.columns

            mapping_out = []
            for col in cols:
                # Relative frequency of each label; missing values are not
                # counted as a label but do count towards len(X).
                temp_df = pd.Series(X[col].value_counts() / len(X))
                # Computed once per column rather than once per label.
                frequent_labels = temp_df[temp_df >= threshold].index

                # First value returned by Series.mode(). A column without any
                # non-missing value has no mode; its mapping is empty anyway,
                # so the placeholder is never used.
                modes = X[col].mode()
                most_frequent = modes.iloc[0] if len(modes) > 0 else None

                col_mapping = {
                    k: (most_frequent if k not in frequent_labels else k)
                    for k in temp_df.index
                }

                col_mapping = pd.Series(col_mapping)
                mapping_out.append({'col': col, 'mapping': col_mapping, 'data_type': X[col].dtype})

        return X, mapping_out
|
|
|