"""Data pre-processing functions.""" import numpy from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer def _get_pipeline_replace_one_hot(func, value): return Pipeline([ ("replace", FunctionTransformer( func, kw_args={"value": value}, feature_names_out='one-to-one', )), ("one_hot", OneHotEncoder(),), ]) def _replace_values_geq(column, value): return numpy.where(column >= value, f"{value}_or_more", column) def _replace_values_eq(column, value): for desired_value, values_to_replace in value.items(): column = numpy.where(numpy.isin(column, values_to_replace), desired_value, column) return column def get_pre_processors(): pre_processor_user = ColumnTransformer( transformers=[ ( "replace_num_children", _get_pipeline_replace_one_hot(_replace_values_geq, 2), ['Num_children'] ), ( "replace_num_family", _get_pipeline_replace_one_hot(_replace_values_geq, 3), ['Num_family'] ), ( "replace_income_type", _get_pipeline_replace_one_hot(_replace_values_eq, {"State servant": ["Pensioner", "Student"]}), ['Income_type'] ), ( "replace_education_type", _get_pipeline_replace_one_hot(_replace_values_eq, {"Higher education": ["Academic degree"]}), ['Education_type'] ), ( "replace_occupation_type_labor", _get_pipeline_replace_one_hot( _replace_values_eq, { "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-skill Laborers", "Security staff", "Waiters/barmen staff"], "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"], "High_tech_work": ["Managers", "High skill tech staff", "IT staff"], }, ), ['Occupation_type'] ), ('one_hot_housing_fam_status', OneHotEncoder(), ['Housing_type', 'Family_status']), ('qbin_total_income', KBinsDiscretizer(n_bins=3, strategy='quantile', encode="onehot"), ['Total_income']), ('bin_age', KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"), ['Age']), ], remainder='passthrough', verbose_feature_names_out=False, ) pre_processor_third_party = ColumnTransformer( transformers=[ ('bin_years_employed', KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"), ['Years_employed']) ], remainder='passthrough', verbose_feature_names_out=False, ) return pre_processor_user, pre_processor_third_party def select_and_pop_features(data, columns): new_data = data[columns].copy() data.drop(columns, axis=1, inplace=True) return new_data