File size: 3,285 Bytes
9a997e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""Data pre-processing functions."""

import numpy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer


def _get_pipeline_replace_one_hot(func, value):
    """Build a two-step pipeline: value replacement followed by one-hot encoding.

    Args:
        func: Callable wrapped in a ``FunctionTransformer``; it is invoked with
            ``value`` supplied as the keyword argument ``value``.
        value: Configuration forwarded to ``func`` (a threshold or a mapping,
            depending on which replacement function is used).

    Returns:
        An unfitted ``Pipeline`` with the steps ``"replace"`` and ``"one_hot"``.
    """
    replace_step = FunctionTransformer(
        func,
        kw_args={"value": value},
        feature_names_out='one-to-one',
    )
    encode_step = OneHotEncoder()
    return Pipeline(steps=[("replace", replace_step), ("one_hot", encode_step)])


def _replace_values_geq(column, value):
    """Replace every entry of ``column`` that is ``>= value`` with a label.

    The label is the string ``"<value>_or_more"``; entries below the threshold
    are taken from ``column`` as-is. The result is whatever ``numpy.where``
    produces for these operands.
    """
    at_or_above = column >= value
    label = f"{value}_or_more"
    return numpy.where(at_or_above, label, column)

def _replace_values_eq(column, value):
    """Collapse groups of values in ``column`` into single replacement values.

    Args:
        column: Array-like of values to rewrite.
        value: Mapping of ``replacement -> list of values to replace``.

    Returns:
        The rewritten data. Mapping entries are applied one after another in
        iteration order, so a later entry also sees replacements made by an
        earlier one. With an empty mapping ``column`` is returned unchanged.
    """
    for replacement, targets in value.items():
        hits = numpy.isin(column, targets)
        column = numpy.where(hits, replacement, column)
    return column

def get_pre_processors():
    """Create the two column transformers used for feature pre-processing.

    Returns:
        A tuple ``(pre_processor_user, pre_processor_third_party)`` of unfitted
        ``ColumnTransformer`` objects. Both are configured with
        ``remainder='passthrough'`` and ``verbose_feature_names_out=False``.
    """
    # Category groupings: each key replaces all of the values listed under it.
    income_groups = {"State servant": ["Pensioner", "Student"]}
    education_groups = {"Higher education": ["Academic degree"]}
    occupation_groups = {
        "Labor_work": [
            "Cleaning staff",
            "Cooking staff",
            "Drivers",
            "Laborers",
            "Low-skill Laborers",
            "Security staff",
            "Waiters/barmen staff",
        ],
        "Office_work": [
            "Accountants",
            "Core staff",
            "HR staff",
            "Medicine staff",
            "Private service staff",
            "Realty agents",
            "Sales staff",
            "Secretaries",
        ],
        "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
    }

    # Options shared by both column transformers.
    shared_options = {
        "remainder": 'passthrough',
        "verbose_feature_names_out": False,
    }

    # Order matters: it determines the order of the output columns.
    user_transformers = [
        (
            "replace_num_children",
            _get_pipeline_replace_one_hot(_replace_values_geq, 2),
            ['Num_children'],
        ),
        (
            "replace_num_family",
            _get_pipeline_replace_one_hot(_replace_values_geq, 3),
            ['Num_family'],
        ),
        (
            "replace_income_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, income_groups),
            ['Income_type'],
        ),
        (
            "replace_education_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, education_groups),
            ['Education_type'],
        ),
        (
            "replace_occupation_type_labor",
            _get_pipeline_replace_one_hot(_replace_values_eq, occupation_groups),
            ['Occupation_type'],
        ),
        (
            'one_hot_housing_fam_status',
            OneHotEncoder(),
            ['Housing_type', 'Family_status'],
        ),
        (
            'qbin_total_income',
            KBinsDiscretizer(n_bins=3, strategy='quantile', encode="onehot"),
            ['Total_income'],
        ),
        (
            'bin_age',
            KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"),
            ['Age'],
        ),
    ]
    third_party_transformers = [
        (
            'bin_years_employed',
            KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"),
            ['Years_employed'],
        ),
    ]

    pre_processor_user = ColumnTransformer(
        transformers=user_transformers,
        **shared_options,
    )
    pre_processor_third_party = ColumnTransformer(
        transformers=third_party_transformers,
        **shared_options,
    )
    return pre_processor_user, pre_processor_third_party


def select_and_pop_features(data, columns):
    """Move ``columns`` out of ``data`` and return them as a separate copy.

    Args:
        data: Frame to take the columns from; it is modified in place, losing
            the selected columns.
        columns: Column label(s) to extract.

    Returns:
        A copy of ``data[columns]`` taken before the columns were dropped.
    """
    extracted = data[columns].copy()
    data.drop(columns=columns, inplace=True)
    return extracted