File size: 3,407 Bytes
747c295
 
 
 
 
 
 
 
9a997e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993f2a6
9a997e4
993f2a6
9a997e4
 
 
993f2a6
9a997e4
 
 
 
 
 
 
 
 
 
 
 
993f2a6
9a997e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a241bb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Data pre-processing functions.

The pre-processing steps are heavily inspired by the following notebook:
https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml

Additional steps, mostly renaming some values or features, were added for a better user
experience.
"""

import numpy
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer


def _get_pipeline_replace_one_hot(func, value):
    """Build a two-step pipeline: value replacement followed by one-hot encoding.

    Args:
        func: Callable applied by the first step. It is invoked by
            ``FunctionTransformer`` with the input data and ``value`` passed as
            the keyword argument ``value``.
        value: Object forwarded to ``func`` through ``kw_args``.

    Returns:
        Pipeline: An unfitted pipeline whose steps are named ``"replace"`` and
        ``"one_hot"``.
    """
    # 'one-to-one' makes the replacement step report its input feature names
    # unchanged as its output feature names.
    replace_step = FunctionTransformer(
        func,
        kw_args={"value": value},
        feature_names_out='one-to-one',
    )
    encode_step = OneHotEncoder()

    steps = [
        ("replace", replace_step),
        ("one_hot", encode_step),
    ]
    return Pipeline(steps)


def _replace_values_geq(column, value):
    """Label every entry that is greater than or equal to ``value``.

    Args:
        column: Array-like data supporting ``column >= value``.
        value: Threshold; it is also used to build the replacement label
            ``"<value>_or_more"``.

    Returns:
        The result of ``numpy.where``: entries at or above the threshold are
        replaced by the label, all other entries are taken from ``column``.
    """
    at_or_above = column >= value
    label = f"{value}_or_more"
    return numpy.where(at_or_above, label, column)

def _replace_values_eq(column, value):
    """Rename groups of entries according to a replacement mapping.

    Args:
        column: Array-like data whose entries may be replaced.
        value: Mapping from a replacement value to the collection of entries
            that should be replaced by it. Items are applied in the mapping's
            iteration order, each one on the output of the previous one.

    Returns:
        The data after all replacements. When ``value`` is empty, ``column``
        itself is returned unchanged.
    """
    replaced = column
    for new_label, old_labels in value.items():
        to_rename = numpy.isin(replaced, old_labels)
        replaced = numpy.where(to_rename, new_label, replaced)
    return replaced

def get_pre_processors():
    """Create the two unfitted column pre-processors.

    The first transformer handles the columns ``Num_children``,
    ``Household_size``, ``Income_type``, ``Education_type``,
    ``Occupation_type``, ``Housing_type``, ``Family_status``, ``Total_income``
    and ``Age``. The second one only bins ``Years_employed``. In both cases the
    remaining columns are passed through untouched and output feature names are
    not prefixed with the transformer name.

    Returns:
        tuple: ``(pre_processor_user, pre_processor_third_party)``, two
        ``ColumnTransformer`` instances.
    """
    # Mapping from the coarse occupation category to the raw values it absorbs.
    occupation_groups = {
        "Labor_work": ["Cleaning staff", "Cooking staff", "Drivers", "Laborers", "Low-wage laborers", "Security staff", "Waiters/barmen staff"],
        "Office_work": ["Accountants", "Core staff", "HR staff", "Medicine staff", "Private service staff", "Realty agents", "Sales staff", "Secretaries"],
        "High_tech_work": ["Managers", "High skill tech staff", "IT staff"],
    }

    # Options shared by both column transformers.
    shared_options = {
        "remainder": 'passthrough',
        "verbose_feature_names_out": False,
    }

    user_transformers = [
        # Counts at or above the threshold are merged into one "<n>_or_more" category.
        (
            "replace_num_children",
            _get_pipeline_replace_one_hot(_replace_values_geq, 2),
            ['Num_children'],
        ),
        (
            "replace_household_size",
            _get_pipeline_replace_one_hot(_replace_values_geq, 3),
            ['Household_size'],
        ),
        # Selected categorical values are merged under a single name before encoding.
        (
            "replace_income_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, {"Public Sector": ["Retired", "Student"]}),
            ['Income_type'],
        ),
        (
            "replace_education_type",
            _get_pipeline_replace_one_hot(_replace_values_eq, {"Higher education": ["Academic degree"]}),
            ['Education_type'],
        ),
        (
            "replace_occupation_type_labor",
            _get_pipeline_replace_one_hot(_replace_values_eq, occupation_groups),
            ['Occupation_type'],
        ),
        # Plain one-hot encoding, no value replacement.
        (
            'one_hot_housing_fam_status',
            OneHotEncoder(),
            ['Housing_type', 'Family_status'],
        ),
        # Numeric columns are discretized and one-hot encoded.
        (
            'qbin_total_income',
            KBinsDiscretizer(n_bins=3, strategy='quantile', encode="onehot"),
            ['Total_income'],
        ),
        (
            'bin_age',
            KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"),
            ['Age'],
        ),
    ]

    third_party_transformers = [
        (
            'bin_years_employed',
            KBinsDiscretizer(n_bins=5, strategy='uniform', encode="onehot"),
            ['Years_employed'],
        ),
    ]

    pre_processor_user = ColumnTransformer(
        transformers=user_transformers,
        **shared_options,
    )
    pre_processor_third_party = ColumnTransformer(
        transformers=third_party_transformers,
        **shared_options,
    )

    return pre_processor_user, pre_processor_third_party