|
|
from numpy import nan, ndarray |
|
|
from pandas import DataFrame, concat |
|
|
from scipy.sparse import spmatrix |
|
|
from sklearn.compose import ColumnTransformer |
|
|
from sklearn.impute import SimpleImputer |
|
|
from sklearn.pipeline import Pipeline |
|
|
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder |
|
|
|
|
|
|
|
|
def preprocess_data(train_df: DataFrame, test_df: DataFrame) -> tuple[ndarray, ndarray]:
    """
    Pre-process data for modeling.

    Receives train and test dataframes, cleans them up, and returns ndarrays
    with feature engineering already performed. Every transformer is fit on
    the training data only and then applied to both splits, so no information
    leaks from the test set.

    Steps:
        1. Replace the ``DAYS_EMPLOYED`` sentinel value ``365243`` with NaN.
        2. Ordinal-encode binary (two-category) object columns.
        3. One-hot encode multi-category object columns.
        4. Median-impute all missing values.
        5. Min-max scale every feature to ``[0, 1]``.

    Args:
        train_df (DataFrame): The training dataframe.
        test_df (DataFrame): The test dataframe.

    Returns:
        tuple[ndarray, ndarray]: A tuple with the preprocessed train and test data as ndarrays
    """
    # Work on copies so the caller's dataframes are never mutated.
    aux_train_df = train_df.copy()
    aux_test_df = test_df.copy()

    # 365243 is this dataset's sentinel for "no employment record"; treat it
    # as missing so it does not distort the median imputation and scaling.
    aux_train_df["DAYS_EMPLOYED"] = aux_train_df["DAYS_EMPLOYED"].replace({365243: nan})
    aux_test_df["DAYS_EMPLOYED"] = aux_test_df["DAYS_EMPLOYED"].replace({365243: nan})

    # Split categorical columns by cardinality (measured on train only).
    categorical_cols = aux_train_df.select_dtypes(include="object").columns
    binary_cols = [col for col in categorical_cols if aux_train_df[col].nunique() == 2]
    multi_cols = [col for col in categorical_cols if aux_train_df[col].nunique() > 2]

    # Encode binary categoricals as 0/1. Categories unseen during fit (which
    # can occur in the test split) become NaN instead of raising — mirroring
    # the OneHotEncoder's handle_unknown="ignore" below — and are then filled
    # by the median imputer. Guard against an empty selection, which would
    # make OrdinalEncoder.fit raise.
    if binary_cols:
        ordinal_encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=nan,
        )
        ordinal_encoder.fit(aux_train_df[binary_cols])
        aux_train_df[binary_cols] = ordinal_encoder.transform(aux_train_df[binary_cols])
        aux_test_df[binary_cols] = ordinal_encoder.transform(aux_test_df[binary_cols])

    # One-hot encode multi-category columns; categories unseen during fit
    # simply yield all-zero indicator rows.
    if multi_cols:
        one_hot_encoder = OneHotEncoder(
            handle_unknown="ignore",
            sparse_output=False,
        )
        one_hot_encoder.fit(aux_train_df[multi_cols])
        ohe_train = one_hot_encoder.transform(aux_train_df[multi_cols])
        ohe_test = one_hot_encoder.transform(aux_test_df[multi_cols])

        ohe_cols = one_hot_encoder.get_feature_names_out(input_features=multi_cols)

        # Preserve the original row index so concat aligns rows correctly.
        ohe_train_df = DataFrame(data=ohe_train, columns=ohe_cols, index=aux_train_df.index)
        ohe_test_df = DataFrame(data=ohe_test, columns=ohe_cols, index=aux_test_df.index)

        # Swap the raw multi-category columns for their indicator columns.
        aux_train_df.drop(columns=multi_cols, inplace=True)
        aux_test_df.drop(columns=multi_cols, inplace=True)

        aux_train_df = concat([aux_train_df, ohe_train_df], axis=1)
        aux_test_df = concat([aux_test_df, ohe_test_df], axis=1)

    # Impute with train-set medians so both splits share the same fill values.
    imputer = SimpleImputer(strategy="median")
    imputer.fit(aux_train_df)

    imputer_train = imputer.transform(aux_train_df)
    imputer_test = imputer.transform(aux_test_df)

    # Rebuild dataframes (transform returns plain arrays) keeping column
    # names and row indices intact.
    aux_train_df = DataFrame(
        data=imputer_train,
        columns=aux_train_df.columns,
        index=aux_train_df.index,
    )
    aux_test_df = DataFrame(
        data=imputer_test,
        columns=aux_test_df.columns,
        index=aux_test_df.index,
    )

    # Scale to [0, 1] using train-set min/max only.
    scaler = MinMaxScaler()
    scaler.fit(aux_train_df)

    scaler_train = scaler.transform(aux_train_df)
    scaler_test = scaler.transform(aux_test_df)

    return scaler_train, scaler_test
|
|
|
|
|
|
|
|
def preprocess_data_pipeline(
    train_df: DataFrame, test_df: DataFrame
) -> tuple[ndarray | spmatrix, ndarray | spmatrix]:
    """
    Pre-process data for modeling using a single sklearn ColumnTransformer.

    Receives train and test dataframes, cleans them up, and returns the
    transformed feature matrices with feature engineering already performed.
    The whole preprocessor is fit on the training data only and then applied
    to both splits, so no information leaks from the test set.

    Column handling:
        - binary object columns: most-frequent impute -> ordinal encode -> min-max scale
        - multi-category object columns: most-frequent impute -> one-hot encode -> min-max scale
        - numeric columns: median impute -> min-max scale
        - anything else is passed through untouched (remainder="passthrough")

    Args:
        train_df (DataFrame): The training dataframe.
        test_df (DataFrame): The test dataframe.

    Returns:
        tuple[ndarray | spmatrix, ndarray | spmatrix]: A tuple with the
            preprocessed train and test data.
    """
    # Work on copies so the caller's dataframes are never mutated.
    aux_train_df = train_df.copy()
    aux_test_df = test_df.copy()

    # 365243 is this dataset's sentinel for "no employment record"; treat it
    # as missing so it does not distort the median imputation and scaling.
    aux_train_df["DAYS_EMPLOYED"] = aux_train_df["DAYS_EMPLOYED"].replace({365243: nan})
    aux_test_df["DAYS_EMPLOYED"] = aux_test_df["DAYS_EMPLOYED"].replace({365243: nan})

    # Column groups are decided from the train split only.
    numerical_cols = aux_train_df.select_dtypes(include="number").columns.to_list()
    categorical_cols = aux_train_df.select_dtypes(include="object").columns.to_list()

    binary_cols = [col for col in categorical_cols if aux_train_df[col].nunique() == 2]
    multi_cols = [col for col in categorical_cols if aux_train_df[col].nunique() > 2]

    numerical_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # The imputer runs BEFORE the encoder here, so a category unseen during
    # fit cannot be repaired downstream; map it to -1 instead of raising,
    # mirroring the OneHotEncoder's handle_unknown="ignore" below.
    binary_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            (
                "ordinal",
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            ),
            ("scaler", MinMaxScaler()),
        ]
    )

    multi_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
            ("scaler", MinMaxScaler()),
        ]
    )

    preprocessor = ColumnTransformer(
        transformers=[
            ("binary", binary_pipeline, binary_cols),
            ("multi", multi_pipeline, multi_cols),
            ("numerical", numerical_pipeline, numerical_cols),
        ],
        remainder="passthrough",
    )

    # Fit on train only, then transform both splits with the same parameters.
    preprocessor.fit(aux_train_df)
    train_preprocessed = preprocessor.transform(aux_train_df)
    test_preprocessed = preprocessor.transform(aux_test_df)

    return train_preprocessed, test_preprocessed
|
|
|