Spaces:
Runtime error
Runtime error
import pandas as pd | |
CAT_COLUMNS = ['currency', 'operation_kind', 'card_type', | |
'operation_type', 'operation_type_group', 'ecommerce_flag', | |
'payment_system', 'income_flag', 'mcc', 'country', 'city', | |
'mcc_category', 'day_of_week', 'hour','weekofyear'] | |
NUMERIC_COLUMNS = ['days_before', 'hour_diff'] | |
REAL_COLUMNS = ['amnt'] | |
def __amnt_pivot_table_by_column_as_frame(frame, column, agg_funcs=None) -> pd.DataFrame: | |
""" | |
Generates pivot table for `app_id` and a specified column by aggregating `amnt` column | |
:param frame: pd.DataFrame containing card transactions | |
:param column: column with keys to group by on the pivot table column | |
:param agg_funcs: list of aggregation functions, default is ['sum', 'mean', 'count'] | |
:return: pd.DataFrame pivot table | |
""" | |
if agg_funcs is None: | |
agg_funcs = ['sum', 'mean', 'count'] | |
aggs = pd.pivot_table(frame, values='amnt', | |
index=['app_id'], columns=[column], | |
aggfunc={'amnt': agg_funcs}, | |
fill_value=0) | |
aggs.columns = [f'amnt_{col[0]}_{column}_{col[1]}' for col in aggs.columns.values] | |
return aggs | |
def extract_basic_aggregations(transactions_frame: pd.DataFrame, cat_columns=None, agg_funcs=None) -> pd.DataFrame: | |
""" | |
Extracts basic features from a card transaction dataframe | |
:param transactions_frame: pd.DataFrame containing card transactions | |
:param cat_columns: list of categorical columns for which we want to aggregate `amnt`, default is all | |
:param agg_funcs: list of aggregation functions for cat_columns, default is | |
['sum', 'mean', 'count'] | |
:return: pd.DataFrame with extracted features | |
""" | |
if not cat_columns: | |
cat_columns = CAT_COLUMNS | |
if not agg_funcs: | |
agg_funcs = ['sum', 'mean', 'count'] | |
pivot_tables = [] | |
for col in cat_columns: | |
pivot_tables.append(__amnt_pivot_table_by_column_as_frame(transactions_frame, column=col, | |
agg_funcs=agg_funcs)) | |
pivot_tables = pd.concat(pivot_tables, axis=1) | |
# we will also generate total statistics grouped by app_id | |
aggs = { | |
# transation amount | |
'amnt': ['max', 'min', 'mean', 'median', 'sum', 'std'], | |
# time difference between transactions | |
'hour_diff': ['max', 'mean', 'median', 'var', 'std'], | |
# days left before application at the moment when transaction took place | |
'days_before': ['min', 'max', 'median']} | |
numeric_stats = transactions_frame.groupby(['app_id']).agg(aggs) | |
numeric_stats.columns = numeric_stats.columns.map('_'.join) | |
return pd.concat([pivot_tables, numeric_stats], axis=1).reset_index() | |