File size: 2,774 Bytes
7c53b01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import pandas as pd

# Categorical transaction columns; used as the default set of columns to
# pivot `amnt` over in `extract_basic_aggregations`.
CAT_COLUMNS = [
    'currency',
    'operation_kind',
    'card_type',
    'operation_type',
    'operation_type_group',
    'ecommerce_flag',
    'payment_system',
    'income_flag',
    'mcc',
    'country',
    'city',
    'mcc_category',
    'day_of_week',
    'hour',
    'weekofyear',
]

# Numeric columns of a transaction record.
# NOTE(review): not referenced in the visible part of this module -- presumably
# consumed by other code; confirm before changing.
NUMERIC_COLUMNS = [
    'days_before',
    'hour_diff',
]

# Real-valued columns of a transaction record (the transaction amount).
# NOTE(review): not referenced in the visible part of this module either.
REAL_COLUMNS = [
    'amnt',
]


def __amnt_pivot_table_by_column_as_frame(frame, column, agg_funcs=None) -> pd.DataFrame:
    """
    Builds a per-`app_id` pivot table of `amnt` aggregated over the values of `column`

    Rows are indexed by `app_id`; there is one output column per
    (aggregation function, value of `column`) pair, named
    `amnt_<function>_<column>_<value>`. Combinations that do not occur in
    `frame` are filled with 0.

    :param frame: pd.DataFrame containing card transactions
    :param column: name of the column whose values become the pivot table columns
    :param agg_funcs: list of aggregation functions, default is ['sum', 'mean', 'count']
    :return: pd.DataFrame pivot table with flattened column names
    """
    funcs = ['sum', 'mean', 'count'] if agg_funcs is None else agg_funcs

    pivoted = pd.pivot_table(
        frame,
        values='amnt',
        index=['app_id'],
        columns=[column],
        aggfunc={'amnt': funcs},
        fill_value=0,
    )

    # Collapse the (function, value) column tuples into flat string names.
    flat_names = []
    for key in pivoted.columns.values:
        flat_names.append(f'amnt_{key[0]}_{column}_{key[1]}')
    pivoted.columns = flat_names

    return pivoted


def extract_basic_aggregations(transactions_frame: pd.DataFrame, cat_columns=None, agg_funcs=None) -> pd.DataFrame:
    """
    Builds per-`app_id` aggregate features from a card transaction dataframe

    Two groups of features are joined column-wise on `app_id`:
    pivot tables of `amnt` over every column in `cat_columns`, and overall
    statistics of `amnt`, `hour_diff` and `days_before`.

    :param transactions_frame: pd.DataFrame containing card transactions
    :param cat_columns: list of categorical columns for which we want to aggregate `amnt`;
    when omitted or empty, CAT_COLUMNS is used
    :param agg_funcs: list of aggregation functions for cat_columns; when omitted or empty,
    ['sum', 'mean', 'count'] is used
    :return: pd.DataFrame with extracted features and the index reset into a regular column
    """
    cat_columns = cat_columns or CAT_COLUMNS
    agg_funcs = agg_funcs or ['sum', 'mean', 'count']

    # one `amnt` pivot table per categorical column, joined side by side
    per_category = pd.concat(
        [
            __amnt_pivot_table_by_column_as_frame(transactions_frame, column=name,
                                                  agg_funcs=agg_funcs)
            for name in cat_columns
        ],
        axis=1,
    )

    # overall statistics grouped by app_id
    overall_spec = {
        # transaction amount
        'amnt': ['max', 'min', 'mean', 'median', 'sum', 'std'],
        # time difference between transactions
        'hour_diff': ['max', 'mean', 'median', 'var', 'std'],
        # days left before application at the moment when transaction took place
        'days_before': ['min', 'max', 'median'],
    }
    grouped = transactions_frame.groupby(['app_id'])
    overall = grouped.agg(overall_spec)
    # flatten the (column, function) pairs into `<column>_<function>` names
    overall.columns = overall.columns.map('_'.join)

    features = pd.concat([per_category, overall], axis=1)
    return features.reset_index()