alex42t committed on
Commit
7c53b01
1 Parent(s): b0237ad

Upload 5 files

Files changed (5)
  1. app.py +56 -0
  2. features.py +66 -0
  3. requirements.txt +7 -0
  4. utils.py +38 -0
  5. xgb_cpu.joblib +3 -0
app.py ADDED
@@ -0,0 +1,56 @@
+ from joblib import load
+ import gradio as gr
+ import pandas as pd
+ import shap
+ import matplotlib.pyplot as plt
+ from features import extract_basic_aggregations
+ from os import listdir
+ from os.path import join
+
+ examples_path = './csv_examples/'
+ examples = [[join(examples_path, f), 'A'] for f in listdir(examples_path)]
+ model = load('xgb_cpu.joblib')
+
+ explainer = shap.TreeExplainer(model)
+ products = {'A': 0,
+             'B': 1,
+             'C': 2,
+             'D': 3,
+             'E': 4
+             }
+
+
+ def score_client(card_transactions_file, product: str):
+     df = pd.read_csv(card_transactions_file)
+     assert product in products
+
+     features = extract_basic_aggregations(df, cat_columns=['mcc_category', 'day_of_week', 'operation_type'])
+     features = features.reindex(columns=model.feature_names_in_, fill_value=0)
+     features['product'] = products[product]
+     # probability of the no-default class: higher means a better score
+     default_proba = model.predict_proba(features)[0][0]
+     shap.plots.waterfall(explainer(features)[0], max_display=14, show=False)
+     plt.tight_layout()
+     shap_fig = plt.gcf()
+     plt.close()
+     return default_proba, shap_fig
+
+
+ title = "Check your credit score"
+ description = "Upload a CSV of card transactions and pick a product to get a credit score with a SHAP explanation"
+
+ inputs = [gr.File(), gr.Dropdown(choices=list(products.keys()), value=list(products.keys())[0])]
+ outputs = [gr.Textbox(label='Your credit score (the more, the better)', interactive=False),
+            gr.Plot(label='SHAP')
+            ]
+
+ demo = gr.Interface(
+     fn=score_client,
+     inputs=inputs,
+     outputs=outputs,
+     allow_flagging='never',
+     examples=examples,
+     title=title,
+     description=description,
+ )
+ demo.launch()
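Since demo.launch() runs at module import time (there is no __main__ guard), the quickest smoke test is to paste the definitions above into a REPL and call score_client directly; the CSV filename below is hypothetical, standing in for any file from ./csv_examples/ with the expected transaction columns (app_id, amnt, mcc_category, day_of_week, operation_type, ...):

proba, fig = score_client('./csv_examples/example_client.csv', product='A')  # hypothetical example file
print(f'P(no default) = {proba:.3f}')
fig.savefig('shap_waterfall.png')  # persist the SHAP waterfall for inspection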
features.py ADDED
@@ -0,0 +1,66 @@
+ import pandas as pd
+
+ CAT_COLUMNS = ['currency', 'operation_kind', 'card_type',
+                'operation_type', 'operation_type_group', 'ecommerce_flag',
+                'payment_system', 'income_flag', 'mcc', 'country', 'city',
+                'mcc_category', 'day_of_week', 'hour', 'weekofyear']
+
+ NUMERIC_COLUMNS = ['days_before', 'hour_diff']
+
+ REAL_COLUMNS = ['amnt']
+
+
+ def __amnt_pivot_table_by_column_as_frame(frame, column, agg_funcs=None) -> pd.DataFrame:
+     """
+     Generates a pivot table over `app_id` and the specified column by aggregating the `amnt` column
+
+     :param frame: pd.DataFrame containing card transactions
+     :param column: column whose values become the pivot table columns
+     :param agg_funcs: list of aggregation functions, default is ['sum', 'mean', 'count']
+     :return: pd.DataFrame pivot table
+     """
+     if agg_funcs is None:
+         agg_funcs = ['sum', 'mean', 'count']
+     aggs = pd.pivot_table(frame, values='amnt',
+                           index=['app_id'], columns=[column],
+                           aggfunc={'amnt': agg_funcs},
+                           fill_value=0)
+     aggs.columns = [f'amnt_{col[0]}_{column}_{col[1]}' for col in aggs.columns.values]
+     return aggs
+
+
+ def extract_basic_aggregations(transactions_frame: pd.DataFrame, cat_columns=None, agg_funcs=None) -> pd.DataFrame:
+     """
+     Extracts basic features from a card-transactions dataframe
+
+     :param transactions_frame: pd.DataFrame containing card transactions
+     :param cat_columns: list of categorical columns for which to aggregate `amnt`, default is CAT_COLUMNS
+     :param agg_funcs: list of aggregation functions for cat_columns, default is
+         ['sum', 'mean', 'count']
+     :return: pd.DataFrame with extracted features
+     """
+     if not cat_columns:
+         cat_columns = CAT_COLUMNS
+
+     if not agg_funcs:
+         agg_funcs = ['sum', 'mean', 'count']
+
+     pivot_tables = []
+     for col in cat_columns:
+         pivot_tables.append(__amnt_pivot_table_by_column_as_frame(transactions_frame, column=col,
+                                                                   agg_funcs=agg_funcs))
+     pivot_tables = pd.concat(pivot_tables, axis=1)
+
+     # we will also generate total statistics grouped by app_id
+     aggs = {
+         # transaction amount
+         'amnt': ['max', 'min', 'mean', 'median', 'sum', 'std'],
+         # time difference between transactions
+         'hour_diff': ['max', 'mean', 'median', 'var', 'std'],
+         # days left before the application at the moment the transaction took place
+         'days_before': ['min', 'max', 'median']}
+
+     numeric_stats = transactions_frame.groupby(['app_id']).agg(aggs)
+     numeric_stats.columns = numeric_stats.columns.map('_'.join)
+
+     return pd.concat([pivot_tables, numeric_stats], axis=1).reset_index()
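As a sketch of the feature naming this module produces (the toy frame below is invented and carries only the columns the aggregations touch; real inputs have many more):

import pandas as pd
from features import extract_basic_aggregations

# two clients (app_id 1 and 2), three transactions
toy = pd.DataFrame({
    'app_id': [1, 1, 2],
    'amnt': [10.0, 20.0, 5.0],
    'hour_diff': [0, 4, 0],
    'days_before': [30, 12, 7],
    'mcc_category': ['food', 'travel', 'food'],
})
feats = extract_basic_aggregations(toy, cat_columns=['mcc_category'])
# pivot features combine the agg func, the pivoted column and its value
# (something like 'amnt_sum_mcc_category_food'); the grouped numeric stats
# get flat names such as 'amnt_max' and 'hour_diff_mean'
print(feats.columns.tolist())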
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ joblib==1.2.0
+ gradio==3.12.0
+ shap==0.41.0
+ matplotlib==3.5.3
+ jupytext==1.14.1
+ pandas==1.3.5
+ tqdm==4.64.0
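One pin that looks missing, assuming xgb_cpu.joblib is a pickled XGBoost estimator as its name suggests: joblib.load needs the xgboost package importable to unpickle the artifact, so the deployment presumably relies on it being preinstalled. A quick sanity check after pip install -r requirements.txt:

# fails with ModuleNotFoundError if xgboost is absent
# (assumption: the artifact is an XGBoost sklearn-API estimator)
from joblib import load
model = load('xgb_cpu.joblib')
print(type(model).__name__, len(model.feature_names_in_))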
utils.py ADDED
@@ -0,0 +1,38 @@
+ import os
+ import pandas as pd
+ from tqdm.auto import tqdm
+
+
+ def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
+                                     num_parts_to_read: int = 1, columns=None, verbose=False) -> pd.DataFrame:
+     """
+     Reads num_parts_to_read parquet partitions and returns the resulting pd.DataFrame
+
+     :param path_to_dataset: directory with parquet partitions
+     :param start_from: partition number to start with
+     :param num_parts_to_read: number of partitions to read; a negative value means all remaining
+     :param columns: columns to read and include
+     :param verbose: if True, print the paths of the chunks being read
+     :return: pd.DataFrame
+     """
+     res = []
+     dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
+                             if filename.startswith('part')])
+
+     start_from = max(0, start_from)
+     if num_parts_to_read < 0:
+         chunks = dataset_paths[start_from:]
+     else:
+         chunks = dataset_paths[start_from: start_from + num_parts_to_read]
+     if verbose:
+         print('Reading chunks:\n')
+         for chunk in chunks:
+             print(chunk)
+     for chunk_path in tqdm(chunks, desc="Reading dataset with pandas"):
+         chunk = pd.read_parquet(chunk_path, columns=columns)
+         for col_name, col_type in [('amnt', 'float32'), ('hour_diff', 'int32')]:  # downcast to save memory
+             if col_name in chunk.columns:
+                 chunk[col_name] = chunk[col_name].astype(col_type)
+
+         res.append(chunk)
+     return pd.concat(res).reset_index(drop=True)
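A usage sketch, with a hypothetical ./transactions/ directory holding partitions named part-0.parquet, part-1.parquet, and so on:

from utils import read_parquet_dataset_from_local

# read the first two partitions, projecting three columns
df = read_parquet_dataset_from_local('./transactions/', start_from=0, num_parts_to_read=2,
                                     columns=['app_id', 'amnt', 'hour_diff'], verbose=True)
print(df.shape)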
xgb_cpu.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:98109adf5d911c8839e61f88484823e7e71554c827b1d73485ef769c470bd39a
+ size 2865173
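Note that this blob is a Git LFS pointer rather than the serialized model itself; after cloning the repo with Git LFS installed, git lfs pull fetches the actual ~2.9 MB xgb_cpu.joblib binary.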