Upload 5 files

- app.py +56 -0
- features.py +66 -0
- requirements.txt +7 -0
- utils.py +38 -0
- xgb_cpu.joblib +3 -0
app.py
ADDED
@@ -0,0 +1,56 @@
from joblib import load
import gradio as gr
import pandas as pd
import shap
import matplotlib.pyplot as plt
from features import extract_basic_aggregations
from os import listdir
from os.path import join

# pair every example CSV with the default product 'A'
examples_path = './csv_examples/'
examples = [[join(examples_path, f), 'A'] for f in listdir(examples_path)]
model = load('xgb_cpu.joblib')

explainer = shap.TreeExplainer(model)
products = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}


def score_client(card_transactions_file, product: str):
    # gr.File passes a temp-file wrapper; fall back to a plain path so the
    # function can also be called directly with a filename
    path = getattr(card_transactions_file, 'name', card_transactions_file)
    df = pd.read_csv(path)
    assert product in products

    # aggregate the transactions into model features, then align them with
    # the columns the model was trained on (missing columns filled with 0)
    features = extract_basic_aggregations(df, cat_columns=['mcc_category', 'day_of_week', 'operation_type'])
    features = features.reindex(columns=model.feature_names_in_, fill_value=0)
    features['product'] = products[product]
    # probability of class 0, shown as the score (higher is better)
    default_proba = model.predict_proba(features)[0][0]

    # explain the single prediction with a SHAP waterfall plot
    shap.plots.waterfall(explainer(features)[0], max_display=14, show=False)
    plt.tight_layout()
    shap_fig = plt.gcf()
    plt.close()
    return default_proba, shap_fig


title = "Check your credit score"
description = "Upload a CSV of card transactions and pick a credit product to estimate your credit score."

inputs = [gr.File(), gr.Dropdown(choices=list(products.keys()), value=list(products.keys())[0])]
outputs = [gr.Textbox(label='Your credit score (the more, the better)', interactive=False),
           gr.Plot(label='SHAP')]

demo = gr.Interface(
    fn=score_client,
    inputs=inputs,
    outputs=outputs,
    allow_flagging='never',
    examples=examples,
    title=title,
    description=description,
)

if __name__ == '__main__':
    demo.launch()
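With demo.launch() behind the __main__ guard, score_client can be smoke-tested outside the Gradio UI. A minimal sketch, assuming 'client_transactions.csv' is a hypothetical transactions file you provide (importing app still loads xgb_cpu.joblib and lists csv_examples/, so both must be present):

# Hypothetical smoke test; 'client_transactions.csv' is a made-up path,
# not a file shipped with this Space.
from app import score_client

proba, fig = score_client('client_transactions.csv', product='A')
print(f'P(class 0) = {proba:.3f}')   # the value shown in the Textbox output
fig.savefig('shap_waterfall.png')    # the waterfall figure shown in the Plot output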
features.py
ADDED
@@ -0,0 +1,66 @@
import pandas as pd

CAT_COLUMNS = ['currency', 'operation_kind', 'card_type',
               'operation_type', 'operation_type_group', 'ecommerce_flag',
               'payment_system', 'income_flag', 'mcc', 'country', 'city',
               'mcc_category', 'day_of_week', 'hour', 'weekofyear']

NUMERIC_COLUMNS = ['days_before', 'hour_diff']

REAL_COLUMNS = ['amnt']


def __amnt_pivot_table_by_column_as_frame(frame, column, agg_funcs=None) -> pd.DataFrame:
    """
    Generates a pivot table over `app_id` and the specified column by aggregating the `amnt` column

    :param frame: pd.DataFrame containing card transactions
    :param column: column whose values become the pivot table columns
    :param agg_funcs: list of aggregation functions, default is ['sum', 'mean', 'count']
    :return: pd.DataFrame pivot table
    """
    if agg_funcs is None:
        agg_funcs = ['sum', 'mean', 'count']
    aggs = pd.pivot_table(frame, values='amnt',
                          index=['app_id'], columns=[column],
                          aggfunc={'amnt': agg_funcs},
                          fill_value=0)
    # flatten the (agg_func, column_value) MultiIndex into names like
    # 'amnt_sum_mcc_category_food'
    aggs.columns = [f'amnt_{col[0]}_{column}_{col[1]}' for col in aggs.columns.values]
    return aggs


def extract_basic_aggregations(transactions_frame: pd.DataFrame, cat_columns=None, agg_funcs=None) -> pd.DataFrame:
    """
    Extracts basic features from a card transactions dataframe

    :param transactions_frame: pd.DataFrame containing card transactions
    :param cat_columns: list of categorical columns over which to aggregate `amnt`, default is all of CAT_COLUMNS
    :param agg_funcs: list of aggregation functions for cat_columns, default is ['sum', 'mean', 'count']
    :return: pd.DataFrame with extracted features
    """
    if not cat_columns:
        cat_columns = CAT_COLUMNS

    if not agg_funcs:
        agg_funcs = ['sum', 'mean', 'count']

    pivot_tables = []
    for col in cat_columns:
        pivot_tables.append(__amnt_pivot_table_by_column_as_frame(transactions_frame, column=col,
                                                                  agg_funcs=agg_funcs))
    pivot_tables = pd.concat(pivot_tables, axis=1)

    # also generate overall statistics grouped by app_id
    aggs = {
        # transaction amount
        'amnt': ['max', 'min', 'mean', 'median', 'sum', 'std'],
        # time difference between consecutive transactions
        'hour_diff': ['max', 'mean', 'median', 'var', 'std'],
        # days left before the application at the moment the transaction took place
        'days_before': ['min', 'max', 'median']}

    numeric_stats = transactions_frame.groupby(['app_id']).agg(aggs)
    numeric_stats.columns = numeric_stats.columns.map('_'.join)

    return pd.concat([pivot_tables, numeric_stats], axis=1).reset_index()
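The feature naming scheme is easiest to see on a toy frame. A minimal sketch with made-up values, covering only the columns extract_basic_aggregations touches:

# Toy example (all values made up) showing the generated feature names.
import pandas as pd
from features import extract_basic_aggregations

df = pd.DataFrame({
    'app_id':       [1, 1, 1, 2],
    'amnt':         [10.0, 20.0, 5.0, 7.5],
    'hour_diff':    [0, 4, 12, 0],
    'days_before':  [30, 12, 3, 45],
    'mcc_category': ['food', 'food', 'travel', 'food'],
})

feats = extract_basic_aggregations(df, cat_columns=['mcc_category'])
print(feats.columns.tolist())
# e.g. ['app_id', 'amnt_count_mcc_category_food', 'amnt_count_mcc_category_travel',
#       'amnt_mean_mcc_category_food', ..., 'amnt_max', ..., 'days_before_median']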
requirements.txt
ADDED
@@ -0,0 +1,7 @@
joblib==1.2.0
gradio==3.12.0
shap==0.41.0
matplotlib==3.5.3
jupytext==1.14.1
pandas==1.3.5
tqdm==4.64.0
utils.py
ADDED
@@ -0,0 +1,38 @@
import os
import pandas as pd
from tqdm.auto import tqdm


def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                    num_parts_to_read: int = 1, columns=None, verbose=False) -> pd.DataFrame:
    """
    Reads num_parts_to_read parquet partitions and returns the resulting pd.DataFrame

    :param path_to_dataset: directory with parquet partitions
    :param start_from: partition number to start with
    :param num_parts_to_read: number of partitions to read (negative reads all remaining)
    :param columns: columns to read and include
    :param verbose: if True, print the paths of the chunks being read
    :return: pd.DataFrame
    """
    res = []
    dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                            if filename.startswith('part')])

    start_from = max(0, start_from)
    if num_parts_to_read < 0:
        chunks = dataset_paths[start_from:]
    else:
        chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    if verbose:
        print('Reading chunks:\n')
        for chunk in chunks:
            print(chunk)
    # tqdm.auto picks a notebook or console progress bar as appropriate
    # (tqdm.tqdm_notebook is deprecated)
    for chunk_path in tqdm(chunks, desc="Reading dataset with pandas"):
        chunk = pd.read_parquet(chunk_path, columns=columns)
        # downcast the heavy numeric columns to save memory
        for col_name, col_type in [('amnt', 'float32'), ('hour_diff', 'int32')]:
            if col_name in chunk.columns:
                chunk[col_name] = chunk[col_name].astype(col_type)

        res.append(chunk)
    return pd.concat(res).reset_index(drop=True)
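utils.py is a data-preparation helper rather than something app.py imports. A hypothetical usage sketch; the './transactions/' directory and its part-* parquet files are assumptions, not shipped with the Space:

# Hypothetical usage; './transactions/' stands for a directory containing
# parquet partitions named part-*, which is not included in this Space.
from utils import read_parquet_dataset_from_local

df = read_parquet_dataset_from_local('./transactions/', start_from=0,
                                     num_parts_to_read=2,
                                     columns=['app_id', 'amnt', 'hour_diff', 'days_before'],
                                     verbose=True)
print(df.shape)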
xgb_cpu.joblib
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:98109adf5d911c8839e61f88484823e7e71554c827b1d73485ef769c470bd39a
size 2865173