Spaces:
Running
Running
howardroark
committed on
Commit
•
6f4f21f
1
Parent(s):
beb8613
initial commit
Browse files- .gitignore +163 -0
- README.md +1 -12
- app.py +279 -0
- app_old.py +341 -0
- data_utils/data_generation.py +140 -0
- data_utils/data_simulation.py +22 -0
- data_utils/eda_simulation.py +29 -0
- data_utils/exploratory_data_analysis.py +27 -0
- data_utils/feature_importance.py +18 -0
- data_utils/feature_importance_simulation.py +13 -0
- eval_utils/evaluation.py +57 -0
- eval_utils/evaluation_simulation.py +39 -0
- mlops_utils/wandb_utils.py +47 -0
- models_utils/ml_models.py +55 -0
- models_utils/models_simulation.py +22 -0
- notebooks/Demo_Notebook.ipynb +0 -0
- notebooks/Test.ipynb +152 -0
- requirements.txt +7 -0
.gitignore
ADDED
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
# Distribution / packaging
|
10 |
+
.Python
|
11 |
+
build/
|
12 |
+
develop-eggs/
|
13 |
+
dist/
|
14 |
+
downloads/
|
15 |
+
eggs/
|
16 |
+
.eggs/
|
17 |
+
lib/
|
18 |
+
lib64/
|
19 |
+
parts/
|
20 |
+
sdist/
|
21 |
+
var/
|
22 |
+
wheels/
|
23 |
+
share/python-wheels/
|
24 |
+
*.egg-info/
|
25 |
+
.installed.cfg
|
26 |
+
*.egg
|
27 |
+
MANIFEST
|
28 |
+
|
29 |
+
# PyInstaller
|
30 |
+
# Usually these files are written by a python script from a template
|
31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
32 |
+
*.manifest
|
33 |
+
*.spec
|
34 |
+
|
35 |
+
# Installer logs
|
36 |
+
pip-log.txt
|
37 |
+
pip-delete-this-directory.txt
|
38 |
+
|
39 |
+
# Unit test / coverage reports
|
40 |
+
htmlcov/
|
41 |
+
.tox/
|
42 |
+
.nox/
|
43 |
+
.coverage
|
44 |
+
.coverage.*
|
45 |
+
.cache
|
46 |
+
nosetests.xml
|
47 |
+
coverage.xml
|
48 |
+
*.cover
|
49 |
+
*.py,cover
|
50 |
+
.hypothesis/
|
51 |
+
.pytest_cache/
|
52 |
+
cover/
|
53 |
+
|
54 |
+
# Translations
|
55 |
+
*.mo
|
56 |
+
*.pot
|
57 |
+
|
58 |
+
# Django stuff:
|
59 |
+
*.log
|
60 |
+
local_settings.py
|
61 |
+
db.sqlite3
|
62 |
+
db.sqlite3-journal
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
.pybuilder/
|
76 |
+
target/
|
77 |
+
|
78 |
+
# Jupyter Notebook
|
79 |
+
.ipynb_checkpoints
|
80 |
+
|
81 |
+
# IPython
|
82 |
+
profile_default/
|
83 |
+
ipython_config.py
|
84 |
+
|
85 |
+
# pyenv
|
86 |
+
# For a library or package, you might want to ignore these files since the code is
|
87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
88 |
+
# .python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# poetry
|
98 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
100 |
+
# commonly ignored for libraries.
|
101 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
102 |
+
#poetry.lock
|
103 |
+
|
104 |
+
# pdm
|
105 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
106 |
+
#pdm.lock
|
107 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
108 |
+
# in version control.
|
109 |
+
# https://pdm.fming.dev/#use-with-ide
|
110 |
+
.pdm.toml
|
111 |
+
|
112 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
113 |
+
__pypackages__/
|
114 |
+
|
115 |
+
# Celery stuff
|
116 |
+
celerybeat-schedule
|
117 |
+
celerybeat.pid
|
118 |
+
|
119 |
+
# SageMath parsed files
|
120 |
+
*.sage.py
|
121 |
+
|
122 |
+
# Environments
|
123 |
+
.env
|
124 |
+
.venv
|
125 |
+
env/
|
126 |
+
venv/
|
127 |
+
ENV/
|
128 |
+
env.bak/
|
129 |
+
venv.bak/
|
130 |
+
|
131 |
+
# Spyder project settings
|
132 |
+
.spyderproject
|
133 |
+
.spyproject
|
134 |
+
|
135 |
+
# Rope project settings
|
136 |
+
.ropeproject
|
137 |
+
|
138 |
+
# mkdocs documentation
|
139 |
+
/site
|
140 |
+
|
141 |
+
# mypy
|
142 |
+
.mypy_cache/
|
143 |
+
.dmypy.json
|
144 |
+
dmypy.json
|
145 |
+
|
146 |
+
# Pyre type checker
|
147 |
+
.pyre/
|
148 |
+
|
149 |
+
# pytype static type analyzer
|
150 |
+
.pytype/
|
151 |
+
|
152 |
+
# Cython debug symbols
|
153 |
+
cython_debug/
|
154 |
+
|
155 |
+
# PyCharm
|
156 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
157 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
158 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
159 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
160 |
+
#.idea/
|
161 |
+
|
162 |
+
data/
|
163 |
+
wandb/
|
README.md
CHANGED
@@ -1,12 +1 @@
|
|
1 |
-
|
2 |
-
title: Uplift Modeling
|
3 |
-
emoji: 😻
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: green
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.32.2
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
+
# uplift_modeling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from data_utils.data_generation import UpliftSimulation
|
2 |
+
from data_utils.exploratory_data_analysis import ExploratoryAnalysis
|
3 |
+
from data_utils.feature_importance import FeatureImportance
|
4 |
+
from models_utils.ml_models import ModelTraining
|
5 |
+
from eval_utils.evaluation import ModelEvaluator
|
6 |
+
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
import streamlit as st
|
11 |
+
|
12 |
+
# Feature column names shared by the sections of this script: the list is
# handed to FeatureImportance, ModelTraining and ModelEvaluator below.
X_names = [
    'AgeIndex',
    'IncomeIndex',
    'PurchaseFrequencyIndex',
    'AccountLifetimeIndex',
    'AverageTransactionValueIndex',
    'PreferredPaymentMethodIndex',
    'RegionIndex',
    'EmailDiscountCTRIndex',
    'WebDiscountCTRIndex',
    'SocialMediaEngagementIndex',
    'DirectMailDiscountResponseIndex',
    'InAppDiscountEngagementIndex',
    'FlashSaleParticipationIndex',
    'SeasonalPromoInterestIndex',
    'LoyaltyProgramEngagementIndex',
    'ReferralBonusUsageIndex',
    'DiscountCodeRedemptionIndex',
    'VIPSaleAccessIndex',
    'EarlyAccessOptInIndex',
    'ProductReviewAfterDiscountIndex',
    'UpsellConversionIndex',
    'CrossSellInterestIndex',
    'BundlePurchaseIndex',
    'SubscriptionUpgradeIndex',
    'CustomerFeedbackIndex',
    'BrowserTypeIndex',
    'DeviceCategoryIndex',
    'OperatingSystemIndex',
    'SessionStartTimeIndex',
    'LanguagePreferenceIndex',
    'NewsletterSubscriptionIndex',
    'AccountVerificationStatusIndex',
    'AdBlockerPresenceIndex',
]
|
25 |
+
|
26 |
+
# Page title rendered at the top of the app.
st.title("Uplift Modeling in Retail Demo")

# Sidebar radio used as the page selector; the chosen label is compared
# against in the `if tabs == ...` sections that follow.
tabs = st.sidebar.radio(
    "Navigation",
    [
        "Data generation",
        "Exploratory analysis",
        "Model training",
        "Economic effects",
    ],
)
|
30 |
+
|
31 |
+
if tabs == "Data generation":
    st.header("Data Generation")

    # Short explanation of what the simulated dataset contains.
    st.write("""
This app creates a simulated dataset for a special kind of analysis called uplift modeling, which helps understand the effect of different actions (like promotions) on customer behavior. We use some default settings to make things easy:
- We're looking at whether customers make a purchase or not.
- We compare different types of promotions (like no discount, 5% off, etc.).
- The dataset includes 15 different pieces of information (features) about each customer.
""")

    # The sample count is the only setting exposed to the user.
    n = st.number_input(
        'Number of Samples (n)',
        min_value=1000,
        value=10000,
        step=1000,
        help="Total number of samples to generate in the dataset.",
    )

    # Every other simulation setting is fixed.
    y_name = 'conversion'
    treatment_group_keys = ['control', 'discount_05', 'discount_10', 'discount_15']
    n_classification_features = 15
    n_classification_informative = 7
    n_classification_repeated = 0
    n_uplift_increase_dict = {'discount_05': 4, 'discount_10': 3, 'discount_15': 3}
    n_uplift_decrease_dict = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
    positive_class_proportion = 0.05
    random_seed = 8097

    if st.button('Generate Dataset'):
        uplift_sim = UpliftSimulation(
            n=n,
            y_name=y_name,
            treatment_group_keys=treatment_group_keys,
            n_classification_features=n_classification_features,
            n_classification_informative=n_classification_informative,
            n_classification_repeated=n_classification_repeated,
            n_uplift_increase_dict=n_uplift_increase_dict,
            n_uplift_decrease_dict=n_uplift_decrease_dict,
            positive_class_proportion=positive_class_proportion,
            random_seed=random_seed,
        )

        # Run the simulation steps in the order the original script used.
        uplift_sim.simulate_dataset()
        uplift_sim.apply_discounts_and_clean()
        uplift_sim.postprocess_tables()
        uplift_sim.add_monetary_effect()

        # Keep the simulation in session state so the other sections can
        # pick it up on later reruns.
        st.session_state.uplift_sim = uplift_sim

        st.write("Dataset Generated Successfully!")

        # (heading, caption, position in uplift_sim.dataframes) for each preview.
        previews = (
            (
                "User profiles",
                'Features that represent a customer such as age, income, purchase frequency, etc',
                0,
            ),
            (
                "Treatments data",
                'Information about the different treatments (discounts) that were applied to the customers as discounts in different channels (web, email, mobile), early access, etc',
                1,
            ),
            (
                "Other data",
                'Other data that can be used in the analysis',
                2,
            ),
        )
        for heading, caption, position in previews:
            st.subheader(heading)
            st.write(caption)
            st.dataframe(uplift_sim.dataframes[position].head(3))
|
87 |
+
|
88 |
+
if tabs == "Exploratory analysis":
    st.header("Exploratory Analysis")

    # This section needs the simulation stored by the "Data generation" page.
    if 'uplift_sim' in st.session_state:
        st.subheader('Summary statistics')
        uplift_sim = st.session_state.uplift_sim
        eda = ExploratoryAnalysis(uplift_sim.df)

        st.write('We begin by computing the total sum of conversions, sales (discounted price) and platform benefit. We can see that the total conversions and the total sales grows as the discount value is bigger. However, the platform benefit decreases.')

        sum_conversions, mean_conversions = eda.compute_summaries()
        st.write(sum_conversions)
        st.write(mean_conversions)

        st.write('We can also visualize the tradeoff between conversions and platform benefit by plotting the mean benefit per user on the y-axis and the mean conversion rate on the x-axis, for each treatment group.')
        mean_benefit_vs_conversion = eda.compute_mean_benefit_vs_conversion()

        fig, ax = plt.subplots()
        mean_benefit_vs_conversion.plot.scatter(x='conversion', y='benefit', c='DarkBlue', s=50, ax=ax)
        st.pyplot(fig)

        st.write('''
We further compute the Average Treatment Effect (ATE) for both the mean conversion rate and the mean benefit per user:
- Conversion ATE = Mean Conversion rate in the discounted group minus Mean Conversion rate in the control group
- Benefit ATE = Mean Benefit per user in the discounted group minus Mean Benefit per user in the control group
This helps illustrate how the discount value affects Conversion ATE and Benefit ATE.
''')
        mean_conversions_ate = eda.compute_ate()

        fig, ax = plt.subplots()
        mean_conversions_ate.plot.scatter(x='conversion', y='benefit', c='DarkBlue', s=50, ax=ax)
        st.pyplot(fig)

        st.subheader('Feature importance')

        # Let the user pick which treatment group to score against.
        treatment_group = st.selectbox(
            'Select a treatment group',
            options=['discount_05', 'discount_10', 'discount_15'],
            index=0,  # default to 'discount_05'
        )

        feature_importance = FeatureImportance(
            uplift_sim.df, X_names, y_name='conversion', treatment_group=treatment_group
        )
        fi = feature_importance.compute_feature_importance()

        # Horizontal bar chart of the scores, sorted from highest to lowest.
        fig, ax = plt.subplots()
        di_df_sorted = fi.sort_values(by='score', ascending=False)
        di_df_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
        st.pyplot(fig)

        # The last bullet previously named "ReferralBonusUsedIndex", which is
        # not a feature in X_names; the real column is ReferralBonusUsageIndex.
        st.write("""
- AccountLifetimeIndex: Longer-standing accounts are key predictors of customer response to promotions \n
- CustomerFeedbackIndex: Customer feedback significantly influences the success of marketing strategies \n
- UpsellConversionIndex: The success rate of upselling is an important factor \n
- PurchaseFrequencyIndex: More frequent purchases indicate higher engagement and response to marketing efforts \n
- ReferralBonusUsageIndex and LoyaltyProgramEngagementIndex: Engagement with these programs is highly indicative of responsiveness to promotions
""")

    else:
        st.error("Please generate the dataset first.")
|
149 |
+
|
150 |
+
if tabs == "Model training":
    st.header("Model Training")

    # Training needs the simulation stored by the "Data generation" page.
    if 'uplift_sim' in st.session_state:
        uplift_sim = st.session_state.uplift_sim

        model_trainer = ModelTraining(uplift_sim.df, 'conversion', X_names)

        model_type = st.radio("Choose the model type", ('Conversion Model', 'Benefit Model'))

        # Hyper-parameters forwarded unchanged to the trainer's fit methods.
        params = {
            'n_estimators': st.slider('Number of Estimators', 10, 100, 50),
            'max_depth': st.slider('Max Depth', 1, 10, 4),
            'colsample_bytree': st.slider('Colsample by Tree', 0.1, 1.0, 0.2),
            'subsample': st.slider('Subsample', 0.1, 1.0, 0.2),
        }
        control_name = 'control'
        test_size = st.slider('Test Size', 0.1, 0.9, 0.5)
        random_state = 20143

        if st.button('Train Model'):
            model_trainer.split_data(test_size=test_size, random_state=random_state)

            if model_type == 'Conversion Model':
                y_name = 'conversion'
                model_trainer.y_name = y_name
                tau = model_trainer.fit_predict_classifier(params, control_name)
            elif model_type == 'Benefit Model':
                # Fix: this branch used to test for 'BATE Model', a label the
                # radio above never produces, so the regressor was never fitted
                # when the user picked 'Benefit Model'.
                y_name = 'benefit'
                model_trainer.y_name = y_name
                tau = model_trainer.fit_predict_regressor(params, control_name)

            # Persist the trained object for the "Economic effects" page.
            st.session_state.model_trainer = model_trainer

            feature_importances = model_trainer.compute_feature_importance()

            st.subheader('Feature Importances')

            for k, v in feature_importances.items():
                st.write(f"Feature importance for {k}")
                # Fix: one figure per entry. Previously every entry was drawn
                # onto a single shared Axes, stacking the bars of all entries.
                fig, ax = plt.subplots()
                v.plot(kind='barh', ax=ax)
                ax.set_xlabel("Importance")
                ax.set_ylabel("Feature")
                ax.set_title(f"Feature Importance for {model_type}")
                st.pyplot(fig)

    else:
        st.error("Please generate and preprocess the dataset first.")
|
202 |
+
|
203 |
+
if tabs == "Economic effects":
    st.header("Economic Effects Analysis")

    # Needs both the simulation and a trainer stored by the earlier pages.
    if 'uplift_sim' in st.session_state and 'model_trainer' in st.session_state:
        df_test = st.session_state.model_trainer.df_test
        model_type = st.radio("Choose the model type for analysis", ('Conversion Model', 'Benefit Model'))

        # Pick the learner that matches the user's selection.
        if model_type == 'Conversion Model':
            model = st.session_state.model_trainer.conversion_learner_t
        elif model_type == 'Benefit Model':
            model = st.session_state.model_trainer.benefit_learner_t
        else:
            st.error("Invalid model type selected.")
            st.stop()

        # Identity check instead of `== None`, so a learner that overloads
        # `__eq__` cannot confuse the test.
        if model is None:
            st.error("Please train the model first.")
            st.stop()

        evaluator = ModelEvaluator(model, df_test, X_names)
        discounts = ['discount_05', 'discount_10', 'discount_15']
        colors = ['b', 'g', 'y']
        qini_conversions = {}
        qini_benefits = {}

        for discount in discounts:
            qini_conv, qini_ben = evaluator.eval_performance(discount)
            qini_conversions[discount] = qini_conv
            qini_benefits[discount] = qini_ben

        # The conversion and benefit charts share the same layout, so both are
        # drawn by one loop.
        for metric, curves in (('Conversion', qini_conversions), ('Benefit', qini_benefits)):
            chart_title = f'CATE {metric} vs Targeted Population'
            st.subheader(chart_title)
            fig, ax_curves = plt.subplots()
            for discount, color in zip(discounts, colors):
                # Fix: label each line where it is drawn. The old code passed
                # [models..., randoms...] to legend() while the lines were
                # added as model/random pairs, so labels were attached to the
                # wrong curves.
                curves[discount].plot(ax=ax_curves, x='index', y='S', color=color,
                                      label=f'{discount} model')
                curves[discount].plot(ax=ax_curves, x='index', y='Random', color='r', ls='--',
                                      label=f'{discount} random')

            ax_curves.legend(prop={'size': 10})
            ax_curves.set_xlabel('Fraction of Targeted Users')
            ax_curves.set_ylabel(f'CATE {metric}')
            ax_curves.set_title(chart_title)
            st.pyplot(fig)

        # Plotting CATE Benefit vs CATE Conversion
        st.subheader("CATE Benefit vs CATE Conversion")
        fig, ax_comp = plt.subplots()
        for discount, color in zip(discounts, colors):
            # Put the two 'S' columns side by side so one can be plotted
            # against the other.
            qini_conc_test = pd.concat(
                [qini_conversions[discount][['S']], qini_benefits[discount][['S']]],
                axis=1,
            )
            qini_conc_test.columns = ['cate_conversion', 'cate_benefit']
            qini_conc_test.plot(ax=ax_comp, x='cate_conversion', y='cate_benefit',
                                color=color, label=f'{discount} model')

        ax_comp.legend(prop={'size': 10})
        ax_comp.set_xlabel('CATE Conversion')
        ax_comp.set_ylabel('CATE Benefit')
        ax_comp.set_title('CATE Benefit vs CATE Conversion')
        st.pyplot(fig)

    else:
        st.error("Please ensure the model is trained and the dataset is prepared.")
|
app_old.py
ADDED
@@ -0,0 +1,341 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
import streamlit as st
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
|
7 |
+
from data_utils.data_simulation import UpliftSimulationReady
|
8 |
+
from data_utils.eda_simulation import EDASimulationReady
|
9 |
+
from data_utils.feature_importance_simulation import FISimulationReady
|
10 |
+
from models_utils.models_simulation import CATESimulationReady
|
11 |
+
from eval_utils.evaluation_simulation import CATEConversionEvaluateSimulationReady, CATEBenefitEvaluateSimulationReady
|
12 |
+
|
13 |
+
from mlops_utils.wandb_utils import upload_dataset_to_wandb, eda_work_with_dataset_to_wandb, training_results_to_wandb
|
14 |
+
|
15 |
+
# Page title rendered at the top of the app.
st.title('Causal Uplift Modeling')

# Sidebar radio used as the page selector for the sections below.
tabs = st.sidebar.radio(
    "Navigation",
    ["Data", "EDA", "Modeling", "Effect"],
)
|
17 |
+
|
18 |
+
if tabs == "Data":
    # Load the four raw CSV files through the simulation helper.
    uplift_simulation = UpliftSimulationReady('./data/raw_data_client/')
    user_profiles = uplift_simulation.load_user_profiles('user_profiles.csv')
    uplift_data = uplift_simulation.load_uplift_data('uplift_data.csv')
    irrelevant_data = uplift_simulation.load_irrelevant_data('irrelevant_data.csv')
    transaction_data = uplift_simulation.load_other_data('transaction_data.csv')

    st.subheader('Loading data')

    # Show a caption followed by the first five rows of each table.
    tables = (
        ('User profiles', user_profiles),
        ('Uplift data', uplift_data),
        ('Other data', irrelevant_data),
        ('Transaction data', transaction_data),
    )
    for caption, table in tables:
        st.write(caption)
        st.write(table.head(5))

    if st.button('Upload data to wandb'):
        upload_dataset_to_wandb(['./data/raw_data_client'], 'nl_cate_modeling', 'uplift_data')
        st.write('Data uploaded to wandb')

    # TODO: add to WANDB data processing step in the beginning
    # TODO: the tree of updates
    # TODO: choose the version from MLOps here exactly
|
49 |
+
|
50 |
+
if tabs == "EDA":
    # Load the precomputed sum and mean summary tables.
    eda_simulation = EDASimulationReady('./data/processed_data/')
    sum_conversions, mean_conversions = eda_simulation.load_conversions('uplift_classification_processed.csv')

    st.subheader('Exploratory Data Analysis')

    st.write('We can begin by computing the total sum of conversions, sales (discounted price) and platform benefit. We can see that the total conversions and the total sales grows as the discount value is bigger. However the platform benefit decreases.')
    st.write(sum_conversions)

    st.write('We can repeat the analysis but using the mean instead of the sum. This will give us the mean conversion rate, the mean sales per user and the mean platform benefit per user.')
    st.write(mean_conversions)

    st.write('To illustrate the tradeoff between conversions and platform benefit we can plot the mean benefit per user in the y-axis and the mean conversion rate in the x-axis, per treatment group.')

    # Keep only the 'mean' columns and drop the outer column level so the
    # remaining columns can be addressed by their plain names.
    df_pivot_mean = mean_conversions[['mean']]
    df_pivot_mean.columns = df_pivot_mean.columns.droplevel()

    fig, ax = plt.subplots()
    df_pivot_mean.plot.scatter(x='conversion', y='benefit', c='DarkBlue', s=50, ax=ax)
    st.pyplot(fig)

    # Fix: "Converstion" typo corrected in the text shown to the user.
    st.write('''
We can also compute the Average Treatment Effect (ATE) for both the mean conversion rate and the mean benefit per user:
Conversion ATE = Mean Conversion rate in discounted group minus Mean Conversion rate in control group
Benefit ATE = Mean Benefit per user in discounted group minus Mean Benefit per user in control group
We can see in the plot below that the bigger the discount value the stronger the Conversion ATE (x-axis), but at the same time the more negative the Benefit ATE (y-axis).
''')

    # Subtract the control row from every treatment row.
    df_pivot_mean_ate = df_pivot_mean - df_pivot_mean.loc['control'].values.squeeze()
    # Fix: derive the "_ate" names from the existing column names instead of
    # assigning a hard-coded list by position, which would silently mislabel
    # the columns if their order ever differed.
    # NOTE(review): assumes the third column is named 'discounted_price', as
    # the old hard-coded list implied; confirm against the CSV.
    df_pivot_mean_ate = df_pivot_mean_ate.add_suffix('_ate')

    fig, ax = plt.subplots()
    df_pivot_mean_ate.plot.scatter(x='conversion_ate', y='benefit_ate', c='DarkBlue', s=50, ax=ax)
    st.pyplot(fig)

    st.subheader('Feature Importance')

    fi = FISimulationReady('./data/eda_data/')
    di_df = fi.load_feature_importance('kl_feature_importance.csv')

    st.write('Feature importance')
    fig, ax = plt.subplots()
    di_df_sorted = di_df.sort_values(by='score', ascending=False)
    di_df_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
    st.pyplot(fig)

    if st.button('Upload EDA to wandb'):
        eda_work_with_dataset_to_wandb(
            dirs=['./data/eda_data/'],
            project_name='nl_cate_modeling',
            dataset_name='uplift_data:latest',
            dataset_type='raw_dataset',
            artifact_type='eda',
        )
        st.write('EDA uploaded to wandb')

    # TODO: add report to WANDB
    # TODO: add artifacts to WANDB
|
116 |
+
|
117 |
+
if tabs == "Modeling":

    def _show_simulated_cate_model(fi_suffix, upload_label, wandb_model_name):
        """Fake a training run, then chart the stored feature importances.

        Args:
            fi_suffix: Text inserted before ``.csv`` in the per-discount
                feature-importance file names ('' or '_bate').
            upload_label: Label of the button that triggers the wandb upload.
            wandb_model_name: Last argument passed to
                ``training_results_to_wandb``.
        """
        # No model is actually trained here: a short spinner simulates it.
        with st.spinner('Training model...'):
            time.sleep(2)

        st.subheader('Feature importance by discount group')

        model = CATESimulationReady('./data/models_data/model.pkl', './data/models_data/y_pred.pkl')
        model.predict()

        # Load all three tables first, then render them in order.
        importances = [
            (
                pct,
                model.feature_importance(
                    f'./data/models_data/discount_{pct:02d}_feature_importance{fi_suffix}.csv'
                ),
            )
            for pct in (5, 10, 15)
        ]
        for pct, scores in importances:
            # Fix: '\%' is an invalid escape sequence (SyntaxWarning on
            # Python 3.12+). '\\%' produces the same backslash-percent text.
            st.write(f'{pct}\\% discount group')
            fig, ax = plt.subplots()
            ranked = scores.sort_values(by='score', ascending=False)
            ranked[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
            st.pyplot(fig)

        # NOTE(review): indentation is not visible in the source; this button
        # is assumed to sit inside the "Train" button branch. If so, clicking
        # it reruns the script with the outer button unset and the upload
        # never runs. Confirm and move it out if that is the case.
        if st.button(upload_label):
            training_results_to_wandb(
                ['./data/models_data'],
                'nl_cate_modeling',
                'uplift_data:latest',
                'raw_dataset',
                'model_artifacts',
                wandb_model_name,
            )
            st.write('Models uploaded to wandb')

    st.subheader('Causal ML modeling')

    st.write('We can begin by modeling the Conditional Average Treatment Effect')
    if st.button('Train & run CATE conversion model'):
        # Fix: button label typo "convesion" corrected.
        _show_simulated_cate_model(
            fi_suffix='',
            upload_label='Upload conversion model to wandb',
            wandb_model_name='causal_model_conversion',
        )

    st.write('Similarly we can now train a T-Learner on the benefit label, and use the model predictions to evaluate the performance on the CATE conversion and CATE benefit.')
    if st.button('Train & run CATE benefit model'):
        _show_simulated_cate_model(
            fi_suffix='_bate',
            upload_label='Upload benefit model to wandb',
            wandb_model_name='causal_model_benefit',
        )
|
204 |
+
|
205 |
+
if tabs == "Effect":
|
206 |
+
|
207 |
+
st.subheader('Causal ML evaluation')
|
208 |
+
st.write('We can evaluate our models by looking at the Qini curves. We can use the CATE conversion model to evaluate the performance on both the Conversion and the Benefit as a function of the fraction of users targeted.')
|
209 |
+
|
210 |
+
# two columns
|
211 |
+
col1, col2 = st.columns(2)
|
212 |
+
|
213 |
+
with col1:
|
214 |
+
|
215 |
+
st.write('CATE conversion model')
|
216 |
+
|
217 |
+
eval = CATEConversionEvaluateSimulationReady('./data/effect_data/')
|
218 |
+
qini_05_conversion_test, qini_05_benefit_test = eval.evaluate(5)
|
219 |
+
qini_10_conversion_test, qini_10_benefit_test = eval.evaluate(10)
|
220 |
+
qini_15_conversion_test, qini_15_benefit_test = eval.evaluate(15)
|
221 |
+
|
222 |
+
# Plot CATE conversion vs Targeted Population
|
223 |
+
fig_conversion, ax_conversion = plt.subplots()
|
224 |
+
qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
|
225 |
+
qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
|
226 |
+
qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
|
227 |
+
qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='b', label = '5% model')
|
228 |
+
qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='g', label = '10% model')
|
229 |
+
qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='y', label = '15% model')
|
230 |
+
ax_conversion.legend()
|
231 |
+
ax_conversion.set_xlabel('Fraction of Targeted Users')
|
232 |
+
ax_conversion.set_ylabel('CATE conversion')
|
233 |
+
ax_conversion.set_title('CATE conversion vs Targeted Population')
|
234 |
+
st.pyplot(fig_conversion)
|
235 |
+
|
236 |
+
# Plot CATE benefit vs Targeted Population
|
237 |
+
fig_benefit, ax_benefit = plt.subplots()
|
238 |
+
qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
|
239 |
+
qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
|
240 |
+
qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
|
241 |
+
qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='b', label = '5% model')
|
242 |
+
qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='g', label = '10% model')
|
243 |
+
qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='y', label = '15% model')
|
244 |
+
ax_benefit.legend()
|
245 |
+
ax_benefit.set_xlabel('Fraction of Targeted Users')
|
246 |
+
ax_benefit.set_ylabel('CATE Benefit')
|
247 |
+
ax_benefit.set_title('CATE benefit vs Targeted Population')
|
248 |
+
st.pyplot(fig_benefit)
|
249 |
+
|
250 |
+
qini_05_conc_test = pd.concat([qini_05_conversion_test[['S']], qini_05_benefit_test[['S']]], axis=1)
|
251 |
+
qini_05_conc_test.columns = ['cate_conversion', 'cate_benefit']
|
252 |
+
qini_10_conc_test = pd.concat([qini_10_conversion_test[['S']], qini_10_benefit_test[['S']]], axis=1)
|
253 |
+
qini_10_conc_test.columns = ['cate_conversion', 'cate_benefit']
|
254 |
+
qini_15_conc_test = pd.concat([qini_15_conversion_test[['S']], qini_15_benefit_test[['S']]], axis=1)
|
255 |
+
qini_15_conc_test.columns = ['cate_conversion', 'cate_benefit']
|
256 |
+
|
257 |
+
fig_conversion, ax_conversion = plt.subplots()
|
258 |
+
qini_05_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='b')
|
259 |
+
qini_10_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='g')
|
260 |
+
qini_15_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='y')
|
261 |
+
ax_conversion.legend(['5% model', '10% model','15% model'], prop={'size': 10})
|
262 |
+
ax_conversion.set_xlabel('CATE Conversion')
|
263 |
+
ax_conversion.set_ylabel('CATE Benefit')
|
264 |
+
ax_conversion.set_title('CATE benefit vs CATE conversion')
|
265 |
+
st.pyplot(fig_conversion)
|
266 |
+
|
267 |
+
if st.button('Upload conversion effects to wandb'):
|
268 |
+
training_results_to_wandb(['./data/effect_data'],
|
269 |
+
'nl_cate_modeling',
|
270 |
+
'causal_model_conversion:latest',
|
271 |
+
'model_artifacts',
|
272 |
+
'effects_artifacts',
|
273 |
+
'convesion_model_evaluation',
|
274 |
+
job_type='evaluation')
|
275 |
+
st.write('Evaluation uploaded to wandb')
|
276 |
+
|
277 |
+
with col2:
|
278 |
+
st.write('CATE benefit model')
|
279 |
+
|
280 |
+
eval = CATEBenefitEvaluateSimulationReady('./data/effect_data/')
|
281 |
+
qini_05_conversion_test, qini_05_benefit_test = eval.evaluate(5)
|
282 |
+
qini_10_conversion_test, qini_10_benefit_test = eval.evaluate(10)
|
283 |
+
qini_15_conversion_test, qini_15_benefit_test = eval.evaluate(15)
|
284 |
+
|
285 |
+
# Plot CATE conversion vs Targeted Population
|
286 |
+
fig_conversion, ax_conversion = plt.subplots()
|
287 |
+
qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
|
288 |
+
qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
|
289 |
+
qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
|
290 |
+
qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='b', label = '5% model')
|
291 |
+
qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='g', label = '10% model')
|
292 |
+
qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='y', label = '15% model')
|
293 |
+
ax_conversion.legend()
|
294 |
+
ax_conversion.set_xlabel('Fraction of Targeted Users')
|
295 |
+
ax_conversion.set_ylabel('CATE conversion')
|
296 |
+
ax_conversion.set_title('CATE conversion vs Targeted Population')
|
297 |
+
st.pyplot(fig_conversion)
|
298 |
+
|
299 |
+
# Plot CATE benefit vs Targeted Population
|
300 |
+
fig_benefit, ax_benefit = plt.subplots()
|
301 |
+
qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
|
302 |
+
qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
|
303 |
+
qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
|
304 |
+
qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='b', label = '5% model')
|
305 |
+
qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='g', label = '10% model')
|
306 |
+
qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='y', label = '15% model')
|
307 |
+
ax_benefit.legend()
|
308 |
+
ax_benefit.set_xlabel('Fraction of Targeted Users')
|
309 |
+
ax_benefit.set_ylabel('CATE Benefit')
|
310 |
+
ax_benefit.set_title('CATE benefit vs Targeted Population')
|
311 |
+
st.pyplot(fig_benefit)
|
312 |
+
|
313 |
+
qini_05_conc_test = pd.concat([qini_05_conversion_test[['S']], qini_05_benefit_test[['S']]], axis=1)
|
314 |
+
qini_05_conc_test.columns = ['cate_conversion', 'cate_benefit']
|
315 |
+
qini_10_conc_test = pd.concat([qini_10_conversion_test[['S']], qini_10_benefit_test[['S']]], axis=1)
|
316 |
+
qini_10_conc_test.columns = ['cate_conversion', 'cate_benefit']
|
317 |
+
qini_15_conc_test = pd.concat([qini_15_conversion_test[['S']], qini_15_benefit_test[['S']]], axis=1)
|
318 |
+
qini_15_conc_test.columns = ['cate_conversion', 'cate_benefit']
|
319 |
+
|
320 |
+
fig_conversion, ax_conversion = plt.subplots()
|
321 |
+
qini_05_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='b')
|
322 |
+
qini_10_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='g')
|
323 |
+
qini_15_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='y')
|
324 |
+
ax_conversion.legend(['5% model', '10% model','15% model'], prop={'size': 10})
|
325 |
+
ax_conversion.set_xlabel('CATE Conversion')
|
326 |
+
ax_conversion.set_ylabel('CATE Benefit')
|
327 |
+
ax_conversion.set_title('CATE benefit vs CATE conversion')
|
328 |
+
st.pyplot(fig_conversion)
|
329 |
+
|
330 |
+
if st.button('Upload benefit effects to wandb'):
|
331 |
+
training_results_to_wandb(['./data/effect_data'],
|
332 |
+
'nl_cate_modeling',
|
333 |
+
'causal_model_benefit:latest',
|
334 |
+
'model_artifacts',
|
335 |
+
'effects_artifacts',
|
336 |
+
'benefit_model_evaluation',
|
337 |
+
job_type='evaluation')
|
338 |
+
st.write('Evaluation uploaded to wandb')
|
339 |
+
|
340 |
+
st.write('To simplify the comparison, we can plot the CATE Benefit as a function of the CATE conversion.')
|
341 |
+
st.write('In the last plot for example we can see that there is a region where offering 15% discount to a targeted group of users is more efficient than giving 10% to everyone. We can obtain the same impact in overall conversion uplift while reducing our benefit loss considerably.')
|
data_utils/data_generation.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from functools import reduce
|
3 |
+
from random import randint
|
4 |
+
|
5 |
+
from causalml.dataset import make_uplift_classification
|
6 |
+
|
7 |
+
class UpliftSimulation:
    """Generate and post-process a synthetic multi-treatment uplift dataset.

    The constructor only stores configuration. ``simulate_dataset`` produces
    the data through ``make_uplift_classification``; the remaining methods
    each read ``self.df`` and rewrite it with additional or renamed columns.
    """

    def __init__(self, n=50000, y_name='conversion',
                 treatment_group_keys=None,
                 n_classification_features=15, n_classification_informative=7,
                 n_classification_repeated=0,
                 n_uplift_increase_dict=None,
                 n_uplift_decrease_dict=None,
                 delta_uplift_increase_dict=None,
                 delta_uplift_decrease_dict=None,
                 n_uplift_increase_mix_informative_dict=None,
                 n_uplift_decrease_mix_informative_dict=None,
                 positive_class_proportion=0.05, random_seed=8097):
        """Store the simulation settings.

        Every list/dict argument defaults to ``None`` and is replaced here by
        a freshly built default, so instances never share one another's
        default containers. Containers passed explicitly are stored as given,
        without copying.
        """
        if treatment_group_keys is None:
            treatment_group_keys = ['control', 'discount_05', 'discount_10', 'discount_15']
        if n_uplift_increase_dict is None:
            n_uplift_increase_dict = {'discount_05': 4, 'discount_10': 3, 'discount_15': 3}
        if n_uplift_decrease_dict is None:
            n_uplift_decrease_dict = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
        if delta_uplift_increase_dict is None:
            delta_uplift_increase_dict = {'discount_05': 0.0020, 'discount_10': 0.0045, 'discount_15': 0.008}
        if delta_uplift_decrease_dict is None:
            delta_uplift_decrease_dict = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
        if n_uplift_increase_mix_informative_dict is None:
            n_uplift_increase_mix_informative_dict = {'discount_05': 3, 'discount_10': 2, 'discount_15': 3}
        if n_uplift_decrease_mix_informative_dict is None:
            n_uplift_decrease_mix_informative_dict = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}

        self.n = n
        self.y_name = y_name
        self.treatment_group_keys = treatment_group_keys
        self.n_classification_features = n_classification_features
        self.n_classification_informative = n_classification_informative
        self.n_classification_repeated = n_classification_repeated
        self.n_uplift_increase_dict = n_uplift_increase_dict
        self.n_uplift_decrease_dict = n_uplift_decrease_dict
        self.delta_uplift_increase_dict = delta_uplift_increase_dict
        self.delta_uplift_decrease_dict = delta_uplift_decrease_dict
        self.n_uplift_increase_mix_informative_dict = n_uplift_increase_mix_informative_dict
        self.n_uplift_decrease_mix_informative_dict = n_uplift_decrease_mix_informative_dict
        self.positive_class_proportion = positive_class_proportion
        self.random_seed = random_seed
        # Both are populated by simulate_dataset().
        self.df = None
        self.X_names = None

    def simulate_dataset(self):
        """Run ``make_uplift_classification`` with the stored settings.

        The returned frame and feature-name list are kept on ``self.df`` and
        ``self.X_names``.
        """
        self.df, self.X_names = make_uplift_classification(
            treatment_name=self.treatment_group_keys,
            y_name=self.y_name,
            n_samples=self.n,
            n_classification_features=self.n_classification_features,
            n_classification_informative=self.n_classification_informative,
            n_classification_repeated=self.n_classification_repeated,
            n_uplift_increase_dict=self.n_uplift_increase_dict,
            n_uplift_decrease_dict=self.n_uplift_decrease_dict,
            delta_uplift_increase_dict=self.delta_uplift_increase_dict,
            delta_uplift_decrease_dict=self.delta_uplift_decrease_dict,
            n_uplift_increase_mix_informative_dict=self.n_uplift_increase_mix_informative_dict,
            n_uplift_decrease_mix_informative_dict=self.n_uplift_decrease_mix_informative_dict,
            positive_class_proportion=self.positive_class_proportion,
            random_seed=self.random_seed,
        )

    def apply_discounts_and_clean(self):
        """Add a numeric ``discount`` column and drop ``treatment_effect``.

        ``discount`` starts as a copy of ``treatment_group_key`` and the four
        known group labels are then replaced by their discount fraction.
        """
        discounts_dict = {'control': 0, 'discount_05': 0.05, 'discount_10': 0.10, 'discount_15': 0.15}
        self.df['discount'] = self.df['treatment_group_key']
        self.df = self.df.replace({"discount": discounts_dict})
        self.df = self.df.drop(columns=['treatment_effect'])

    def postprocess_tables(self):
        """Split ``self.df`` into themed tables, rename columns, and re-merge.

        The four tables (user profiles, uplift data, irrelevant data and
        transaction data) are kept on ``self.dataframes`` and their inner
        join on ``UserID`` replaces ``self.df``.

        NOTE(review): the fixed-length name lists and the hard-coded
        ``*_increase_mix`` column names below presumably match the default
        constructor settings only; other settings would need these lists
        updated - confirm before changing the configuration.
        """
        # Add a synthetic UserID for each entry.
        self.df['UserID'] = range(len(self.df))

        # Group the generated columns by the tag embedded in their names.
        informative_cols = [col for col in self.df.columns if 'informative' in col]
        uplift_cols = [col for col in self.df.columns if 'uplift' in col]
        irrelevant_cols = [col for col in self.df.columns if 'irrelevant' in col]
        transaction_cols = ['treatment_group_key', 'conversion', 'discount']

        # The mixed features do not carry the 'uplift' tag, so they are listed
        # explicitly and appended to the uplift table.
        mixed_uplift_columns = ['x31_increase_mix', 'x22_increase_mix', 'x20_increase_mix',
                                'x33_increase_mix', 'x32_increase_mix', 'x27_increase_mix',
                                'x21_increase_mix', 'x26_increase_mix']

        user_profiles = self.df[['UserID'] + informative_cols].copy()
        uplift_data = pd.concat(
            [self.df[['UserID'] + uplift_cols].copy(), self.df[mixed_uplift_columns]],
            axis=1,
        )
        irrelevant_data = self.df[['UserID'] + irrelevant_cols].copy()
        # The transaction table keeps its original column names.
        transaction_data = self.df[['UserID'] + transaction_cols].copy()

        user_profiles.columns = [
            'UserID', 'AgeIndex', 'IncomeIndex', 'PurchaseFrequencyIndex',
            'AccountLifetimeIndex', 'AverageTransactionValueIndex', 'PreferredPaymentMethodIndex', 'RegionIndex'
        ]

        uplift_data.columns = [
            'UserID', 'EmailDiscountCTRIndex', 'WebDiscountCTRIndex', 'SocialMediaEngagementIndex',
            'DirectMailDiscountResponseIndex', 'InAppDiscountEngagementIndex', 'FlashSaleParticipationIndex',
            'SeasonalPromoInterestIndex', 'LoyaltyProgramEngagementIndex', 'ReferralBonusUsageIndex',
            'DiscountCodeRedemptionIndex', 'VIPSaleAccessIndex', 'EarlyAccessOptInIndex',
            'ProductReviewAfterDiscountIndex', 'UpsellConversionIndex', 'CrossSellInterestIndex',
            'BundlePurchaseIndex', 'SubscriptionUpgradeIndex', 'CustomerFeedbackIndex'
        ]

        irrelevant_data.columns = [
            'UserID', 'BrowserTypeIndex', 'DeviceCategoryIndex', 'OperatingSystemIndex',
            'SessionStartTimeIndex', 'LanguagePreferenceIndex', 'NewsletterSubscriptionIndex',
            'AccountVerificationStatusIndex', 'AdBlockerPresenceIndex'
        ]

        # Keep the individual tables and rebuild the wide frame by joining
        # them pairwise on 'UserID'.
        self.dataframes = [user_profiles, uplift_data, irrelevant_data, transaction_data]
        self.df = reduce(lambda left, right: pd.merge(left, right, on='UserID'), self.dataframes)

    def add_monetary_effect(self):
        """Add ``base_price``, ``discounted_price`` and ``benefit`` columns.

        Rows with ``conversion == 0`` get a base price of 0; every other row
        gets a random integer price in [1, 100]. The discounted price applies
        the row's ``discount`` fraction, and the benefit is the discounted
        price minus 80% of the base price.
        """
        def _draw_base_price(row):
            # Only converted rows consume a random draw.
            if row.conversion == 0:
                return 0
            return randint(1, 100)

        self.df['base_price'] = self.df.apply(_draw_base_price, axis=1)
        self.df['discounted_price'] = self.df['base_price'] * (1 - self.df['discount'])
        self.df['benefit'] = self.df['discounted_price'] - 0.8 * self.df['base_price']
|
data_utils/data_simulation.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
class UpliftSimulationReady:
    """Read pre-generated simulation tables from CSV files under one path prefix."""

    def __init__(self, files_path):
        # Prefix that is prepended verbatim to every file name.
        self.files_path = files_path

    def _read_table(self, file_name):
        """Read ``files_path + file_name`` as a CSV and return the DataFrame."""
        # Plain string concatenation: the prefix must already end with a
        # path separator for the file to resolve inside the directory.
        return pd.read_csv(self.files_path + file_name)

    def load_user_profiles(self, file_name):
        """Return the user-profile table stored in ``file_name``."""
        return self._read_table(file_name)

    def load_uplift_data(self, file_name):
        """Return the uplift table stored in ``file_name``."""
        return self._read_table(file_name)

    def load_irrelevant_data(self, file_name):
        """Return the irrelevant-features table stored in ``file_name``."""
        return self._read_table(file_name)

    def load_other_data(self, file_name):
        """Return any other table (e.g. transactions) stored in ``file_name``."""
        return self._read_table(file_name)
|
data_utils/eda_simulation.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from data_utils.data_simulation import UpliftSimulationReady
|
4 |
+
|
5 |
+
class EDASimulationReady:
    """Compute per-treatment-group summaries from a stored simulation CSV."""

    def __init__(self, files_path):
        # Path prefix used both to read the input CSV and to write the summaries.
        self.files_path = files_path

    def load_conversions(self, file_name):
        """Aggregate conversion, discounted price and benefit per treatment group.

        The table is loaded through ``UpliftSimulationReady.load_uplift_data``,
        pivoted on ``treatment_group_key`` once with a sum and once with a
        mean, and both results are also written to ``sum_conversions.csv``
        and ``mean_conversions.csv`` under ``files_path``.

        Returns:
            tuple: ``(sum_conversions, mean_conversions)`` DataFrames.
        """
        df = UpliftSimulationReady(self.files_path).load_uplift_data(file_name)

        metrics = ['conversion', 'discounted_price', 'benefit']

        # The aggregation is named by string rather than passing np.sum /
        # np.mean, which recent pandas versions deprecate. It stays wrapped in
        # a list so the columns keep their (aggregation, metric) two-level layout.
        sum_conversions = df.pivot_table(values=metrics,
                                         index='treatment_group_key',
                                         aggfunc=['sum'],
                                         margins=False)

        mean_conversions = df.pivot_table(values=metrics,
                                          index='treatment_group_key',
                                          aggfunc=['mean'],
                                          margins=False)

        # Persist both summaries next to the input data.
        sum_conversions.to_csv(self.files_path + 'sum_conversions.csv')
        mean_conversions.to_csv(self.files_path + 'mean_conversions.csv')

        return sum_conversions, mean_conversions
|
data_utils/exploratory_data_analysis.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
class ExploratoryAnalysis:
    """Per-treatment-group summaries of conversion, discounted price and benefit."""

    def __init__(self, df):
        self.df = df

    def _pivot_by_treatment(self, aggfunc):
        """Pivot the three outcome columns on ``treatment_group_key``."""
        return self.df.pivot_table(
            index='treatment_group_key',
            values=['conversion', 'discounted_price', 'benefit'],
            aggfunc=aggfunc,
            margins=False,
        )

    def compute_summaries(self):
        """Return ``(sum_table, mean_table)`` indexed by treatment group."""
        totals = self._pivot_by_treatment('sum')
        averages = self._pivot_by_treatment('mean')
        return totals, averages

    def compute_mean_benefit_vs_conversion(self):
        """Return the mean ``conversion`` and ``benefit`` columns per group."""
        averages = self.compute_summaries()[1]
        return averages[['conversion', 'benefit']]

    def compute_ate(self):
        """Return each group's means minus the ``control`` group's means."""
        averages = self.compute_summaries()[1]
        return averages - averages.loc['control']
|
data_utils/feature_importance.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from causalml.feature_selection.filters import FilterSelect
|
2 |
+
|
3 |
+
class FeatureImportance:
    """Filter-based feature importance for one treatment group."""

    def __init__(self, df, X_names, y_name, treatment_group):
        self.df, self.X_names = df, X_names
        self.y_name, self.treatment_group = y_name, treatment_group

    def compute_feature_importance(self):
        """Return ``FilterSelect.get_importance`` using the 'KL' method and 20 bins."""
        selector = FilterSelect()
        return selector.get_importance(
            self.df,
            self.X_names,
            self.y_name,
            'KL',
            treatment_group=self.treatment_group,
            n_bins=20,
        )
|
data_utils/feature_importance_simulation.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
from data_utils.data_simulation import UpliftSimulationReady
|
4 |
+
|
5 |
+
class FISimulationReady:
    """Load a stored feature-importance table from a CSV file."""

    def __init__(self, files_path):
        # Path prefix handed on to UpliftSimulationReady.
        self.files_path = files_path

    def load_feature_importance(self, file_name):
        """Return the table read by ``UpliftSimulationReady.load_uplift_data``."""
        loader = UpliftSimulationReady(self.files_path)
        return loader.load_uplift_data(file_name)
|
eval_utils/evaluation.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
from causalml.metrics import *
|
4 |
+
|
5 |
+
class ModelEvaluator:
    """Score an evaluation set with a fitted uplift model and build Qini tables."""

    def __init__(self, model, df_eval, X_names):
        self.model = model
        self.df_eval = df_eval
        self.X_names = X_names

    def predict_cate(self, discount):
        """
        Predicts the Conditional Average Treatment Effect (CATE) for every row of ``df_eval``.

        The model is expected to return one value per discount level for each
        row; the raw per-row list is stored in ``cate`` and expanded into
        ``cate_discount_05``, ``cate_discount_10`` and ``cate_discount_15``.
        ``discount`` is accepted for interface compatibility but is not used:
        predictions for all levels are produced at once.
        """
        predictions = self.model.predict(
            X=self.df_eval[self.X_names].values,
            treatment=self.df_eval['treatment_group_key'].values
        ).tolist()
        self.df_eval['cate'] = predictions
        self.df_eval[['cate_discount_05', 'cate_discount_10', 'cate_discount_15']] = pd.DataFrame(
            self.df_eval.cate.tolist(),
            index=self.df_eval.index
        )

    def eval_performance(self, discount):
        """
        Evaluates the model's performance for a specific discount, calculating Qini curves for conversion and benefit.

        Args:
            discount: treatment label (e.g. ``'discount_05'``); it selects both
                the treated rows and the ``cate_<discount>`` score column.

        Returns:
            tuple: ``(cd_conversion, cd_benefit)`` - the ``get_qini`` output for
            each outcome, doubled, index-reset and divided by its row count.
        """
        # Ensure CATE predictions are available
        if 'cate' not in self.df_eval.columns:
            self.predict_cate(discount)

        # Work on an explicit copy so that adding a column below never writes
        # into (or warns about) a view of self.df_eval.
        in_scope = self.df_eval['treatment_group_key'].isin(['control', discount])
        df_eval_disc = self.df_eval[in_scope].copy()
        df_eval_disc['treatment_num'] = (df_eval_disc['treatment_group_key'] != 'control').astype(int)

        cate_col = 'cate_{}'.format(discount)
        scores = df_eval_disc[cate_col].to_numpy()
        treated = df_eval_disc['treatment_num'].to_numpy()

        def _qini_input(outcome_col):
            # Columns: S = model score, w = treatment flag, y = outcome.
            return pd.DataFrame(
                [scores, treated, df_eval_disc[outcome_col].to_numpy()],
                index=['S', 'w', 'y']
            ).T

        def _scaled_qini(frame):
            # Dividing the whole frame by its row count also rescales the
            # 'index' column created by reset_index().
            curve = (get_qini(frame) * 2).reset_index()
            return curve / curve.shape[0]

        cd_conversion = _scaled_qini(_qini_input('conversion'))
        cd_benefit = _scaled_qini(_qini_input('benefit'))

        return cd_conversion, cd_benefit
|
eval_utils/evaluation_simulation.py
ADDED
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
class CATEConversionEvaluateSimulationReady:
    """Serve pre-computed Qini tables of the CATE conversion model."""

    # (discount group, (conversion file, benefit file)) pairs.
    _QINI_FILES = (
        (5, ('qini_05_conversion_test.csv', 'qini_05_benefit_test.csv')),
        (10, ('qini_10_conversion_test.csv', 'qini_10_benefit_test.csv')),
        (15, ('qini_15_conversion_test.csv', 'qini_15_benefit_test.csv')),
    )

    def __init__(self, data_path):
        # Prefix that is prepended verbatim to each file name.
        self.data_path = data_path

    def _load(self, file_name):
        """Read one table and drop its ``'Unnamed: 0'`` column."""
        return pd.read_csv(self.data_path + file_name).drop(columns='Unnamed: 0')

    def evaluate(self, discount_group):
        """Return ``(conversion_table, benefit_table)`` for group 5, 10 or 15.

        Any other ``discount_group`` yields ``None``.
        """
        for group, (conversion_file, benefit_file) in self._QINI_FILES:
            if discount_group == group:
                conversion_table = self._load(conversion_file)
                benefit_table = self._load(benefit_file)
                return conversion_table, benefit_table
        return None
|
21 |
+
|
22 |
+
class CATEBenefitEvaluateSimulationReady:
    """Serve pre-computed Qini tables of the CATE benefit model (``*_bate.csv``)."""

    # (discount group, (conversion file, benefit file)) pairs.
    _QINI_FILES = (
        (5, ('qini_05_conversion_test_bate.csv', 'qini_05_benefit_test_bate.csv')),
        (10, ('qini_10_conversion_test_bate.csv', 'qini_10_benefit_test_bate.csv')),
        (15, ('qini_15_conversion_test_bate.csv', 'qini_15_benefit_test_bate.csv')),
    )

    def __init__(self, data_path):
        # Prefix that is prepended verbatim to each file name.
        self.data_path = data_path

    def _load(self, file_name):
        """Read one table and drop its ``'Unnamed: 0'`` column."""
        return pd.read_csv(self.data_path + file_name).drop(columns='Unnamed: 0')

    def evaluate(self, discount_group):
        """Return ``(conversion_table, benefit_table)`` for group 5, 10 or 15.

        Any other ``discount_group`` yields ``None``.
        """
        for group, (conversion_file, benefit_file) in self._QINI_FILES:
            if discount_group == group:
                conversion_table = self._load(conversion_file)
                benefit_table = self._load(benefit_file)
                return conversion_table, benefit_table
        return None
|
mlops_utils/wandb_utils.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import wandb
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
|
5 |
+
def upload_dataset_to_wandb(dirs, project_name, dataset_name, dataset_type='raw_dataset'):
    """Log the given directories as one dataset artifact in a 'load-data' run.

    Args:
        dirs: iterable of directory paths added to the artifact.
        project_name: W&B project in which the run is created.
        dataset_name: name of the artifact.
        dataset_type: artifact type label.
    """
    with wandb.init(project=project_name, job_type='load-data') as run:
        artifact = wandb.Artifact(dataset_name, type=dataset_type)
        for directory in dirs:
            artifact.add_dir(directory)
        run.log_artifact(artifact)
|
11 |
+
|
12 |
+
def eda_work_with_dataset_to_wandb(dirs, project_name, dataset_name, dataset_type, artifact_type):
    """Log EDA outputs as an 'eda_result' artifact in an 'eda' run.

    The run first declares the input dataset artifact via ``use_artifact``,
    then uploads every directory in ``dirs`` and logs the table read from
    ``kl_feature_importance.csv`` in the first directory.
    """
    with wandb.init(project=project_name, job_type='eda') as run:
        # Called for its effect on the run; the returned handle is not needed.
        run.use_artifact(dataset_name, type=dataset_type)

        artifact = wandb.Artifact('eda_result', type=artifact_type)
        for directory in dirs:
            artifact.add_dir(directory)
        run.log_artifact(artifact)

        importance_csv = os.path.join(dirs[0], "kl_feature_importance.csv")
        run.log({"eda_result": pd.read_csv(importance_csv)})
|
26 |
+
|
27 |
+
def training_results_to_wandb(dirs, project_name, dataset_name, dataset_type, artifact_type, model_name, job_type='train'):
    """Log result directories as an artifact named ``model_name``.

    The run declares ``dataset_name`` as its input artifact, uploads every
    directory in ``dirs``, and - only when ``job_type`` is ``'train'`` - also
    logs the three per-discount feature-importance CSVs found in the first
    directory.
    """
    with wandb.init(project=project_name, job_type=job_type) as run:
        # Called for its effect on the run; the returned handle is not needed.
        run.use_artifact(dataset_name, type=dataset_type)

        artifact = wandb.Artifact(model_name, type=artifact_type)
        for directory in dirs:
            artifact.add_dir(directory)
        run.log_artifact(artifact)

        if job_type != 'train':
            return

        table_names = (
            "discount_05_feature_importance",
            "discount_10_feature_importance",
            "discount_15_feature_importance",
        )
        # Each table is logged under its own name and read from '<name>.csv'.
        run.log({
            name: pd.read_csv(os.path.join(dirs[0], name + ".csv"))
            for name in table_names
        })
|
models_utils/ml_models.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from sklearn.model_selection import train_test_split
|
2 |
+
from xgboost import XGBRegressor, XGBClassifier
|
3 |
+
|
4 |
+
from causalml.inference.tree import UpliftRandomForestClassifier
|
5 |
+
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
|
6 |
+
from causalml.inference.meta import BaseSClassifier, BaseTClassifier, BaseXClassifier, BaseRClassifier
|
7 |
+
|
8 |
+
class ModelTraining:
    """Train T-learner uplift models on a train/test split of one DataFrame."""

    def __init__(self, df, y_name, X_names):
        """Store the data and column names; nothing is fitted yet.

        Args:
            df: full dataset, split later by ``split_data``.
            y_name: name of the outcome column used as the training target.
            X_names: list of feature column names.
        """
        self.df = df
        self.y_name = y_name
        self.X_names = X_names
        self.df_train = None
        self.df_test = None
        # Most recently fitted learner and its training-set predictions.
        self.learner_t = None
        self.learner_t_tau = None
        # Last learner fitted by each of the two public fit methods.
        self.conversion_learner_t = None
        self.benefit_learner_t = None

    def split_data(self, test_size, random_state):
        """Split ``self.df`` into ``df_train`` / ``df_test`` with ``train_test_split``."""
        self.df_train, self.df_test = train_test_split(
            self.df,
            test_size=test_size,
            random_state=random_state
        )

    def fit_predict_classifier(self, params, control_name):
        """Fit a ``BaseTClassifier`` over ``XGBClassifier(**params)``.

        Returns the training-set predictions from ``fit_predict``.
        """
        self.learner_t = BaseTClassifier(XGBClassifier(**params), control_name=control_name)
        self.conversion_learner_t = self.learner_t
        return self._fit_predict()

    def fit_predict_regressor(self, params, control_name):
        """Fit a ``BaseTRegressor`` over ``XGBRegressor(**params)``.

        Returns the training-set predictions from ``fit_predict``.
        """
        self.learner_t = BaseTRegressor(XGBRegressor(**params), control_name=control_name)
        self.benefit_learner_t = self.learner_t
        return self._fit_predict()

    def _fit_predict(self):
        """Fit the current learner on ``df_train`` and keep its predictions.

        Requires ``split_data`` to have been called so that ``df_train`` exists.
        """
        self.learner_t_tau = self.learner_t.fit_predict(
            X=self.df_train[self.X_names].values,
            treatment=self.df_train['treatment_group_key'].values,
            y=self.df_train[self.y_name].values
        )
        # Attach the feature names to the learner for later reference.
        self.learner_t.feature_names = self.X_names
        return self.learner_t_tau

    def compute_feature_importance(self):
        """Return the learner's normalized feature importances on the training set.

        Raises:
            ValueError: if no model has been fitted successfully yet.
        """
        # Both must be set: the learner is assigned before fitting, so a
        # failed fit can leave a learner behind without predictions.
        if self.learner_t is None or self.learner_t_tau is None:
            raise ValueError("Model must be fitted before computing feature importances.")

        return self.learner_t.get_importance(
            X=self.df_train[self.X_names],
            tau=self.learner_t_tau,
            features=self.X_names,
            normalize=True,
            method='auto'
        )
|
models_utils/models_simulation.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import pickle
|
3 |
+
|
4 |
+
class CATESimulationReady:
    """Serve precomputed CATE artefacts from disk instead of training a model.

    ``model_path`` is stored but not read by any method of this class;
    ``y_pred_path`` points at a pickle file holding the stored predictions.
    """

    def __init__(self, model_path, y_pred_path):
        """Remember where the stored model and predictions live.

        Args:
            model_path: path of the stored model (kept for callers; unused
                by the methods below).
            y_pred_path: path of the pickle file read by ``predict``.
        """
        self.model_path = model_path
        self.y_pred_path = y_pred_path

    def predict(self):
        """Return the object unpickled from ``self.y_pred_path``.

        Raises:
            OSError: if the file cannot be opened.
        """
        # SECURITY: unpickling can execute arbitrary code. Only point
        # ``y_pred_path`` at files produced by this project, never at
        # user-supplied uploads.
        with open(self.y_pred_path, 'rb') as f:
            return pickle.load(f)

    def feature_importance(self, fi_path):
        """Load a feature-importance table from a CSV file.

        Args:
            fi_path: path of the CSV file to read.

        Returns:
            A DataFrame whose columns are renamed to ``feature`` and
            ``score``.

        Raises:
            ValueError: raised by pandas if the CSV does not have exactly
                two columns.
        """
        fi = pd.read_csv(fi_path)
        fi.columns = ['feature', 'score']
        return fi
|
notebooks/Demo_Notebook.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/Test.ipynb
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 11,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd\n",
|
10 |
+
"import matplotlib.pyplot as plt\n",
|
11 |
+
"\n",
|
12 |
+
"class CATEEvaluateSimulationReady:\n",
|
13 |
+
"\n",
|
14 |
+
" def __init__(self,data_path):\n",
|
15 |
+
" self.data_path = data_path\n",
|
16 |
+
"\n",
|
17 |
+
" def evaluate(self, discount_group):\n",
|
18 |
+
" if discount_group == 5:\n",
|
19 |
+
" qini_05_conversion_test = pd.read_csv(self.data_path + 'qini_05_conversion_test.csv').drop(columns='Unnamed: 0')\n",
|
20 |
+
" qini_05_benefit_test = pd.read_csv(self.data_path + 'qini_05_benefit_test.csv').drop(columns='Unnamed: 0')\n",
|
21 |
+
" return qini_05_conversion_test, qini_05_benefit_test\n",
|
22 |
+
" elif discount_group == 10:\n",
|
23 |
+
" qini_10_conversion_test = pd.read_csv(self.data_path + 'qini_10_conversion_test.csv').drop(columns='Unnamed: 0')\n",
|
24 |
+
" qini_10_benefit_test = pd.read_csv(self.data_path + 'qini_10_benefit_test.csv').drop(columns='Unnamed: 0')\n",
|
25 |
+
" return qini_10_conversion_test, qini_10_benefit_test\n",
|
26 |
+
" elif discount_group == 15:\n",
|
27 |
+
" qini_15_conversion_test = pd.read_csv(self.data_path + 'qini_15_conversion_test.csv').drop(columns='Unnamed: 0')\n",
|
28 |
+
" qini_15_benefit_test = pd.read_csv(self.data_path + 'qini_15_benefit_test.csv').drop(columns='Unnamed: 0')\n",
|
29 |
+
" return qini_15_conversion_test, qini_15_benefit_test"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 7,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [],
|
37 |
+
"source": [
|
38 |
+
"eval = CATEEvaluateSimulationReady('../data/effect_data/')\n",
|
39 |
+
"qini_05_conversion_test, qini_05_benefit_test = eval.evaluate(5)\n",
|
40 |
+
"qini_10_conversion_test, qini_10_benefit_test = eval.evaluate(10)\n",
|
41 |
+
"qini_15_conversion_test, qini_15_benefit_test = eval.evaluate(15)"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "code",
|
46 |
+
"execution_count": 23,
|
47 |
+
"metadata": {},
|
48 |
+
"outputs": [
|
49 |
+
{
|
50 |
+
"data": {
|
51 |
+
"text/plain": [
|
52 |
+
"Text(0.5, 1.0, 'CATE conversion vs Targeted Population')"
|
53 |
+
]
|
54 |
+
},
|
55 |
+
"execution_count": 23,
|
56 |
+
"metadata": {},
|
57 |
+
"output_type": "execute_result"
|
58 |
+
},
|
59 |
+
{
|
60 |
+
"data": {
|
61 |
+
"image/png": "",
|
62 |
+
"text/plain": [
|
63 |
+
"<Figure size 432x288 with 1 Axes>"
|
64 |
+
]
|
65 |
+
},
|
66 |
+
"metadata": {
|
67 |
+
"needs_background": "light"
|
68 |
+
},
|
69 |
+
"output_type": "display_data"
|
70 |
+
}
|
71 |
+
],
|
72 |
+
"source": [
|
73 |
+
"fig_conversion, ax_conversion = plt.subplots()\n",
|
74 |
+
"\n",
|
75 |
+
"qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')\n",
|
76 |
+
"qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')\n",
|
77 |
+
"qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')\n",
|
78 |
+
"qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='b', label = '5% model')\n",
|
79 |
+
"qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='g', label = '10% model')\n",
|
80 |
+
"qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='y', label = '15% model')\n",
|
81 |
+
"\n",
|
82 |
+
"ax_conversion.legend()\n",
|
83 |
+
"ax_conversion.set_xlabel('Fraction of Targeted Users')\n",
|
84 |
+
"ax_conversion.set_ylabel('CATE conversion')\n",
|
85 |
+
"ax_conversion.set_title('CATE conversion vs Targeted Population')"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"cell_type": "code",
|
90 |
+
"execution_count": 26,
|
91 |
+
"metadata": {},
|
92 |
+
"outputs": [
|
93 |
+
{
|
94 |
+
"data": {
|
95 |
+
"text/plain": [
|
96 |
+
"Text(0.5, 1.0, 'CATE benefit vs Targeted Population')"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
"execution_count": 26,
|
100 |
+
"metadata": {},
|
101 |
+
"output_type": "execute_result"
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"data": {
|
105 |
+
"image/png": "",
|
106 |
+
"text/plain": [
|
107 |
+
"<Figure size 432x288 with 1 Axes>"
|
108 |
+
]
|
109 |
+
},
|
110 |
+
"metadata": {
|
111 |
+
"needs_background": "light"
|
112 |
+
},
|
113 |
+
"output_type": "display_data"
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"source": [
|
117 |
+
"fig_benefit, ax_benefit = plt.subplots()\n",
|
118 |
+
"qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')\n",
|
119 |
+
"qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')\n",
|
120 |
+
"qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')\n",
|
121 |
+
"qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='b', label = '5% model')\n",
|
122 |
+
"qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='g', label = '10% model')\n",
|
123 |
+
"qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='y', label = '15% model')\n",
|
124 |
+
"ax_benefit.legend()\n",
|
125 |
+
"ax_benefit.set_xlabel('Fraction of Targeted Users')\n",
|
126 |
+
"ax_benefit.set_ylabel('CATE Benefit')\n",
|
127 |
+
"ax_benefit.set_title('CATE benefit vs Targeted Population')"
|
128 |
+
]
|
129 |
+
}
|
130 |
+
],
|
131 |
+
"metadata": {
|
132 |
+
"kernelspec": {
|
133 |
+
"display_name": "Python 3",
|
134 |
+
"language": "python",
|
135 |
+
"name": "python3"
|
136 |
+
},
|
137 |
+
"language_info": {
|
138 |
+
"codemirror_mode": {
|
139 |
+
"name": "ipython",
|
140 |
+
"version": 3
|
141 |
+
},
|
142 |
+
"file_extension": ".py",
|
143 |
+
"mimetype": "text/x-python",
|
144 |
+
"name": "python",
|
145 |
+
"nbconvert_exporter": "python",
|
146 |
+
"pygments_lexer": "ipython3",
|
147 |
+
"version": "3.9.6"
|
148 |
+
}
|
149 |
+
},
|
150 |
+
"nbformat": 4,
|
151 |
+
"nbformat_minor": 2
|
152 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
causalml==0.15.0
|
2 |
+
matplotlib==3.8.3
|
3 |
+
numpy==1.23.5
|
4 |
+
pandas==2.2.1
|
5 |
+
scikit_learn==1.4.1.post1
|
6 |
+
streamlit==1.32.2
|
7 |
+
xgboost==2.0.3
|