howardroark committed on
Commit
6f4f21f
1 Parent(s): beb8613

initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ data/
163
+ wandb/
README.md CHANGED
@@ -1,12 +1 @@
1
- ---
2
- title: Uplift Modeling
3
- emoji: 😻
4
- colorFrom: pink
5
- colorTo: green
6
- sdk: streamlit
7
- sdk_version: 1.32.2
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # uplift_modeling
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from data_utils.data_generation import UpliftSimulation
2
+ from data_utils.exploratory_data_analysis import ExploratoryAnalysis
3
+ from data_utils.feature_importance import FeatureImportance
4
+ from models_utils.ml_models import ModelTraining
5
+ from eval_utils.evaluation import ModelEvaluator
6
+
7
+ import matplotlib.pyplot as plt
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+
12
# Names of the feature columns handed to FeatureImportance, ModelTraining and
# ModelEvaluator further down in this script.
X_names = [
    'AgeIndex',
    'IncomeIndex',
    'PurchaseFrequencyIndex',
    'AccountLifetimeIndex',
    'AverageTransactionValueIndex',
    'PreferredPaymentMethodIndex',
    'RegionIndex',
    'EmailDiscountCTRIndex',
    'WebDiscountCTRIndex',
    'SocialMediaEngagementIndex',
    'DirectMailDiscountResponseIndex',
    'InAppDiscountEngagementIndex',
    'FlashSaleParticipationIndex',
    'SeasonalPromoInterestIndex',
    'LoyaltyProgramEngagementIndex',
    'ReferralBonusUsageIndex',
    'DiscountCodeRedemptionIndex',
    'VIPSaleAccessIndex',
    'EarlyAccessOptInIndex',
    'ProductReviewAfterDiscountIndex',
    'UpsellConversionIndex',
    'CrossSellInterestIndex',
    'BundlePurchaseIndex',
    'SubscriptionUpgradeIndex',
    'CustomerFeedbackIndex',
    'BrowserTypeIndex',
    'DeviceCategoryIndex',
    'OperatingSystemIndex',
    'SessionStartTimeIndex',
    'LanguagePreferenceIndex',
    'NewsletterSubscriptionIndex',
    'AccountVerificationStatusIndex',
    'AdBlockerPresenceIndex',
]

# Page title
st.title("Uplift Modeling in Retail Demo")

# Sidebar navigation: the selected entry decides which section below is rendered.
tabs = st.sidebar.radio(
    label="Navigation",
    options=["Data generation", "Exploratory analysis", "Model training", "Economic effects"],
)
30
+
31
if tabs == "Data generation":
    # Section 1: simulate an uplift dataset and keep it in the session so the
    # other sections can reuse it.
    st.header("Data Generation")

    # Description
    st.write("""
    This app creates a simulated dataset for a special kind of analysis called uplift modeling, which helps understand the effect of different actions (like promotions) on customer behavior. We use some default settings to make things easy:
    - We're looking at whether customers make a purchase or not.
    - We compare different types of promotions (like no discount, 5% off, etc.).
    - The dataset includes 15 different pieces of information (features) about each customer.
    """)

    # The sample count is the only setting exposed to the user.
    n = st.number_input(
        'Number of Samples (n)',
        min_value=1000,
        value=10000,
        step=1000,
        help="Total number of samples to generate in the dataset.",
    )

    # Every other simulation setting is fixed for the demo.
    simulation_settings = {
        'n': n,
        'y_name': 'conversion',
        'treatment_group_keys': ['control', 'discount_05', 'discount_10', 'discount_15'],
        'n_classification_features': 15,
        'n_classification_informative': 7,
        'n_classification_repeated': 0,
        'n_uplift_increase_dict': {'discount_05': 4, 'discount_10': 3, 'discount_15': 3},
        'n_uplift_decrease_dict': {'discount_05': 0, 'discount_10': 0, 'discount_15': 0},
        'positive_class_proportion': 0.05,
        'random_seed': 8097,
    }

    # Generate the dataset on demand.
    if st.button('Generate Dataset'):
        uplift_sim = UpliftSimulation(**simulation_settings)
        uplift_sim.simulate_dataset()
        uplift_sim.apply_discounts_and_clean()
        uplift_sim.postprocess_tables()
        uplift_sim.add_monetary_effect()

        # Persist the simulation across Streamlit reruns.
        st.session_state.uplift_sim = uplift_sim

        st.write("Dataset Generated Successfully!")

        # (heading, description, position in uplift_sim.dataframes)
        preview_sections = (
            (
                "User profiles",
                'Features that represent a customer such as age, income, purchase frequency, etc',
                0,
            ),
            (
                "Treatments data",
                'Information about the different treatments (discounts) that were applied to the customers as discounts in different channels (web, email, mobile), early access, etc',
                1,
            ),
            (
                "Other data",
                'Other data that can be used in the analysis',
                2,
            ),
        )
        for heading, description, position in preview_sections:
            st.subheader(heading)
            st.write(description)
            st.dataframe(uplift_sim.dataframes[position].head(3))
87
+
88
if tabs == "Exploratory analysis":
    # Section 2: summary statistics, ATE plots and feature importance for the
    # dataset stored in the session by the "Data generation" section.
    st.header("Exploratory Analysis")

    if 'uplift_sim' in st.session_state:

        st.subheader('Summary statistics')
        uplift_sim = st.session_state.uplift_sim
        eda = ExploratoryAnalysis(uplift_sim.df)

        st.write('We begin by computing the total sum of conversions, sales (discounted price) and platform benefit. We can see that the total conversions and the total sales grows as the discount value is bigger. However, the platform benefit decreases.')

        sum_conversions, mean_conversions = eda.compute_summaries()
        st.write(sum_conversions)
        st.write(mean_conversions)

        st.write('We can also visualize the tradeoff between conversions and platform benefit by plotting the mean benefit per user on the y-axis and the mean conversion rate on the x-axis, for each treatment group.')
        mean_benefit_vs_conversion = eda.compute_mean_benefit_vs_conversion()

        fig, ax = plt.subplots()
        mean_benefit_vs_conversion.plot.scatter(x='conversion', y='benefit', c='DarkBlue', s=50, ax=ax)
        st.pyplot(fig)
        # Streamlit reruns the script on every interaction; close the figure so
        # pyplot does not keep accumulating open figures.
        plt.close(fig)

        st.write('''
        We further compute the Average Treatment Effect (ATE) for both the mean conversion rate and the mean benefit per user:
        - Conversion ATE = Mean Conversion rate in the discounted group minus Mean Conversion rate in the control group
        - Benefit ATE = Mean Benefit per user in the discounted group minus Mean Benefit per user in the control group
        This helps illustrate how the discount value affects Conversion ATE and Benefit ATE.
        ''')
        mean_conversions_ate = eda.compute_ate()

        fig, ax = plt.subplots()
        mean_conversions_ate.plot.scatter(x='conversion', y='benefit', c='DarkBlue', s=50, ax=ax)
        st.pyplot(fig)
        plt.close(fig)

        st.subheader('Feature importance')

        # Allow users to select a treatment group
        treatment_group = st.selectbox(
            'Select a treatment group',
            options=['discount_05', 'discount_10', 'discount_15'],
            index=0,  # default to 'discount_05'
        )

        feature_importance = FeatureImportance(
            uplift_sim.df, X_names, y_name='conversion', treatment_group=treatment_group
        )
        fi = feature_importance.compute_feature_importance()

        # Horizontal bar chart of the scores, sorted by descending score.
        fig, ax = plt.subplots()
        fi_sorted = fi.sort_values(by='score', ascending=False)
        fi_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
        st.pyplot(fig)
        plt.close(fig)

        # The feature names below must match the entries of X_names
        # (the last bullet previously referred to a non-existent
        # "ReferralBonusUsedIndex").
        st.write("""
        - AccountLifetimeIndex: Longer-standing accounts are key predictors of customer response to promotions \n
        - CustomerFeedbackIndex: Customer feedback significantly influences the success of marketing strategies \n
        - UpsellConversionIndex: The success rate of upselling is an important factor \n
        - PurchaseFrequencyIndex: More frequent purchases indicate higher engagement and response to marketing efforts \n
        - ReferralBonusUsageIndex and LoyaltyProgramEngagementIndex: Engagement with these programs is highly indicative of responsiveness to promotions
        """)

    else:
        st.error("Please generate the dataset first.")
149
+
150
if tabs == "Model training":
    # Section 3: train either the conversion or the benefit model on the
    # dataset stored in the session and show its feature importances.
    st.header("Model Training")

    if 'uplift_sim' in st.session_state:

        uplift_sim = st.session_state.uplift_sim

        model_trainer = ModelTraining(uplift_sim.df, 'conversion', X_names)

        model_type = st.radio("Choose the model type", ('Conversion Model', 'Benefit Model'))

        # Hyper-parameters chosen by the user and forwarded to the trainer.
        params = {
            'n_estimators': st.slider('Number of Estimators', 10, 100, 50),
            'max_depth': st.slider('Max Depth', 1, 10, 4),
            'colsample_bytree': st.slider('Colsample by Tree', 0.1, 1.0, 0.2),
            'subsample': st.slider('Subsample', 0.1, 1.0, 0.2),
        }
        control_name = 'control'  # fixed for the demo, not exposed in the UI
        test_size = st.slider('Test Size', 0.1, 0.9, 0.5)
        random_state = 20143  # fixed for the demo, not exposed in the UI

        if st.button('Train Model'):

            model_trainer.split_data(test_size=test_size, random_state=random_state)

            # The branch labels must match the radio options above. The second
            # branch used to test for 'BATE Model', which the radio never
            # returns, so the benefit model was silently never trained.
            if model_type == 'Conversion Model':
                model_trainer.y_name = 'conversion'
                model_trainer.fit_predict_classifier(params, control_name)
            elif model_type == 'Benefit Model':
                model_trainer.y_name = 'benefit'
                model_trainer.fit_predict_regressor(params, control_name)

            # Keep the trained trainer for the "Economic effects" section.
            st.session_state.model_trainer = model_trainer

            feature_importances = model_trainer.compute_feature_importance()

            st.subheader('Feature Importances')

            for k, v in feature_importances.items():
                st.write(f"Feature importance for {k}")
                # One fresh figure per entry; sharing a single axes across
                # iterations would stack every entry's bars on the same chart.
                fig, ax = plt.subplots()
                v.plot(kind='barh', ax=ax)
                ax.set_xlabel("Importance")
                ax.set_ylabel("Feature")
                ax.set_title(f"Feature Importance for {model_type}")
                st.pyplot(fig)
                # Close after rendering so figures do not pile up across reruns.
                plt.close(fig)

    else:
        st.error("Please generate and preprocess the dataset first.")
202
+
203
if tabs == "Economic effects":
    # Section 4: evaluate a trained model on the held-out test split and plot
    # the resulting curves per discount group.
    st.header("Economic Effects Analysis")

    if 'uplift_sim' in st.session_state and 'model_trainer' in st.session_state:
        trainer = st.session_state.model_trainer
        df_test = trainer.df_test
        model_type = st.radio("Choose the model type for analysis", ('Conversion Model', 'Benefit Model'))

        # Determine which model to use based on user selection.
        # getattr(..., None) treats a learner attribute that was never set the
        # same way as one that is still None: "not trained yet".
        if model_type == 'Conversion Model':
            model = getattr(trainer, 'conversion_learner_t', None)
        elif model_type == 'Benefit Model':
            model = getattr(trainer, 'benefit_learner_t', None)
        else:
            st.error("Invalid model type selected.")
            st.stop()

        # Identity comparison: `== None` may be overridden by the model class.
        if model is None:
            st.error("Please train the model first.")
            st.stop()

        evaluator = ModelEvaluator(model, df_test, X_names)

        discounts = ['discount_05', 'discount_10', 'discount_15']
        colors = ['b', 'g', 'y']
        qini_conversions = {}
        qini_benefits = {}

        for discount in discounts:
            qini_conv, qini_ben = evaluator.eval_performance(discount)
            qini_conversions[discount] = qini_conv
            qini_benefits[discount] = qini_ben

        # Plot CATE Conversion and CATE Benefit against the targeted population.
        # Labels are attached to each curve when it is drawn; passing a label
        # list to legend() afterwards assigned names by drawing order, which
        # mislabelled the interleaved model/random curves.
        for metric, curves in (('CATE Conversion', qini_conversions), ('CATE Benefit', qini_benefits)):
            heading = f"{metric} vs Targeted Population"
            st.subheader(heading)
            fig, ax = plt.subplots()
            for discount, color in zip(discounts, colors):
                curves[discount].plot(ax=ax, x='index', y='S', color=color,
                                      label=f'{discount} model')
                curves[discount].plot(ax=ax, x='index', y='Random', color='r', ls='--',
                                      label=f'{discount} random')

            ax.legend(prop={'size': 10})
            ax.set_xlabel('Fraction of Targeted Users')
            ax.set_ylabel(metric)
            ax.set_title(heading)
            st.pyplot(fig)
            # Close after rendering so figures do not pile up across reruns.
            plt.close(fig)

        # Plotting CATE Benefit vs CATE Conversion
        st.subheader("CATE Benefit vs CATE Conversion")
        fig, ax_comp = plt.subplots()
        for discount, color in zip(discounts, colors):
            # Put the two 'S' columns side by side so one can be plotted
            # against the other.
            tradeoff = pd.concat(
                [qini_conversions[discount][['S']], qini_benefits[discount][['S']]],
                axis=1,
            )
            tradeoff.columns = ['cate_conversion', 'cate_benefit']
            tradeoff.plot(ax=ax_comp, x='cate_conversion', y='cate_benefit',
                          color=color, label=f'{discount} model')

        ax_comp.legend(prop={'size': 10})
        ax_comp.set_xlabel('CATE Conversion')
        ax_comp.set_ylabel('CATE Benefit')
        ax_comp.set_title('CATE Benefit vs CATE Conversion')
        st.pyplot(fig)
        plt.close(fig)

    else:
        st.error("Please ensure the model is trained and the dataset is prepared.")
app_old.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import matplotlib.pyplot as plt
6
+
7
+ from data_utils.data_simulation import UpliftSimulationReady
8
+ from data_utils.eda_simulation import EDASimulationReady
9
+ from data_utils.feature_importance_simulation import FISimulationReady
10
+ from models_utils.models_simulation import CATESimulationReady
11
+ from eval_utils.evaluation_simulation import CATEConversionEvaluateSimulationReady, CATEBenefitEvaluateSimulationReady
12
+
13
+ from mlops_utils.wandb_utils import upload_dataset_to_wandb, eda_work_with_dataset_to_wandb, training_results_to_wandb
14
+
15
+ st.title('Causal Uplift Modeling')
16
+ tabs = st.sidebar.radio("Navigation", ["Data", "EDA", "Modeling", "Effect"])
17
+
18
+ if tabs == "Data":
19
+
20
+ # Needed raw data
21
+ uplift_simulation = UpliftSimulationReady('./data/raw_data_client/')
22
+ user_profiles = uplift_simulation.load_user_profiles('user_profiles.csv')
23
+ uplift_data = uplift_simulation.load_uplift_data('uplift_data.csv')
24
+ irrelevant_data = uplift_simulation.load_irrelevant_data('irrelevant_data.csv')
25
+ transaction_data = uplift_simulation.load_other_data('transaction_data.csv')
26
+
27
+ # Subtitle
28
+ st.subheader('Loading data')
29
+
30
+ st.write('User profiles')
31
+ st.write(user_profiles.head(5))
32
+
33
+ st.write('Uplift data')
34
+ st.write(uplift_data.head(5))
35
+
36
+ st.write('Other data')
37
+ st.write(irrelevant_data.head(5))
38
+
39
+ st.write('Transaction data')
40
+ st.write(transaction_data.head(5))
41
+
42
+ if st.button('Upload data to wandb'):
43
+ upload_dataset_to_wandb(['./data/raw_data_client'], 'nl_cate_modeling', 'uplift_data')
44
+ st.write('Data uploaded to wandb')
45
+
46
+ # TODO: add to WANDB data processing step in the beginning
47
+ # TODO: the tree of updates
48
+ # TODO: choose the version from MLOps here exactly
49
+
50
+ if tabs == "EDA":
51
+
52
+ eda_simulation = EDASimulationReady('./data/processed_data/')
53
+ sum_conversions, mean_conversions = eda_simulation.load_conversions('uplift_classification_processed.csv')
54
+
55
+ st.subheader('Exploratory Data Analysis')
56
+
57
+ st.write('We can begin by computing the total sum of conversions, sales (discounted price) and platform benefit. We can see that the total conversions and the total sales grows as the discount value is bigger. However the platform benefit decreases.')
58
+ st.write(sum_conversions)
59
+
60
+ st.write('We can repeat the analysis but using the mean instead of the sum. This will give us the mean conversion rate, the mean sales per user and the mean platform benefit per user.')
61
+ st.write(mean_conversions)
62
+
63
+ st.write('To illustrate the tradeoff between conversions and platform benefit we can plot the mean benefit per user in the y-axis and the mean conversion rate in the x-axis, per treatment group.')
64
+
65
+ df_pivot_mean = mean_conversions[['mean']]
66
+ df_pivot_mean.columns = df_pivot_mean.columns.droplevel()
67
+
68
+ fig, ax = plt.subplots()
69
+ df_pivot_mean.plot.scatter(x='conversion',
70
+ y='benefit',
71
+ c='DarkBlue',
72
+ s=50,
73
+ ax=ax)
74
+ st.pyplot(fig)
75
+
76
+ st.write('''
77
+ We can also compute the Average Treatment Effect (ATE) for both the mean conversion rate and the mean benefit per user:
78
+ Conversion ATE = Mean Converstion rate in discounted group minus Mean Conversion rate in control group
79
+ Benefit ATE = Mean Benefit per user in discounted group minus Mean Benefit per user in control group
80
+ We can see in the plot below that the bigger the discount value the stronger the Conversion ATE (x-axis), but at the same time the more negative the Benefit ATE (y-axis).
81
+ ''')
82
+
83
+ df_pivot_mean_ate = df_pivot_mean - df_pivot_mean.loc['control'].values.squeeze()
84
+ df_pivot_mean_ate.columns = ['benefit_ate', 'conversion_ate', 'discounted_price_ate']
85
+
86
+ fig, ax = plt.subplots()
87
+ df_pivot_mean_ate.plot.scatter(x='conversion_ate',
88
+ y='benefit_ate',
89
+ c='DarkBlue',
90
+ s=50,
91
+ ax=ax)
92
+ st.pyplot(fig)
93
+
94
+ st.subheader('Feature Importance')
95
+
96
+ fi = FISimulationReady('./data/eda_data/')
97
+ di_df = fi.load_feature_importance('kl_feature_importance.csv')
98
+
99
+ st.write('Feature importance')
100
+ fig, ax = plt.subplots()
101
+ di_df_sorted = di_df.sort_values(by='score', ascending=False)
102
+ di_df_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
103
+ st.pyplot(fig)
104
+
105
+ if st.button('Upload EDA to wandb'):
106
+ eda_work_with_dataset_to_wandb(
107
+ dirs = ['./data/eda_data/'],
108
+ project_name = 'nl_cate_modeling',
109
+ dataset_name = 'uplift_data:latest',
110
+ dataset_type = 'raw_dataset',
111
+ artifact_type = 'eda')
112
+ st.write('EDA uploaded to wandb')
113
+
114
+ # TODO: add report to WANDB
115
+ # TODO: add artifacts to WANDB
116
+
117
+ if tabs == "Modeling":
118
+
119
+ st.subheader('Causal ML modeling')
120
+
121
+ st.write('We can begin by modeling the Conditional Average Treatment Effect')
122
+ if st.button('Train & run CATE conversion model'):
123
+ # fake trainin via 5 seconds spinner
124
+ with st.spinner('Training model...'):
125
+ time.sleep(2)
126
+
127
+ st.subheader('Feature importance by discount group')
128
+
129
+ model = CATESimulationReady('./data/models_data/model.pkl', './data/models_data/y_pred.pkl')
130
+ y_pred = model.predict()
131
+
132
+ fi05 = model.feature_importance('./data/models_data/discount_05_feature_importance.csv')
133
+ fi10 = model.feature_importance('./data/models_data/discount_10_feature_importance.csv')
134
+ fi15 = model.feature_importance('./data/models_data/discount_15_feature_importance.csv')
135
+
136
+ st.write('5\% discount group')
137
+ # plot feature importance as bar chart
138
+ fig, ax = plt.subplots()
139
+ fi05_sorted = fi05.sort_values(by='score', ascending=False)
140
+ fi05_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
141
+ st.pyplot(fig)
142
+
143
+ st.write('10\% discount group')
144
+ fig, ax = plt.subplots()
145
+ fi10_sorted = fi10.sort_values(by='score', ascending=False)
146
+ fi10_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
147
+ st.pyplot(fig)
148
+
149
+ st.write('15\% discount group')
150
+ fig, ax = plt.subplots()
151
+ fi15_sorted = fi15.sort_values(by='score', ascending=False)
152
+ fi15_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
153
+ st.pyplot(fig)
154
+ if st.button('Upload convesion model to wandb'):
155
+ training_results_to_wandb(['./data/models_data'],
156
+ 'nl_cate_modeling',
157
+ 'uplift_data:latest',
158
+ 'raw_dataset',
159
+ 'model_artifacts',
160
+ 'causal_model_conversion')
161
+ st.write('Models uploaded to wandb')
162
+
163
+ st.write('Similarly we can now train a T-Learner on the benefit label, and use the model predictions to evaluate the performance on the CATE conversion and CATE benefit.')
164
+ if st.button('Train & run CATE benefit model'):
165
+ # fake trainin via 5 seconds spinner
166
+ with st.spinner('Training model...'):
167
+ time.sleep(2)
168
+
169
+ st.subheader('Feature importance by discount group')
170
+
171
+ model = CATESimulationReady('./data/models_data/model.pkl', './data/models_data/y_pred.pkl')
172
+ y_pred = model.predict()
173
+
174
+ fi05 = model.feature_importance('./data/models_data/discount_05_feature_importance_bate.csv')
175
+ fi10 = model.feature_importance('./data/models_data/discount_10_feature_importance_bate.csv')
176
+ fi15 = model.feature_importance('./data/models_data/discount_15_feature_importance_bate.csv')
177
+
178
+ st.write('5\% discount group')
179
+ # plot feature importance as bar chart
180
+ fig, ax = plt.subplots()
181
+ fi05_sorted = fi05.sort_values(by='score', ascending=False)
182
+ fi05_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
183
+ st.pyplot(fig)
184
+
185
+ st.write('10\% discount group')
186
+ fig, ax = plt.subplots()
187
+ fi10_sorted = fi10.sort_values(by='score', ascending=False)
188
+ fi10_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
189
+ st.pyplot(fig)
190
+
191
+ st.write('15\% discount group')
192
+ fig, ax = plt.subplots()
193
+ fi15_sorted = fi15.sort_values(by='score', ascending=False)
194
+ fi15_sorted[['feature', 'score']].plot.barh(x='feature', y='score', ax=ax)
195
+ st.pyplot(fig)
196
+ if st.button('Upload benefit model to wandb'):
197
+ training_results_to_wandb(['./data/models_data'],
198
+ 'nl_cate_modeling',
199
+ 'uplift_data:latest',
200
+ 'raw_dataset',
201
+ 'model_artifacts',
202
+ 'causal_model_benefit')
203
+ st.write('Models uploaded to wandb')
204
+
205
+ if tabs == "Effect":
206
+
207
+ st.subheader('Causal ML evaluation')
208
+ st.write('We can evaluate our models by looking at the Qini curves. We can use the CATE conversion model to evaluate the performance on both the Conversion and the Benefit as a function of the fraction of users targeted.')
209
+
210
+ # two columns
211
+ col1, col2 = st.columns(2)
212
+
213
+ with col1:
214
+
215
+ st.write('CATE conversion model')
216
+
217
+ eval = CATEConversionEvaluateSimulationReady('./data/effect_data/')
218
+ qini_05_conversion_test, qini_05_benefit_test = eval.evaluate(5)
219
+ qini_10_conversion_test, qini_10_benefit_test = eval.evaluate(10)
220
+ qini_15_conversion_test, qini_15_benefit_test = eval.evaluate(15)
221
+
222
+ # Plot CATE conversion vs Targeted Population
223
+ fig_conversion, ax_conversion = plt.subplots()
224
+ qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
225
+ qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
226
+ qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
227
+ qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='b', label = '5% model')
228
+ qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='g', label = '10% model')
229
+ qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='y', label = '15% model')
230
+ ax_conversion.legend()
231
+ ax_conversion.set_xlabel('Fraction of Targeted Users')
232
+ ax_conversion.set_ylabel('CATE conversion')
233
+ ax_conversion.set_title('CATE conversion vs Targeted Population')
234
+ st.pyplot(fig_conversion)
235
+
236
+ # Plot CATE benefit vs Targeted Population
237
+ fig_benefit, ax_benefit = plt.subplots()
238
+ qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
239
+ qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
240
+ qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
241
+ qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='b', label = '5% model')
242
+ qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='g', label = '10% model')
243
+ qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='y', label = '15% model')
244
+ ax_benefit.legend()
245
+ ax_benefit.set_xlabel('Fraction of Targeted Users')
246
+ ax_benefit.set_ylabel('CATE Benefit')
247
+ ax_benefit.set_title('CATE benefit vs Targeted Population')
248
+ st.pyplot(fig_benefit)
249
+
250
+ qini_05_conc_test = pd.concat([qini_05_conversion_test[['S']], qini_05_benefit_test[['S']]], axis=1)
251
+ qini_05_conc_test.columns = ['cate_conversion', 'cate_benefit']
252
+ qini_10_conc_test = pd.concat([qini_10_conversion_test[['S']], qini_10_benefit_test[['S']]], axis=1)
253
+ qini_10_conc_test.columns = ['cate_conversion', 'cate_benefit']
254
+ qini_15_conc_test = pd.concat([qini_15_conversion_test[['S']], qini_15_benefit_test[['S']]], axis=1)
255
+ qini_15_conc_test.columns = ['cate_conversion', 'cate_benefit']
256
+
257
+ fig_conversion, ax_conversion = plt.subplots()
258
+ qini_05_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='b')
259
+ qini_10_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='g')
260
+ qini_15_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='y')
261
+ ax_conversion.legend(['5% model', '10% model','15% model'], prop={'size': 10})
262
+ ax_conversion.set_xlabel('CATE Conversion')
263
+ ax_conversion.set_ylabel('CATE Benefit')
264
+ ax_conversion.set_title('CATE benefit vs CATE conversion')
265
+ st.pyplot(fig_conversion)
266
+
267
+ if st.button('Upload conversion effects to wandb'):
268
+ training_results_to_wandb(['./data/effect_data'],
269
+ 'nl_cate_modeling',
270
+ 'causal_model_conversion:latest',
271
+ 'model_artifacts',
272
+ 'effects_artifacts',
273
+ 'convesion_model_evaluation',
274
+ job_type='evaluation')
275
+ st.write('Evaluation uploaded to wandb')
276
+
277
+ with col2:
278
+ st.write('CATE benefit model')
279
+
280
+ eval = CATEBenefitEvaluateSimulationReady('./data/effect_data/')
281
+ qini_05_conversion_test, qini_05_benefit_test = eval.evaluate(5)
282
+ qini_10_conversion_test, qini_10_benefit_test = eval.evaluate(10)
283
+ qini_15_conversion_test, qini_15_benefit_test = eval.evaluate(15)
284
+
285
+ # Plot CATE conversion vs Targeted Population
286
+ fig_conversion, ax_conversion = plt.subplots()
287
+ qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
288
+ qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
289
+ qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
290
+ qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='b', label = '5% model')
291
+ qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='g', label = '10% model')
292
+ qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='y', label = '15% model')
293
+ ax_conversion.legend()
294
+ ax_conversion.set_xlabel('Fraction of Targeted Users')
295
+ ax_conversion.set_ylabel('CATE conversion')
296
+ ax_conversion.set_title('CATE conversion vs Targeted Population')
297
+ st.pyplot(fig_conversion)
298
+
299
+ # Plot CATE benefit vs Targeted Population
300
+ fig_benefit, ax_benefit = plt.subplots()
301
+ qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')
302
+ qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')
303
+ qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')
304
+ qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='b', label = '5% model')
305
+ qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='g', label = '10% model')
306
+ qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='y', label = '15% model')
307
+ ax_benefit.legend()
308
+ ax_benefit.set_xlabel('Fraction of Targeted Users')
309
+ ax_benefit.set_ylabel('CATE Benefit')
310
+ ax_benefit.set_title('CATE benefit vs Targeted Population')
311
+ st.pyplot(fig_benefit)
312
+
313
+ qini_05_conc_test = pd.concat([qini_05_conversion_test[['S']], qini_05_benefit_test[['S']]], axis=1)
314
+ qini_05_conc_test.columns = ['cate_conversion', 'cate_benefit']
315
+ qini_10_conc_test = pd.concat([qini_10_conversion_test[['S']], qini_10_benefit_test[['S']]], axis=1)
316
+ qini_10_conc_test.columns = ['cate_conversion', 'cate_benefit']
317
+ qini_15_conc_test = pd.concat([qini_15_conversion_test[['S']], qini_15_benefit_test[['S']]], axis=1)
318
+ qini_15_conc_test.columns = ['cate_conversion', 'cate_benefit']
319
+
320
+ fig_conversion, ax_conversion = plt.subplots()
321
+ qini_05_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='b')
322
+ qini_10_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='g')
323
+ qini_15_conc_test.plot(ax=ax_conversion, x='cate_conversion',y='cate_benefit',color='y')
324
+ ax_conversion.legend(['5% model', '10% model','15% model'], prop={'size': 10})
325
+ ax_conversion.set_xlabel('CATE Conversion')
326
+ ax_conversion.set_ylabel('CATE Benefit')
327
+ ax_conversion.set_title('CATE benefit vs CATE conversion')
328
+ st.pyplot(fig_conversion)
329
+
330
+ if st.button('Upload benefit effects to wandb'):
331
+ training_results_to_wandb(['./data/effect_data'],
332
+ 'nl_cate_modeling',
333
+ 'causal_model_benefit:latest',
334
+ 'model_artifacts',
335
+ 'effects_artifacts',
336
+ 'benefit_model_evaluation',
337
+ job_type='evaluation')
338
+ st.write('Evaluation uploaded to wandb')
339
+
340
+ st.write('To simplify the comparison, we can plot the CATE Benefit as a function of the CATE conversion.')
341
+ st.write('In the last plot for example we can see that there is a region where offering 15% discount to a targeted group of users is more efficient than giving 10% to everyone. We can obtain the same impact in overall conversion uplift while reducing our benefit loss considerably.')
data_utils/data_generation.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from functools import reduce
3
+ from random import randint
4
+
5
+ from causalml.dataset import make_uplift_classification
6
+
7
class UpliftSimulation:
    """Build a synthetic multi-discount uplift dataset.

    Wraps ``causalml.dataset.make_uplift_classification`` and then reshapes
    its output: treatment keys are mapped to numeric discounts, generated
    feature columns are renamed to business-style names, and price/benefit
    columns are added.

    The steps read columns produced by the previous one, so call them in
    order: ``simulate_dataset`` -> ``apply_discounts_and_clean`` ->
    ``postprocess_tables`` -> ``add_monetary_effect``.
    """

    def __init__(self, n=50000, y_name='conversion',
                 treatment_group_keys=['control', 'discount_05', 'discount_10', 'discount_15'],
                 n_classification_features=15, n_classification_informative=7,
                 n_classification_repeated=0,
                 n_uplift_increase_dict={'discount_05': 4, 'discount_10': 3, 'discount_15': 3},
                 n_uplift_decrease_dict={'discount_05': 0, 'discount_10': 0, 'discount_15': 0},
                 delta_uplift_increase_dict={'discount_05': 0.0020, 'discount_10': 0.0045, 'discount_15': 0.008},
                 delta_uplift_decrease_dict={'discount_05': 0, 'discount_10': 0, 'discount_15': 0},
                 n_uplift_increase_mix_informative_dict={'discount_05': 3, 'discount_10': 2, 'discount_15': 3},
                 n_uplift_decrease_mix_informative_dict={'discount_05': 0, 'discount_10': 0, 'discount_15': 0},
                 positive_class_proportion=0.05, random_seed=8097):
        """Store the generation parameters.

        Every argument is forwarded unchanged to
        ``make_uplift_classification`` by ``simulate_dataset`` (``n`` is
        passed as ``n_samples``, ``treatment_group_keys`` as
        ``treatment_name``).
        """
        self.n = n
        self.y_name = y_name
        # The list/dict defaults in the signature are created once and shared
        # by every call. Copy them so that mutating an instance attribute can
        # never leak into the defaults seen by later instances.
        self.treatment_group_keys = list(treatment_group_keys)
        self.n_classification_features = n_classification_features
        self.n_classification_informative = n_classification_informative
        self.n_classification_repeated = n_classification_repeated
        self.n_uplift_increase_dict = dict(n_uplift_increase_dict)
        self.n_uplift_decrease_dict = dict(n_uplift_decrease_dict)
        self.delta_uplift_increase_dict = dict(delta_uplift_increase_dict)
        self.delta_uplift_decrease_dict = dict(delta_uplift_decrease_dict)
        self.n_uplift_increase_mix_informative_dict = dict(n_uplift_increase_mix_informative_dict)
        self.n_uplift_decrease_mix_informative_dict = dict(n_uplift_decrease_mix_informative_dict)
        self.positive_class_proportion = positive_class_proportion
        self.random_seed = random_seed
        # Filled by simulate_dataset().
        self.df = None
        self.X_names = None

    def simulate_dataset(self):
        """Generate the raw dataset and store it in ``self.df`` / ``self.X_names``."""
        self.df, self.X_names = make_uplift_classification(
            treatment_name=self.treatment_group_keys,
            y_name=self.y_name,
            n_samples=self.n,
            n_classification_features=self.n_classification_features,
            n_classification_informative=self.n_classification_informative,
            n_classification_repeated=self.n_classification_repeated,
            n_uplift_increase_dict=self.n_uplift_increase_dict,
            n_uplift_decrease_dict=self.n_uplift_decrease_dict,
            delta_uplift_increase_dict=self.delta_uplift_increase_dict,
            delta_uplift_decrease_dict=self.delta_uplift_decrease_dict,
            n_uplift_increase_mix_informative_dict=self.n_uplift_increase_mix_informative_dict,
            n_uplift_decrease_mix_informative_dict=self.n_uplift_decrease_mix_informative_dict,
            positive_class_proportion=self.positive_class_proportion,
            random_seed=self.random_seed,
        )

    def apply_discounts_and_clean(self):
        """Add a numeric ``discount`` column and drop ``treatment_effect``.

        ``discount`` starts as a copy of ``treatment_group_key`` and the four
        known keys are replaced by their discount fraction; any other key is
        left untouched by ``replace``.
        """
        discounts_dict = {'control': 0, 'discount_05': 0.05, 'discount_10': 0.10, 'discount_15': 0.15}
        self.df['discount'] = self.df['treatment_group_key']
        self.df = self.df.replace({"discount": discounts_dict})
        self.df = self.df.drop(columns=['treatment_effect'])

    def postprocess_tables(self):
        """Split ``self.df`` into themed tables, rename columns, and re-merge.

        The four tables are kept in ``self.dataframes`` and their merge on
        ``UserID`` replaces ``self.df``. Columns that belong to none of the
        tables are therefore dropped.

        The renames are positional and the mixed-feature column names are
        hard-coded, so the name lists only fit a dataset with 7 informative,
        10 uplift, 8 mixed and 8 irrelevant columns; any other layout makes
        the column assignment raise.
        """
        # Add a synthetic UserID for each entry.
        self.df['UserID'] = range(len(self.df))

        # Group the generated columns by the marker embedded in their names.
        informative_cols = [col for col in self.df.columns if 'informative' in col]
        uplift_cols = [col for col in self.df.columns if 'uplift' in col]
        irrelevant_cols = [col for col in self.df.columns if 'irrelevant' in col]
        transaction_cols = ['treatment_group_key', 'conversion', 'discount']

        # User demographics and profiles (informative features).
        user_profiles = self.df[['UserID'] + informative_cols].copy()

        # Uplift-related features, extended with the "mix" features, whose
        # names do not contain 'uplift' and are therefore listed explicitly.
        mixed_uplift_columns = ['x31_increase_mix', 'x22_increase_mix', 'x20_increase_mix',
                                'x33_increase_mix', 'x32_increase_mix', 'x27_increase_mix',
                                'x21_increase_mix', 'x26_increase_mix']
        uplift_data = pd.concat(
            [self.df[['UserID'] + uplift_cols].copy(), self.df[mixed_uplift_columns]],
            axis=1,
        )

        # Features flagged as irrelevant by the generator.
        irrelevant_data = self.df[['UserID'] + irrelevant_cols].copy()

        # Treatment, outcome and discount per user.
        transaction_data = self.df[['UserID'] + transaction_cols].copy()

        user_profiles.columns = [
            'UserID', 'AgeIndex', 'IncomeIndex', 'PurchaseFrequencyIndex',
            'AccountLifetimeIndex', 'AverageTransactionValueIndex', 'PreferredPaymentMethodIndex', 'RegionIndex'
        ]

        uplift_data.columns = [
            'UserID', 'EmailDiscountCTRIndex', 'WebDiscountCTRIndex', 'SocialMediaEngagementIndex',
            'DirectMailDiscountResponseIndex', 'InAppDiscountEngagementIndex', 'FlashSaleParticipationIndex',
            'SeasonalPromoInterestIndex', 'LoyaltyProgramEngagementIndex', 'ReferralBonusUsageIndex',
            'DiscountCodeRedemptionIndex', 'VIPSaleAccessIndex', 'EarlyAccessOptInIndex',
            'ProductReviewAfterDiscountIndex', 'UpsellConversionIndex', 'CrossSellInterestIndex',
            'BundlePurchaseIndex', 'SubscriptionUpgradeIndex', 'CustomerFeedbackIndex'
        ]

        irrelevant_data.columns = [
            'UserID', 'BrowserTypeIndex', 'DeviceCategoryIndex', 'OperatingSystemIndex',
            'SessionStartTimeIndex', 'LanguagePreferenceIndex', 'NewsletterSubscriptionIndex',
            'AccountVerificationStatusIndex', 'AdBlockerPresenceIndex'
        ]

        # Transaction columns deliberately keep their original names.
        transaction_data.columns = ['UserID'] + transaction_cols

        # Keep the individual tables and rebuild the wide frame from them.
        self.dataframes = [user_profiles, uplift_data, irrelevant_data, transaction_data]
        self.df = reduce(lambda left, right: pd.merge(left, right, on='UserID'), self.dataframes)

    def add_monetary_effect(self):
        """Add ``base_price``, ``discounted_price`` and ``benefit`` columns.

        ``base_price`` is 0 for rows whose ``conversion`` equals 0 and a
        random integer in [1, 100] otherwise. The draw uses the global
        ``random`` state, so it is NOT controlled by ``self.random_seed``.
        """
        # One draw per converting row, in row order.
        self.df['base_price'] = [
            0 if converted == 0 else randint(1, 100)
            for converted in self.df['conversion']
        ]
        self.df['discounted_price'] = self.df['base_price'] * (1 - self.df['discount'])
        # NOTE(review): 0.8 * base_price is presumably the cost of goods - confirm.
        self.df['benefit'] = self.df['discounted_price'] - 0.8 * self.df['base_price']
data_utils/data_simulation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
class UpliftSimulationReady:
    """Load pre-generated simulation tables stored as CSV files.

    ``files_path`` is used as a plain string prefix: no separator is inserted
    between it and the file name, so a directory prefix must already end with
    one.
    """

    def __init__(self, files_path):
        self.files_path = files_path

    def _read_table(self, file_name):
        """Read ``files_path + file_name`` as CSV and return the DataFrame."""
        return pd.read_csv(self.files_path + file_name)

    def load_user_profiles(self, file_name):
        """Return the user-profile table stored in ``file_name``."""
        return self._read_table(file_name)

    def load_uplift_data(self, file_name):
        """Return the uplift table stored in ``file_name``."""
        return self._read_table(file_name)

    def load_irrelevant_data(self, file_name):
        """Return the irrelevant-feature table stored in ``file_name``."""
        return self._read_table(file_name)

    def load_other_data(self, file_name):
        """Return any other table (e.g. transactions) stored in ``file_name``."""
        return self._read_table(file_name)
data_utils/eda_simulation.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from data_utils.data_simulation import UpliftSimulationReady
4
+
5
class EDASimulationReady:
    """Compute per-treatment conversion summaries from a stored CSV table."""

    def __init__(self, files_path):
        # Prefix for both the input file and the two summary CSVs written by
        # load_conversions(); concatenated without adding a separator.
        self.files_path = files_path

    def load_conversions(self, file_name):
        """Summarise conversion, discounted price and benefit per treatment group.

        Reads ``file_name`` through ``UpliftSimulationReady``, builds one
        pivot table of sums and one of means indexed by
        ``treatment_group_key``, writes them to ``sum_conversions.csv`` and
        ``mean_conversions.csv`` under ``files_path``, and returns both.

        Returns:
            Tuple ``(sum_conversions, mean_conversions)`` of DataFrames.
        """
        uplift_simulation = UpliftSimulationReady(self.files_path)
        df = uplift_simulation.load_uplift_data(file_name)

        metric_cols = ['conversion', 'discounted_price', 'benefit']

        # Aggregations are named by string: passing np.sum / np.mean is
        # deprecated in pandas 2.x. The one-element list is kept so the
        # result still has the aggregation name as top column level.
        sum_conversions = df.pivot_table(values=metric_cols,
                                         index='treatment_group_key',
                                         aggfunc=['sum'],
                                         margins=False)

        mean_conversions = df.pivot_table(values=metric_cols,
                                          index='treatment_group_key',
                                          aggfunc=['mean'],
                                          margins=False)

        # Persist both summaries next to the input data.
        sum_conversions.to_csv(self.files_path + 'sum_conversions.csv')
        mean_conversions.to_csv(self.files_path + 'mean_conversions.csv')

        return sum_conversions, mean_conversions
data_utils/exploratory_data_analysis.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
class ExploratoryAnalysis:
    """Per-treatment-group summaries of conversion, discounted price and benefit."""

    # Outcome columns aggregated by every summary.
    _METRICS = ('conversion', 'discounted_price', 'benefit')

    def __init__(self, df):
        self.df = df

    def _pivot_by_treatment(self, how):
        """Aggregate the metric columns per ``treatment_group_key`` with ``how``."""
        return self.df.pivot_table(
            values=list(self._METRICS),
            index='treatment_group_key',
            aggfunc=how,
            margins=False,
        )

    def compute_summaries(self):
        """Return ``(sums, means)`` of the metric columns per treatment group."""
        totals = self._pivot_by_treatment('sum')
        averages = self._pivot_by_treatment('mean')
        return totals, averages

    def compute_mean_benefit_vs_conversion(self):
        """Return the mean ``conversion`` and ``benefit`` per treatment group."""
        averages = self.compute_summaries()[1]
        return averages[['conversion', 'benefit']]

    def compute_ate(self):
        """Return the group means minus the ``control`` group's means."""
        averages = self.compute_summaries()[1]
        baseline = averages.loc['control']
        return averages - baseline
data_utils/feature_importance.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from causalml.feature_selection.filters import FilterSelect
2
+
3
class FeatureImportance:
    """Filter-based feature importance computed with causalml's ``FilterSelect``."""

    def __init__(self, df, X_names, y_name, treatment_group):
        self.df = df
        self.X_names = X_names
        self.y_name = y_name
        self.treatment_group = treatment_group

    def compute_feature_importance(self):
        """Return ``FilterSelect.get_importance`` for the 'KL' method with 20 bins.

        The stored frame, feature names, outcome name and treatment group are
        passed through unchanged.
        """
        selector = FilterSelect()
        return selector.get_importance(
            self.df,
            self.X_names,
            self.y_name,
            'KL',
            treatment_group=self.treatment_group,
            n_bins=20,
        )
data_utils/feature_importance_simulation.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from data_utils.data_simulation import UpliftSimulationReady
4
+
5
class FISimulationReady:
    """Load a stored feature-importance table for the simulation-ready flow."""

    def __init__(self, files_path):
        self.files_path = files_path

    def load_feature_importance(self, file_name):
        """Return the CSV ``file_name`` under ``files_path`` as a DataFrame.

        The read is delegated to ``UpliftSimulationReady.load_uplift_data``.
        """
        return UpliftSimulationReady(self.files_path).load_uplift_data(file_name)
eval_utils/evaluation.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ from causalml.metrics import *
4
+
5
class ModelEvaluator:
    """Score an evaluation frame with a fitted uplift model and build Qini curves.

    Attributes:
        model: object exposing ``predict(X=..., treatment=...)``.
        df_eval: evaluation DataFrame; ``predict_cate`` adds columns to it
            in place.
        X_names: feature column names fed to the model.
    """

    def __init__(self, model, df_eval, X_names):
        self.model = model
        self.df_eval = df_eval
        self.X_names = X_names

    def predict_cate(self, discount):
        """Add CATE predictions for all three discount arms to ``df_eval``.

        Writes a ``cate`` column holding each row's full prediction and the
        per-arm columns ``cate_discount_05``, ``cate_discount_10`` and
        ``cate_discount_15``. ``discount`` is accepted for interface
        symmetry with ``eval_performance`` but is not used.

        NOTE(review): assumes ``model.predict`` returns one row per sample
        with exactly three values ordered 05/10/15 - confirm against the
        fitted learner.
        """
        cate_rows = self.model.predict(
            X=self.df_eval[self.X_names].values,
            treatment=self.df_eval['treatment_group_key'].values
        ).tolist()
        self.df_eval['cate'] = cate_rows
        self.df_eval[['cate_discount_05', 'cate_discount_10', 'cate_discount_15']] = pd.DataFrame(
            cate_rows,
            index=self.df_eval.index
        )

    def eval_performance(self, discount):
        """Return scaled Qini curves for ``discount`` versus control.

        Args:
            discount: treatment key such as ``'discount_05'``; also selects
                the ``cate_<discount>`` score column.

        Returns:
            Tuple ``(cd_conversion, cd_benefit)``: the ``get_qini`` output for
            the conversion and benefit outcomes, doubled, index reset, and
            every column (including the reset index) divided by the number of
            rows of the curve.
        """
        # Ensure CATE predictions are available.
        if 'cate' not in self.df_eval.columns:
            self.predict_cate(discount)

        in_comparison = self.df_eval['treatment_group_key'].isin(['control', discount])
        # Work on an explicit copy: adding a column to a filtered view would
        # trigger pandas' chained-assignment warning and may not stick.
        df_eval_disc = self.df_eval[in_comparison].copy()
        # 0 for control rows, 1 for treated rows (also valid for an empty frame).
        df_eval_disc['treatment_num'] = (
            df_eval_disc['treatment_group_key'] != 'control'
        ).astype(int)

        cate_col = 'cate_{}'.format(discount)
        scores = df_eval_disc[cate_col].to_numpy()
        treated = df_eval_disc['treatment_num'].to_numpy()

        # get_qini is called with its defaults on columns S (score),
        # w (treatment flag) and y (outcome).
        df_eval_qini_conversion = pd.DataFrame(
            [scores, treated, df_eval_disc['conversion'].to_numpy()],
            index=['S', 'w', 'y']
        ).T

        df_eval_qini_benefit = pd.DataFrame(
            [scores, treated, df_eval_disc['benefit'].to_numpy()],
            index=['S', 'w', 'y']
        ).T

        cd_conversion = (get_qini(df_eval_qini_conversion) * 2).reset_index()
        cd_conversion = cd_conversion / cd_conversion.shape[0]

        cd_benefit = (get_qini(df_eval_qini_benefit) * 2).reset_index()
        cd_benefit = cd_benefit / cd_benefit.shape[0]

        return cd_conversion, cd_benefit
eval_utils/evaluation_simulation.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
class CATEConversionEvaluateSimulationReady:
    """Load stored Qini curves of the conversion CATE model."""

    # (discount group, conversion-curve file, benefit-curve file)
    _CURVE_FILES = (
        (5, 'qini_05_conversion_test.csv', 'qini_05_benefit_test.csv'),
        (10, 'qini_10_conversion_test.csv', 'qini_10_benefit_test.csv'),
        (15, 'qini_15_conversion_test.csv', 'qini_15_benefit_test.csv'),
    )

    def __init__(self,data_path):
        self.data_path = data_path

    def _read_curve(self, file_name):
        """Read one curve CSV and drop the saved, unnamed index column."""
        return pd.read_csv(self.data_path + file_name).drop(columns='Unnamed: 0')

    def evaluate(self, discount_group):
        """Return ``(conversion_curve, benefit_curve)`` for 5, 10 or 15.

        Any other ``discount_group`` yields ``None``.
        """
        for group, conversion_file, benefit_file in self._CURVE_FILES:
            if discount_group == group:
                conversion_curve = self._read_curve(conversion_file)
                benefit_curve = self._read_curve(benefit_file)
                return conversion_curve, benefit_curve
        return None
21
+
22
class CATEBenefitEvaluateSimulationReady:
    """Load stored Qini curves of the benefit CATE model (``*_bate.csv`` files)."""

    # (discount group, conversion-curve file, benefit-curve file)
    _CURVE_FILES = (
        (5, 'qini_05_conversion_test_bate.csv', 'qini_05_benefit_test_bate.csv'),
        (10, 'qini_10_conversion_test_bate.csv', 'qini_10_benefit_test_bate.csv'),
        (15, 'qini_15_conversion_test_bate.csv', 'qini_15_benefit_test_bate.csv'),
    )

    def __init__(self,data_path):
        self.data_path = data_path

    def _read_curve(self, file_name):
        """Read one curve CSV and drop the saved, unnamed index column."""
        return pd.read_csv(self.data_path + file_name).drop(columns='Unnamed: 0')

    def evaluate(self, discount_group):
        """Return ``(conversion_curve, benefit_curve)`` for 5, 10 or 15.

        Any other ``discount_group`` yields ``None``.
        """
        for group, conversion_file, benefit_file in self._CURVE_FILES:
            if discount_group == group:
                conversion_curve = self._read_curve(conversion_file)
                benefit_curve = self._read_curve(benefit_file)
                return conversion_curve, benefit_curve
        return None
mlops_utils/wandb_utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wandb
2
+ import pandas as pd
3
+ import os
4
+
5
def upload_dataset_to_wandb(dirs, project_name, dataset_name, dataset_type='raw_dataset'):
    """Log the contents of ``dirs`` as one dataset artifact in a 'load-data' run.

    Args:
        dirs: directories whose contents are added to the artifact.
        project_name: wandb project the run belongs to.
        dataset_name: name of the artifact to create.
        dataset_type: artifact type label.
    """
    with wandb.init(project=project_name, job_type='load-data') as run:
        artifact = wandb.Artifact(dataset_name, type=dataset_type)
        for directory in dirs:
            artifact.add_dir(directory)
        # Logged once, after every directory has been added.
        run.log_artifact(artifact)
11
+
12
def eda_work_with_dataset_to_wandb(dirs, project_name, dataset_name, dataset_type, artifact_type):
    """Log EDA outputs as an 'eda_result' artifact in an 'eda' run.

    The run first declares ``dataset_name`` as an input artifact, then logs
    the contents of ``dirs`` as the artifact, and finally logs
    ``kl_feature_importance.csv`` from the first directory under the key
    ``eda_result``.
    """
    with wandb.init(project=project_name, job_type='eda') as run:
        # Called for its side effect of marking the dataset as used by this run.
        run.use_artifact(dataset_name, type=dataset_type)

        result_artifact = wandb.Artifact('eda_result', type=artifact_type)
        for directory in dirs:
            result_artifact.add_dir(directory)
        run.log_artifact(result_artifact)

        importance_csv = os.path.join(dirs[0], "kl_feature_importance.csv")
        run.log({"eda_result": pd.read_csv(importance_csv)})
26
+
27
def training_results_to_wandb(dirs, project_name, dataset_name, dataset_type, artifact_type, model_name, job_type='train'):
    """Log model or evaluation outputs in ``dirs`` as a wandb artifact.

    The run declares ``dataset_name`` as an input artifact and logs the
    contents of ``dirs`` as an artifact named ``model_name``. Only when
    ``job_type`` is ``'train'`` are the three per-discount feature-importance
    CSVs from the first directory logged as well.
    """
    with wandb.init(project=project_name, job_type=job_type) as run:
        # Called for its side effect of marking the input artifact as used.
        run.use_artifact(dataset_name, type=dataset_type)

        output_artifact = wandb.Artifact(model_name, type=artifact_type)
        for directory in dirs:
            output_artifact.add_dir(directory)
        run.log_artifact(output_artifact)

        if job_type != 'train':
            return

        table_names = (
            "discount_05_feature_importance",
            "discount_10_feature_importance",
            "discount_15_feature_importance",
        )
        run.log({
            table_name: pd.read_csv(os.path.join(dirs[0], table_name + ".csv"))
            for table_name in table_names
        })
models_utils/ml_models.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.model_selection import train_test_split
2
+ from xgboost import XGBRegressor, XGBClassifier
3
+
4
+ from causalml.inference.tree import UpliftRandomForestClassifier
5
+ from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
6
+ from causalml.inference.meta import BaseSClassifier, BaseTClassifier, BaseXClassifier, BaseRClassifier
7
+
8
class ModelTraining:
    """Train causalml ``BaseTClassifier`` / ``BaseTRegressor`` learners on XGBoost.

    Attributes:
        df: full dataset.
        y_name: outcome column used as the training target.
        X_names: feature column names.
        df_train, df_test: set by ``split_data``.
        learner_t: most recently created learner.
        learner_t_tau: value returned by that learner's ``fit_predict``.
        conversion_learner_t: learner created by ``fit_predict_classifier``.
        benefit_learner_t: learner created by ``fit_predict_regressor``.
    """

    def __init__(self, df, y_name, X_names):
        self.df = df
        self.y_name = y_name
        self.X_names = X_names
        self.df_train = None
        self.df_test = None
        self.learner_t = None
        # Declared up front so "not fitted yet" is detectable as None instead
        # of surfacing as an AttributeError.
        self.learner_t_tau = None
        self.conversion_learner_t = None
        self.benefit_learner_t = None

    def split_data(self, test_size, random_state):
        """Split ``df`` into ``df_train`` / ``df_test`` with ``train_test_split``."""
        self.df_train, self.df_test = train_test_split(
            self.df,
            test_size=test_size,
            random_state=random_state
        )

    def fit_predict_classifier(self, params, control_name):
        """Fit a ``BaseTClassifier(XGBClassifier(**params))`` on the training split.

        Returns whatever the learner's ``fit_predict`` returns for the
        training rows.
        """
        self.learner_t = BaseTClassifier(XGBClassifier(**params), control_name=control_name)
        self.conversion_learner_t = self.learner_t
        return self._fit_predict()

    def fit_predict_regressor(self, params, control_name):
        """Fit a ``BaseTRegressor(XGBRegressor(**params))`` on the training split.

        Returns whatever the learner's ``fit_predict`` returns for the
        training rows.
        """
        self.learner_t = BaseTRegressor(XGBRegressor(**params), control_name=control_name)
        self.benefit_learner_t = self.learner_t
        return self._fit_predict()

    def _fit_predict(self):
        """Fit ``learner_t`` on ``df_train`` and cache the returned effects."""
        self.learner_t_tau = self.learner_t.fit_predict(
            X=self.df_train[self.X_names].values,
            treatment=self.df_train['treatment_group_key'].values,
            y=self.df_train[self.y_name].values
        )
        # Attach the feature names to the learner for later use.
        self.learner_t.feature_names = self.X_names
        return self.learner_t_tau

    def compute_feature_importance(self):
        """Return ``learner_t.get_importance`` (normalised, method 'auto').

        Raises:
            ValueError: if no learner has been successfully fitted yet.
        """
        if self.learner_t is None or self.learner_t_tau is None:
            raise ValueError("Model must be fitted before computing feature importances.")

        return self.learner_t.get_importance(
            X=self.df_train[self.X_names],
            tau=self.learner_t_tau,
            features=self.X_names,
            normalize=True,
            method='auto'
        )
models_utils/models_simulation.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pickle
3
+
4
class CATESimulationReady:
    """Serve pre-computed CATE predictions and feature importances from disk."""

    def __init__(self, model_path, y_pred_path):
        # model_path is stored but not read by any method of this class.
        self.model_path = model_path
        self.y_pred_path = y_pred_path

    def predict(self):
        """Return the object unpickled from ``y_pred_path``.

        NOTE(review): ``pickle.load`` can execute arbitrary code - only point
        this at files produced by this project.
        """
        with open(self.y_pred_path, 'rb') as handle:
            return pickle.load(handle)

    def feature_importance(self, fi_path):
        """Read the CSV at ``fi_path`` and label its two columns feature/score."""
        table = pd.read_csv(fi_path)
        table.columns = ['feature', 'score']
        return table
notebooks/Demo_Notebook.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/Test.ipynb ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 11,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import matplotlib.pyplot as plt\n",
11
+ "\n",
12
+ "class CATEEvaluateSimulationReady:\n",
13
+ "\n",
14
+ " def __init__(self,data_path):\n",
15
+ " self.data_path = data_path\n",
16
+ "\n",
17
+ " def evaluate(self, discount_group):\n",
18
+ " if discount_group == 5:\n",
19
+ " qini_05_conversion_test = pd.read_csv(self.data_path + 'qini_05_conversion_test.csv').drop(columns='Unnamed: 0')\n",
20
+ " qini_05_benefit_test = pd.read_csv(self.data_path + 'qini_05_benefit_test.csv').drop(columns='Unnamed: 0')\n",
21
+ " return qini_05_conversion_test, qini_05_benefit_test\n",
22
+ " elif discount_group == 10:\n",
23
+ " qini_10_conversion_test = pd.read_csv(self.data_path + 'qini_10_conversion_test.csv').drop(columns='Unnamed: 0')\n",
24
+ " qini_10_benefit_test = pd.read_csv(self.data_path + 'qini_10_benefit_test.csv').drop(columns='Unnamed: 0')\n",
25
+ " return qini_10_conversion_test, qini_10_benefit_test\n",
26
+ " elif discount_group == 15:\n",
27
+ " qini_15_conversion_test = pd.read_csv(self.data_path + 'qini_15_conversion_test.csv').drop(columns='Unnamed: 0')\n",
28
+ " qini_15_benefit_test = pd.read_csv(self.data_path + 'qini_15_benefit_test.csv').drop(columns='Unnamed: 0')\n",
29
+ " return qini_15_conversion_test, qini_15_benefit_test"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 7,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "eval = CATEEvaluateSimulationReady('../data/effect_data/')\n",
39
+ "qini_05_conversion_test, qini_05_benefit_test = eval.evaluate(5)\n",
40
+ "qini_10_conversion_test, qini_10_benefit_test = eval.evaluate(10)\n",
41
+ "qini_15_conversion_test, qini_15_benefit_test = eval.evaluate(15)"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 23,
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "data": {
51
+ "text/plain": [
52
+ "Text(0.5, 1.0, 'CATE conversion vs Targeted Population')"
53
+ ]
54
+ },
55
+ "execution_count": 23,
56
+ "metadata": {},
57
+ "output_type": "execute_result"
58
+ },
59
+ {
60
+ "data": {
61
+ "image/png": "",
62
+ "text/plain": [
63
+ "<Figure size 432x288 with 1 Axes>"
64
+ ]
65
+ },
66
+ "metadata": {
67
+ "needs_background": "light"
68
+ },
69
+ "output_type": "display_data"
70
+ }
71
+ ],
72
+ "source": [
73
+ "fig_conversion, ax_conversion = plt.subplots()\n",
74
+ "\n",
75
+ "qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')\n",
76
+ "qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')\n",
77
+ "qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')\n",
78
+ "qini_05_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='b', label = '5% model')\n",
79
+ "qini_10_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='g', label = '10% model')\n",
80
+ "qini_15_conversion_test.plot(ax=ax_conversion, x='index', y='S', color='y', label = '15% model')\n",
81
+ "\n",
82
+ "ax_conversion.legend()\n",
83
+ "ax_conversion.set_xlabel('Fraction of Targeted Users')\n",
84
+ "ax_conversion.set_ylabel('CATE conversion')\n",
85
+ "ax_conversion.set_title('CATE conversion vs Targeted Population')"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": 26,
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "data": {
95
+ "text/plain": [
96
+ "Text(0.5, 1.0, 'CATE benefit vs Targeted Population')"
97
+ ]
98
+ },
99
+ "execution_count": 26,
100
+ "metadata": {},
101
+ "output_type": "execute_result"
102
+ },
103
+ {
104
+ "data": {
105
+ "image/png": "",
106
+ "text/plain": [
107
+ "<Figure size 432x288 with 1 Axes>"
108
+ ]
109
+ },
110
+ "metadata": {
111
+ "needs_background": "light"
112
+ },
113
+ "output_type": "display_data"
114
+ }
115
+ ],
116
+ "source": [
117
+ "fig_benefit, ax_benefit = plt.subplots()\n",
118
+ "qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='b', ls='--', lw=0.5, label = '5% random')\n",
119
+ "qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='g', ls='--', lw=0.5, label = '10% random')\n",
120
+ "qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='Random', color='y', ls='--', lw=0.5, label = '15% random')\n",
121
+ "qini_05_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='b', label = '5% model')\n",
122
+ "qini_10_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='g', label = '10% model')\n",
123
+ "qini_15_benefit_test.plot(ax=ax_benefit, x='index', y='S', color='y', label = '15% model')\n",
124
+ "ax_benefit.legend()\n",
125
+ "ax_benefit.set_xlabel('Fraction of Targeted Users')\n",
126
+ "ax_benefit.set_ylabel('CATE Benefit')\n",
127
+ "ax_benefit.set_title('CATE benefit vs Targeted Population')"
128
+ ]
129
+ }
130
+ ],
131
+ "metadata": {
132
+ "kernelspec": {
133
+ "display_name": "Python 3",
134
+ "language": "python",
135
+ "name": "python3"
136
+ },
137
+ "language_info": {
138
+ "codemirror_mode": {
139
+ "name": "ipython",
140
+ "version": 3
141
+ },
142
+ "file_extension": ".py",
143
+ "mimetype": "text/x-python",
144
+ "name": "python",
145
+ "nbconvert_exporter": "python",
146
+ "pygments_lexer": "ipython3",
147
+ "version": "3.9.6"
148
+ }
149
+ },
150
+ "nbformat": 4,
151
+ "nbformat_minor": 2
152
+ }
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ causalml==0.15.0
2
+ matplotlib==3.8.3
3
+ numpy==1.23.5
4
+ pandas==2.2.1
5
+ scikit_learn==1.4.1.post1
6
+ streamlit==1.32.2
7
+ xgboost==2.0.3