File size: 7,317 Bytes
6f4f21f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
from functools import reduce
from random import randint

from causalml.dataset import make_uplift_classification

class UpliftSimulation:
    """Simulate a multi-treatment discount dataset and reshape it for analysis.

    The methods operate on ``self.df`` and are meant to be called in this
    order, because each one consumes columns produced by the previous one:

    1. ``simulate_dataset``           - fills ``self.df`` and ``self.X_names``.
    2. ``apply_discounts_and_clean``  - adds the ``discount`` column.
    3. ``postprocess_tables``         - adds ``UserID`` and renames features.
    4. ``add_monetary_effect``        - adds price / benefit columns.

    Attributes:
        df: Working DataFrame; ``None`` until ``simulate_dataset`` has run.
        X_names: Feature names returned by ``make_uplift_classification``.
        dataframes: The four per-topic tables built by ``postprocess_tables``
            (only set once that method has run).
    """

    # Default configuration. Kept at class level and copied per instance so
    # that instances never share a mutable default object.
    _DEFAULT_TREATMENT_GROUP_KEYS = ('control', 'discount_05', 'discount_10', 'discount_15')
    _DEFAULT_N_UPLIFT_INCREASE = {'discount_05': 4, 'discount_10': 3, 'discount_15': 3}
    _DEFAULT_N_UPLIFT_DECREASE = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
    _DEFAULT_DELTA_UPLIFT_INCREASE = {'discount_05': 0.0020, 'discount_10': 0.0045, 'discount_15': 0.008}
    _DEFAULT_DELTA_UPLIFT_DECREASE = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
    _DEFAULT_N_UPLIFT_INCREASE_MIX = {'discount_05': 3, 'discount_10': 2, 'discount_15': 3}
    _DEFAULT_N_UPLIFT_DECREASE_MIX = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}

    # Treatment label -> discount rate applied to the base price.
    _DISCOUNT_BY_TREATMENT = {'control': 0, 'discount_05': 0.05, 'discount_10': 0.10, 'discount_15': 0.15}

    # Mixed uplift features appended to the uplift table, in the exact order
    # in which they are paired with the friendly names below.
    # NOTE(review): these names presumably correspond to the feature numbering
    # produced by the default constructor arguments - confirm before changing
    # any of the feature counts.
    _MIXED_UPLIFT_COLUMNS = [
        'x31_increase_mix', 'x22_increase_mix', 'x20_increase_mix',
        'x33_increase_mix', 'x32_increase_mix', 'x27_increase_mix',
        'x21_increase_mix', 'x26_increase_mix',
    ]

    # Friendly column names, assigned positionally to each table.
    _USER_PROFILE_NAMES = [
        'UserID', 'AgeIndex', 'IncomeIndex', 'PurchaseFrequencyIndex',
        'AccountLifetimeIndex', 'AverageTransactionValueIndex', 'PreferredPaymentMethodIndex', 'RegionIndex',
    ]
    _UPLIFT_NAMES = [
        'UserID', 'EmailDiscountCTRIndex', 'WebDiscountCTRIndex', 'SocialMediaEngagementIndex',
        'DirectMailDiscountResponseIndex', 'InAppDiscountEngagementIndex', 'FlashSaleParticipationIndex',
        'SeasonalPromoInterestIndex', 'LoyaltyProgramEngagementIndex', 'ReferralBonusUsageIndex',
        'DiscountCodeRedemptionIndex', 'VIPSaleAccessIndex', 'EarlyAccessOptInIndex',
        'ProductReviewAfterDiscountIndex', 'UpsellConversionIndex', 'CrossSellInterestIndex',
        'BundlePurchaseIndex', 'SubscriptionUpgradeIndex', 'CustomerFeedbackIndex',
    ]
    _IRRELEVANT_NAMES = [
        'UserID', 'BrowserTypeIndex', 'DeviceCategoryIndex', 'OperatingSystemIndex',
        'SessionStartTimeIndex', 'LanguagePreferenceIndex', 'NewsletterSubscriptionIndex',
        'AccountVerificationStatusIndex', 'AdBlockerPresenceIndex',
    ]

    def __init__(self, n=50000, y_name='conversion',
                 treatment_group_keys=None,
                 n_classification_features=15, n_classification_informative=7,
                 n_classification_repeated=0,
                 n_uplift_increase_dict=None,
                 n_uplift_decrease_dict=None,
                 delta_uplift_increase_dict=None,
                 delta_uplift_decrease_dict=None,
                 n_uplift_increase_mix_informative_dict=None,
                 n_uplift_decrease_mix_informative_dict=None,
                 positive_class_proportion=0.05, random_seed=8097):
        """Store the simulation configuration; no data is generated here.

        Every argument is forwarded unchanged to
        ``make_uplift_classification`` by ``simulate_dataset`` (``n`` is
        passed as ``n_samples`` and ``treatment_group_keys`` as
        ``treatment_name``). Passing ``None`` for the list/dict arguments
        selects the built-in default, copied freshly for this instance.
        """
        self.n = n
        self.y_name = y_name
        self.treatment_group_keys = (
            list(self._DEFAULT_TREATMENT_GROUP_KEYS)
            if treatment_group_keys is None else treatment_group_keys
        )
        self.n_classification_features = n_classification_features
        self.n_classification_informative = n_classification_informative
        self.n_classification_repeated = n_classification_repeated
        self.n_uplift_increase_dict = self._dict_or_default(
            n_uplift_increase_dict, self._DEFAULT_N_UPLIFT_INCREASE)
        self.n_uplift_decrease_dict = self._dict_or_default(
            n_uplift_decrease_dict, self._DEFAULT_N_UPLIFT_DECREASE)
        self.delta_uplift_increase_dict = self._dict_or_default(
            delta_uplift_increase_dict, self._DEFAULT_DELTA_UPLIFT_INCREASE)
        self.delta_uplift_decrease_dict = self._dict_or_default(
            delta_uplift_decrease_dict, self._DEFAULT_DELTA_UPLIFT_DECREASE)
        self.n_uplift_increase_mix_informative_dict = self._dict_or_default(
            n_uplift_increase_mix_informative_dict, self._DEFAULT_N_UPLIFT_INCREASE_MIX)
        self.n_uplift_decrease_mix_informative_dict = self._dict_or_default(
            n_uplift_decrease_mix_informative_dict, self._DEFAULT_N_UPLIFT_DECREASE_MIX)
        self.positive_class_proportion = positive_class_proportion
        self.random_seed = random_seed
        self.df = None
        self.X_names = None

    @staticmethod
    def _dict_or_default(value, default):
        """Return ``value`` as given, or a fresh copy of ``default`` if it is None."""
        return dict(default) if value is None else value

    def simulate_dataset(self):
        """Generate the dataset and store it in ``self.df`` / ``self.X_names``."""
        self.df, self.X_names = make_uplift_classification(
            treatment_name=self.treatment_group_keys,
            y_name=self.y_name,
            n_samples=self.n,
            n_classification_features=self.n_classification_features,
            n_classification_informative=self.n_classification_informative,
            n_classification_repeated=self.n_classification_repeated,
            n_uplift_increase_dict=self.n_uplift_increase_dict,
            n_uplift_decrease_dict=self.n_uplift_decrease_dict,
            delta_uplift_increase_dict=self.delta_uplift_increase_dict,
            delta_uplift_decrease_dict=self.delta_uplift_decrease_dict,
            n_uplift_increase_mix_informative_dict=self.n_uplift_increase_mix_informative_dict,
            n_uplift_decrease_mix_informative_dict=self.n_uplift_decrease_mix_informative_dict,
            positive_class_proportion=self.positive_class_proportion,
            random_seed=self.random_seed,
        )

    def apply_discounts_and_clean(self):
        """Add a numeric ``discount`` column and drop ``treatment_effect``.

        ``discount`` is derived from ``treatment_group_key`` through the
        class-level treatment -> rate table. Labels missing from that table
        are left unchanged in the new column.
        """
        rates = self._DISCOUNT_BY_TREATMENT
        frame = self.df
        # Map label by label instead of DataFrame.replace: replacing strings
        # with numbers relies on pandas' deprecated silent downcasting,
        # whereas map infers a numeric dtype directly from the results.
        frame['discount'] = frame['treatment_group_key'].map(
            lambda key: rates.get(key, key)
        )
        # errors='ignore': nothing to clean up if the column is already gone.
        self.df = frame.drop(columns=['treatment_effect'], errors='ignore')

    def postprocess_tables(self):
        """Split ``self.df`` into themed tables, rename columns, and re-merge.

        Adds a synthetic ``UserID``, groups feature columns by the substring
        found in their names (``informative`` / ``uplift`` / ``irrelevant``),
        gives each group friendly positional names, and merges the groups
        back on ``UserID``. The individual tables are kept in
        ``self.dataframes``; columns that belong to no group are not carried
        over into the merged ``self.df``.

        Raises:
            KeyError: if an expected column (mixed uplift features,
                ``treatment_group_key``, the outcome column, ``discount``)
                is missing.
            ValueError: if a group does not contain exactly as many columns
                as there are friendly names for it.
        """
        # Add a synthetic UserID for each entry
        self.df['UserID'] = range(len(self.df))

        all_columns = list(self.df.columns)
        informative_cols = [col for col in all_columns if 'informative' in col]
        uplift_cols = [col for col in all_columns if 'uplift' in col]
        irrelevant_cols = [col for col in all_columns if 'irrelevant' in col]
        # Use the configured outcome name rather than a hard-coded
        # 'conversion', so a custom ``y_name`` keeps working.
        transaction_cols = ['treatment_group_key', self.y_name, 'discount']

        # User demographics / profile table (informative features).
        # NOTE: no separate web-interaction table is built; if some
        # informative features relate to web interaction they stay here.
        user_profiles = self.df[['UserID'] + informative_cols].copy()

        # Uplift table: pure uplift features followed by the mixed features.
        uplift_data = pd.concat(
            [self.df[['UserID'] + uplift_cols].copy(), self.df[self._MIXED_UPLIFT_COLUMNS]],
            axis=1,
        )

        irrelevant_data = self.df[['UserID'] + irrelevant_cols].copy()
        transaction_data = self.df[['UserID'] + transaction_cols].copy()

        # Positional renames; a length mismatch raises ValueError.
        user_profiles.columns = list(self._USER_PROFILE_NAMES)
        uplift_data.columns = list(self._UPLIFT_NAMES)
        irrelevant_data.columns = list(self._IRRELEVANT_NAMES)
        # Transaction columns keep their original names.
        transaction_data.columns = ['UserID'] + transaction_cols

        self.dataframes = [user_profiles, uplift_data, irrelevant_data, transaction_data]

        # Merge all tables back into one frame keyed on 'UserID'.
        self.df = reduce(lambda left, right: pd.merge(left, right, on='UserID'), self.dataframes)

    def add_monetary_effect(self):
        """Add ``base_price``, ``discounted_price`` and ``benefit`` columns.

        ``base_price`` is 0 for rows whose outcome equals 0 and a random
        integer in [1, 100] otherwise, drawn with the module-level
        ``random.randint`` once per such row, in row order.
        ``discounted_price`` applies the row's ``discount`` rate, and
        ``benefit`` is the discounted price minus 0.8 times the base price.
        """
        outcomes = self.df[self.y_name]
        # One pass over the outcome column; avoids a row-wise DataFrame.apply
        # while making the same draws in the same order.
        self.df['base_price'] = [
            0 if outcome == 0 else randint(1, 100) for outcome in outcomes
        ]
        self.df['discounted_price'] = self.df['base_price'] * (1 - self.df['discount'])
        # NOTE(review): 0.8 presumably models cost as 80% of the base price - confirm.
        self.df['benefit'] = self.df['discounted_price'] - 0.8 * self.df['base_price']