Spaces:

neuronslabs
/

uplift_modeling

Sleeping

App Files Files Community

uplift_modeling / data_utils /data_generation.py

howardroark

initial commit

6f4f21f over 1 year ago

raw

history blame contribute delete

7.32 kB

	import pandas as pd
	from functools import reduce
	from random import randint

	from causalml.dataset import make_uplift_classification

	class UpliftSimulation:
	    """Build a synthetic multi-treatment uplift dataset and post-process it.

	    Intended call order (each step works on ``self.df`` in place):

	    1. ``simulate_dataset``          - generate the raw frame via causalml.
	    2. ``apply_discounts_and_clean`` - add a numeric ``discount`` column.
	    3. ``postprocess_tables``        - give the ``x<N>_*`` feature columns
	       business-style names.
	    4. ``add_monetary_effect``       - add price and benefit columns.

	    Attributes:
	        df: The working ``pandas.DataFrame`` (``None`` until simulated or
	            assigned).
	        X_names: Feature names returned by ``make_uplift_classification``
	            (``None`` until simulated).
	        dataframes: List of the four intermediate tables; set by
	            ``postprocess_tables``.
	    """

	    # Defaults live on the class and are copied per instance, so no two
	    # instances ever share (and accidentally mutate) the same list/dict.
	    _DEFAULT_TREATMENT_GROUP_KEYS = ('control', 'discount_05', 'discount_10', 'discount_15')
	    _DEFAULT_N_UPLIFT_INCREASE = {'discount_05': 4, 'discount_10': 3, 'discount_15': 3}
	    _DEFAULT_N_UPLIFT_DECREASE = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
	    _DEFAULT_DELTA_UPLIFT_INCREASE = {'discount_05': 0.0020, 'discount_10': 0.0045, 'discount_15': 0.008}
	    _DEFAULT_DELTA_UPLIFT_DECREASE = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}
	    _DEFAULT_N_INCREASE_MIX = {'discount_05': 3, 'discount_10': 2, 'discount_15': 3}
	    _DEFAULT_N_DECREASE_MIX = {'discount_05': 0, 'discount_10': 0, 'discount_15': 0}

	    # Treatment label -> discount fraction used by apply_discounts_and_clean.
	    _DEFAULT_DISCOUNTS = {'control': 0, 'discount_05': 0.05, 'discount_10': 0.10, 'discount_15': 0.15}

	    # Mixed uplift feature columns appended (in this exact order) after the
	    # '*uplift*' columns; the order decides which friendly name each gets.
	    _MIXED_UPLIFT_COLUMNS = (
	        'x31_increase_mix', 'x22_increase_mix', 'x20_increase_mix',
	        'x33_increase_mix', 'x32_increase_mix', 'x27_increase_mix',
	        'x21_increase_mix', 'x26_increase_mix',
	    )

	    # Friendly names, assigned positionally to the selected feature columns.
	    _USER_PROFILE_NAMES = (
	        'AgeIndex', 'IncomeIndex', 'PurchaseFrequencyIndex',
	        'AccountLifetimeIndex', 'AverageTransactionValueIndex',
	        'PreferredPaymentMethodIndex', 'RegionIndex',
	    )
	    _UPLIFT_NAMES = (
	        'EmailDiscountCTRIndex', 'WebDiscountCTRIndex', 'SocialMediaEngagementIndex',
	        'DirectMailDiscountResponseIndex', 'InAppDiscountEngagementIndex', 'FlashSaleParticipationIndex',
	        'SeasonalPromoInterestIndex', 'LoyaltyProgramEngagementIndex', 'ReferralBonusUsageIndex',
	        'DiscountCodeRedemptionIndex', 'VIPSaleAccessIndex', 'EarlyAccessOptInIndex',
	        'ProductReviewAfterDiscountIndex', 'UpsellConversionIndex', 'CrossSellInterestIndex',
	        'BundlePurchaseIndex', 'SubscriptionUpgradeIndex', 'CustomerFeedbackIndex',
	    )
	    _IRRELEVANT_NAMES = (
	        'BrowserTypeIndex', 'DeviceCategoryIndex', 'OperatingSystemIndex',
	        'SessionStartTimeIndex', 'LanguagePreferenceIndex', 'NewsletterSubscriptionIndex',
	        'AccountVerificationStatusIndex', 'AdBlockerPresenceIndex',
	    )

	    def __init__(self, n=50000, y_name='conversion',
	                 treatment_group_keys=None,
	                 n_classification_features=15, n_classification_informative=7,
	                 n_classification_repeated=0,
	                 n_uplift_increase_dict=None,
	                 n_uplift_decrease_dict=None,
	                 delta_uplift_increase_dict=None,
	                 delta_uplift_decrease_dict=None,
	                 n_uplift_increase_mix_informative_dict=None,
	                 n_uplift_decrease_mix_informative_dict=None,
	                 positive_class_proportion=0.05, random_seed=8097):
	        """Store the simulation settings; no data is generated here.

	        All arguments except ``n`` (forwarded as ``n_samples``) and
	        ``treatment_group_keys`` (forwarded as ``treatment_name``) are passed
	        under the same name to ``make_uplift_classification``.

	        Any list/dict argument left as ``None`` is replaced by a fresh copy
	        of the corresponding class-level default. Objects supplied by the
	        caller are stored as given (not copied).
	        """
	        self.n = n
	        self.y_name = y_name
	        self.treatment_group_keys = (
	            list(self._DEFAULT_TREATMENT_GROUP_KEYS)
	            if treatment_group_keys is None else treatment_group_keys
	        )
	        self.n_classification_features = n_classification_features
	        self.n_classification_informative = n_classification_informative
	        self.n_classification_repeated = n_classification_repeated
	        self.n_uplift_increase_dict = self._or_default(
	            n_uplift_increase_dict, self._DEFAULT_N_UPLIFT_INCREASE)
	        self.n_uplift_decrease_dict = self._or_default(
	            n_uplift_decrease_dict, self._DEFAULT_N_UPLIFT_DECREASE)
	        self.delta_uplift_increase_dict = self._or_default(
	            delta_uplift_increase_dict, self._DEFAULT_DELTA_UPLIFT_INCREASE)
	        self.delta_uplift_decrease_dict = self._or_default(
	            delta_uplift_decrease_dict, self._DEFAULT_DELTA_UPLIFT_DECREASE)
	        self.n_uplift_increase_mix_informative_dict = self._or_default(
	            n_uplift_increase_mix_informative_dict, self._DEFAULT_N_INCREASE_MIX)
	        self.n_uplift_decrease_mix_informative_dict = self._or_default(
	            n_uplift_decrease_mix_informative_dict, self._DEFAULT_N_DECREASE_MIX)
	        self.positive_class_proportion = positive_class_proportion
	        self.random_seed = random_seed
	        self.df = None
	        self.X_names = None

	    @staticmethod
	    def _or_default(value, default):
	        """Return ``value``, or a new dict copy of ``default`` if it is None."""
	        return dict(default) if value is None else value

	    def simulate_dataset(self):
	        """Generate the raw dataset and store it in ``self.df`` / ``self.X_names``.

	        Delegates to ``causalml.dataset.make_uplift_classification`` with the
	        settings captured in ``__init__``.
	        """
	        self.df, self.X_names = make_uplift_classification(
	            treatment_name=self.treatment_group_keys,
	            y_name=self.y_name,
	            n_samples=self.n,
	            n_classification_features=self.n_classification_features,
	            n_classification_informative=self.n_classification_informative,
	            n_classification_repeated=self.n_classification_repeated,
	            n_uplift_increase_dict=self.n_uplift_increase_dict,
	            n_uplift_decrease_dict=self.n_uplift_decrease_dict,
	            delta_uplift_increase_dict=self.delta_uplift_increase_dict,
	            delta_uplift_decrease_dict=self.delta_uplift_decrease_dict,
	            n_uplift_increase_mix_informative_dict=self.n_uplift_increase_mix_informative_dict,
	            n_uplift_decrease_mix_informative_dict=self.n_uplift_decrease_mix_informative_dict,
	            positive_class_proportion=self.positive_class_proportion,
	            random_seed=self.random_seed,
	        )

	    def apply_discounts_and_clean(self, discounts_dict=None):
	        """Add a ``discount`` column and drop ``treatment_effect``.

	        Args:
	            discounts_dict: Optional mapping from ``treatment_group_key``
	                label to discount fraction. Defaults to 0 / 0.05 / 0.10 /
	                0.15 for ``control`` / ``discount_05`` / ``discount_10`` /
	                ``discount_15``.

	        Labels missing from the mapping are copied through unchanged.

	        Raises:
	            KeyError: If ``self.df`` lacks ``treatment_group_key`` or
	                ``treatment_effect``.
	        """
	        mapping = self._DEFAULT_DISCOUNTS if discounts_dict is None else discounts_dict
	        groups = self.df['treatment_group_key']
	        # An explicit element-wise lookup yields a numeric column without
	        # relying on DataFrame.replace's implicit object->float downcast.
	        self.df['discount'] = groups.map(lambda key: mapping.get(key, key))
	        self.df = self.df.drop(columns=['treatment_effect'])

	    def _named_table(self, source_columns, friendly_names, label):
	        """Copy ``UserID`` + ``source_columns`` and rename them positionally."""
	        if len(source_columns) != len(friendly_names):
	            raise ValueError(
	                f"{label}: found {len(source_columns)} source columns but "
	                f"{len(friendly_names)} friendly names are defined; "
	                "postprocess_tables only supports the default feature layout"
	            )
	        table = self.df[['UserID'] + list(source_columns)].copy()
	        table.columns = ['UserID'] + list(friendly_names)
	        return table

	    def postprocess_tables(self):
	        """Rename feature columns to business-style names and rebuild ``self.df``.

	        Adds a sequential ``UserID``, splits the frame into four tables
	        (user profiles, uplift data, irrelevant data, transactions), renames
	        the feature columns of the first three positionally, stores the
	        tables in ``self.dataframes`` and merges them back on ``UserID``.

	        Columns are picked by substring (``informative``, ``uplift``,
	        ``irrelevant``) plus a fixed list of ``x<N>_increase_mix`` names, so
	        the counts must match the predefined friendly-name lists (7, 18 and
	        8 names respectively).

	        Raises:
	            KeyError: If a fixed mixed column or a transaction column
	                (``treatment_group_key``, the ``y_name`` column,
	                ``discount``) is missing.
	            ValueError: If the number of selected columns does not match the
	                number of friendly names.
	        """
	        # Add a synthetic UserID for each entry
	        self.df['UserID'] = range(len(self.df))

	        columns = list(self.df.columns)
	        informative_cols = [col for col in columns if 'informative' in col]
	        irrelevant_cols = [col for col in columns if 'irrelevant' in col]
	        # Pure uplift columns first, then the mixed ones in their fixed order.
	        uplift_cols = [col for col in columns if 'uplift' in col]
	        uplift_cols += list(self._MIXED_UPLIFT_COLUMNS)
	        # Use the configured outcome name rather than a literal 'conversion'.
	        transaction_cols = ['treatment_group_key', self.y_name, 'discount']

	        user_profiles = self._named_table(
	            informative_cols, self._USER_PROFILE_NAMES, 'user profiles')
	        uplift_data = self._named_table(
	            uplift_cols, self._UPLIFT_NAMES, 'uplift data')
	        irrelevant_data = self._named_table(
	            irrelevant_cols, self._IRRELEVANT_NAMES, 'irrelevant data')
	        # Transaction columns keep their original names.
	        transaction_data = self.df[['UserID'] + transaction_cols].copy()

	        # List of all DataFrames to be merged
	        self.dataframes = [user_profiles, uplift_data, irrelevant_data, transaction_data]

	        # Merge all DataFrames on 'UserID'
	        self.df = reduce(lambda left, right: pd.merge(left, right, on='UserID'), self.dataframes)

	    def add_monetary_effect(self, cost_ratio=0.8):
	        """Add ``base_price``, ``discounted_price`` and ``benefit`` columns.

	        ``base_price`` is 0 where the ``y_name`` column equals 0 and otherwise
	        a random integer in [1, 100], drawn from a generator seeded with
	        ``self.random_seed`` so repeated runs give the same prices.
	        ``discounted_price`` is ``base_price * (1 - discount)`` and
	        ``benefit`` is ``discounted_price - cost_ratio * base_price``.

	        Args:
	            cost_ratio: Fraction of ``base_price`` subtracted when computing
	                ``benefit`` (presumably the cost share of the price; the
	                original code fixed it at 0.8).

	        Raises:
	            KeyError: If the ``y_name`` column or ``discount`` is missing.
	        """
	        # Local import: the module only imports `randint`, and a private
	        # generator avoids touching the global random state.
	        from random import Random

	        rng = Random(self.random_seed)
	        converted = self.df[self.y_name] != 0
	        # One pass over the outcome column instead of a row-wise DataFrame.apply.
	        self.df['base_price'] = [rng.randint(1, 100) if flag else 0 for flag in converted]
	        self.df['discounted_price'] = self.df['base_price'] * (1 - self.df['discount'])
	        self.df['benefit'] = self.df['discounted_price'] - cost_ratio * self.df['base_price']