# MediaMixOptimization/pages/7_AI_Model_Media_Performance.py
import streamlit as st
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle
from datetime import timedelta
from utilities import (
set_header,
load_local_css,
update_db,
project_selection,
retrieve_pkl_object,
)
from utilities_with_panel import (
overview_test_data_prep_panel,
overview_test_data_prep_nonpanel,
initialize_data_cmp,
create_channel_summary,
create_contribution_pie,
create_contribuion_stacked_plot,
create_channel_spends_sales_plot,
format_numbers,
channel_name_formating,
)
from log_application import log_message
import sys, traceback
from post_gres_cred import db_cred
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()
schema = db_cred["schema"]
if "username" not in st.session_state:
st.session_state["username"] = None
if "project_name" not in st.session_state:
st.session_state["project_name"] = None
if "project_dct" not in st.session_state:
project_selection()
st.stop()
tuned_model = retrieve_pkl_object(
st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
)
if tuned_model is None:
st.error("Please save a tuned model")
st.stop()
if (
"session_state_saved" in st.session_state["project_dct"]["model_tuning"].keys()
and st.session_state["project_dct"]["model_tuning"]["session_state_saved"] != []
):
for key in ["used_response_metrics", "media_data", "bin_dict"]:
if key not in st.session_state:
st.session_state[key] = st.session_state["project_dct"]["model_tuning"][
"session_state_saved"
][key]
## DEFINE ALL FUNCTIONS
def get_random_effects(media_data, panel_col, mdf):
    """Extract the fitted random intercept of every panel from a MixedLM result."""
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
    for i, market in enumerate(media_data[panel_col].unique()):
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, "random_effect"] = intercept
        random_eff_df.loc[i, panel_col] = market
    return random_eff_df
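

# Illustrative sketch (assuming a statsmodels MixedLM result with one random
# intercept per panel): get_random_effects returns one row per panel, e.g.
#   panel     random_effect
#   north      0.12
#   south     -0.08
# (values illustrative). These offsets are added back onto the fixed-effect
# prediction in mdf_predict below.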
def process_train_and_test(train, test, features, panel_col, target_col):
    """Min-max scale the feature columns, fitting the scaler on train only."""
    X1 = train[features]
    ss = MinMaxScaler()
    X1 = pd.DataFrame(ss.fit_transform(X1), columns=X1.columns)
    X1[panel_col] = train[panel_col]
    X1[target_col] = train[target_col]
    if test is not None:
        X2 = test[features]
        X2 = pd.DataFrame(ss.transform(X2), columns=X2.columns)
        X2[panel_col] = test[panel_col]
        X2[target_col] = test[target_col]
        return X1, X2
    return X1
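

# Note: the scaler is fit on train only and merely applied to test, so test rows
# are scaled with train-set minima/maxima and no information leaks across the split.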
def mdf_predict(X_df, mdf, random_eff_df):
    """Predict with a MixedLM: fixed-effect prediction plus the panel's random intercept.

    Relies on the module-level `panel_col` assigned further down.
    """
    X = X_df.copy()
    X = pd.merge(
        X,
        random_eff_df[[panel_col, "random_effect"]],
        on=panel_col,
        how="left",
    )
    X["pred_fixed_effect"] = mdf.predict(X)
    X["pred"] = X["pred_fixed_effect"] + X["random_effect"]
    X.drop(columns=["pred_fixed_effect", "random_effect"], inplace=True)
    return X
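

# Minimal usage sketch (variable names are illustrative, not part of this app):
#   random_eff_df = get_random_effects(media_data, panel_col, model)
#   scored = mdf_predict(X_holdout, model, random_eff_df)
# For every row: scored["pred"] = fixed-effect prediction + that panel's random intercept.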
try:
if "username" in st.session_state and st.session_state["username"] is not None:
tuned_model = retrieve_pkl_object(
st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
)
if tuned_model is None:
st.error("Please save a tuned model")
st.stop()
if (
"session_state_saved"
in st.session_state["project_dct"]["model_tuning"].keys()
and st.session_state["project_dct"]["model_tuning"]["session_state_saved"]
!= []
):
for key in [
"used_response_metrics",
"is_tuned_model",
"media_data",
"X_test_spends",
"spends_data",
]:
st.session_state[key] = st.session_state["project_dct"]["model_tuning"][
"session_state_saved"
][key]
elif (
"session_state_saved"
in st.session_state["project_dct"]["model_build"].keys()
and st.session_state["project_dct"]["model_build"]["session_state_saved"]
!= []
):
for key in [
"used_response_metrics",
"date",
"saved_model_names",
"media_data",
"X_test_spends",
]:
st.session_state[key] = st.session_state["project_dct"]["model_build"][
"session_state_saved"
][key]
        else:
            st.error("Please tune a model first")
            st.stop()
st.session_state["bin_dict"] = st.session_state["project_dct"]["model_build"][
"session_state_saved"
]["bin_dict"]
st.session_state["media_data"].columns = [
c.lower() for c in st.session_state["media_data"].columns
]
st.session_state["orig_media_data"] = st.session_state["project_dct"][
"data_import"
][
"imputed_tool_df"
].copy() # db
st.session_state["channels"] = st.session_state["project_dct"]["data_import"][
"group_dict"
].copy()
        # set the panel column
panel_col = "panel"
        is_panel = st.session_state["media_data"][panel_col].nunique() > 1
date_col = "date"
cols1 = st.columns([2, 1])
with cols1[0]:
st.markdown(f"**Welcome {st.session_state['username']}**")
with cols1[1]:
st.markdown(f"**Current Project: {st.session_state['project_name']}**")
st.title("AI Model Media Performance")
def remove_response_metric(name):
# Convert the name to a lowercase string and remove any leading or trailing spaces
name_str = str(name).lower().strip()
# Check if the name starts with "response metric" or "response_metric"
if name_str.startswith("response metric"):
return name[len("response metric") :].replace("_", " ").strip().title()
elif name_str.startswith("response_metric"):
return name[len("response_metric") :].replace("_", " ").strip().title()
else:
return name
sel_target_col = st.selectbox(
"Select the response metric",
st.session_state["used_response_metrics"],
format_func=remove_response_metric,
)
sel_target_col_frmttd = sel_target_col.replace("_", " ").replace("-", " ")
sel_target_col_frmttd = sel_target_col_frmttd.title()
target_col = (
sel_target_col.lower()
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
)
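        # e.g. a selection such as "Response Metric Revenue" becomes the column
        # name "response_metric_revenue" (lower-cased, spaces to underscores,
        # "-" and ":" dropped, "__" collapsed); the example value is illustrative.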
target = sel_target_col
# Contribution
if is_panel:
            # read the tuned MixedLM model for this response metric
            if st.session_state["is_tuned_model"][target_col]:
model_dict = retrieve_pkl_object(
st.session_state["project_number"],
"Model_Tuning",
"tuned_model",
schema,
) # db
saved_models = list(model_dict.keys())
required_saved_models = [
m.split("__")[0]
for m in saved_models
if m.split("__")[1] == target_col
]
                sel_model = required_saved_models[0]  # only 1 tuned model per response metric
sel_model_dict = model_dict[sel_model + "__" + target_col]
model = sel_model_dict["Model_object"]
X_train = sel_model_dict["X_train_tuned"]
X_test = sel_model_dict["X_test_tuned"]
best_feature_set = sel_model_dict["feature_set"]
# Calculate contributions
st.session_state["orig_media_data"].columns = [
col.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for col in st.session_state["orig_media_data"].columns
]
media_data = st.session_state["media_data"]
contri_df = pd.DataFrame()
y = []
y_pred = []
random_eff_df = get_random_effects(media_data, panel_col, model)
random_eff_df["fixed_effect"] = model.fe_params["Intercept"]
random_eff_df["panel_effect"] = (
random_eff_df["random_effect"] + random_eff_df["fixed_effect"]
)
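                # panel_effect = global fixed intercept + that panel's random
                # intercept; it is folded into "base_contr" further below.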
coef_df = pd.DataFrame(model.fe_params)
coef_df.reset_index(inplace=True)
coef_df.columns = ["feature", "coef"]
x_train_contribution = X_train.copy()
x_test_contribution = X_test.copy()
                # preprocessing not needed since X_train is already preprocessed
x_train_contribution = mdf_predict(
x_train_contribution, model, random_eff_df
)
x_test_contribution = mdf_predict(x_test_contribution, model, random_eff_df)
x_train_contribution = pd.merge(
x_train_contribution,
random_eff_df[[panel_col, "panel_effect"]],
on=panel_col,
how="left",
)
x_test_contribution = pd.merge(
x_test_contribution,
random_eff_df[[panel_col, "panel_effect"]],
on=panel_col,
how="left",
)
                for i in range(1, len(coef_df)):
                    coef = coef_df.loc[i, "coef"]
                    col = coef_df.loc[i, "feature"]
                    if col.lower() != "intercept":
                        x_train_contribution[str(col) + "_contr"] = (
                            coef * x_train_contribution[col]
                        )
                        x_test_contribution[str(col) + "_contr"] = (
                            coef * x_test_contribution[col]
                        )
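
                # Each "<feature>_contr" column is coefficient x (scaled) feature
                # value; e.g. a coefficient of 0.5 in a week where a channel's
                # scaled spend is 0.8 contributes 0.4 to that row's prediction
                # (numbers illustrative).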
tuning_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if c
in [
"day_of_week_contr",
"Trend_contr",
"sine_wave_contr",
"cosine_wave_contr",
]
]
flag_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if "_flag" in c
]
# add exogenous contribution to base
all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
all_exog_vars = [
var.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for var in all_exog_vars
]
                exog_cols = []
                if len(all_exog_vars) > 0:
                    for col in x_train_contribution.filter(regex="contr").columns:
                        if any(exog_var in col for exog_var in all_exog_vars):
                            exog_cols.append(col)
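                # Everything that is not a media channel (panel baseline, flag,
                # trend/seasonality and exogenous terms) is rolled up into a
                # single "base_contr" column below.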
base_cols = ["panel_effect"] + flag_cols + tuning_cols + exog_cols
x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
axis=1
)
x_train_contribution.drop(columns=base_cols, inplace=True)
x_test_contribution["base_contr"] = x_test_contribution[base_cols].sum(
axis=1
)
x_test_contribution.drop(columns=base_cols, inplace=True)
overall_contributions = pd.concat(
[x_train_contribution, x_test_contribution]
).reset_index(drop=True)
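                # Train and test contributions are stitched back together so the
                # overview covers the full modeling window.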
overview_test_data_prep_panel(
overall_contributions,
st.session_state["orig_media_data"],
st.session_state["spends_data"],
date_col,
panel_col,
target_col,
)
else: # NON PANEL
if st.session_state["is_tuned_model"][target_col] == True: # Sprint4
model_dict = retrieve_pkl_object(
st.session_state["project_number"],
"Model_Tuning",
"tuned_model",
schema,
) # db
saved_models = list(model_dict.keys())
required_saved_models = [
m.split("__")[0]
for m in saved_models
if m.split("__")[1] == target_col
]
                sel_model = required_saved_models[0]  # only 1 tuned model per response metric
sel_model_dict = model_dict[sel_model + "__" + target_col]
model = sel_model_dict["Model_object"]
X_train = sel_model_dict["X_train_tuned"]
X_test = sel_model_dict["X_test_tuned"]
best_feature_set = sel_model_dict["feature_set"]
x_train_contribution = X_train.copy()
x_test_contribution = X_test.copy()
x_train_contribution["pred"] = model.predict(
x_train_contribution[best_feature_set]
)
x_test_contribution["pred"] = model.predict(
x_test_contribution[best_feature_set]
)
coef_df = pd.DataFrame(model.params)
coef_df.reset_index(inplace=True)
coef_df.columns = ["feature", "coef"]
for i in range(len(coef_df)):
coef = coef_df.loc[i, "coef"]
col = coef_df.loc[i, "feature"]
if col != "const":
x_train_contribution[str(col) + "_contr"] = (
coef * x_train_contribution[col]
)
x_test_contribution[str(col) + "_contr"] = (
coef * x_test_contribution[col]
)
else:
x_train_contribution["const"] = coef
x_test_contribution["const"] = coef
tuning_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if c
in [
"day_of_week_contr",
"Trend_contr",
"sine_wave_contr",
"cosine_wave_contr",
]
]
flag_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if "_flag" in c
]
# add exogenous contribution to base
all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
all_exog_vars = [
var.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for var in all_exog_vars
]
                exog_cols = []
                if len(all_exog_vars) > 0:
                    for col in x_train_contribution.filter(regex="contr").columns:
                        if any(exog_var in col for exog_var in all_exog_vars):
                            exog_cols.append(col)
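                # As in the panel branch, all non-media terms are rolled up into
                # a single "base_contr" column below.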
base_cols = ["const"] + flag_cols + tuning_cols + exog_cols
x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
axis=1
)
x_train_contribution.drop(columns=base_cols, inplace=True)
x_test_contribution["base_contr"] = x_test_contribution[base_cols].sum(
axis=1
)
x_test_contribution.drop(columns=base_cols, inplace=True)
overall_contributions = pd.concat(
[x_train_contribution, x_test_contribution]
).reset_index(drop=True)
overview_test_data_prep_nonpanel(
overall_contributions,
st.session_state["orig_media_data"].copy(),
st.session_state["spends_data"].copy(),
date_col,
target_col,
)
min_date = X_train[date_col].min().date()
max_date = X_test[date_col].max().date()
if "media_performance" not in st.session_state["project_dct"]:
st.session_state["project_dct"]["media_performance"] = {
"start_date": None,
"end_date": None,
}
        start_default = st.session_state["project_dct"]["media_performance"].get(
            "start_date", None
        )
        # fall back to the data's min date if unset or outside the data range
        if (
            start_default is None
            or start_default <= min_date
            or start_default >= max_date
        ):
            start_default = min_date
        end_default = st.session_state["project_dct"]["media_performance"].get(
            "end_date", None
        )
        # fall back to the data's max date if unset or outside the data range
        if (
            end_default is None
            or end_default <= min_date
            or end_default >= max_date
        ):
            end_default = max_date
st.write("Select a timeline for analysis")
date_columns = st.columns(2)
with date_columns[0]:
start_date = st.date_input(
"Select Start Date",
start_default,
min_value=min_date,
max_value=max_date,
)
if (start_date < min_date) or (start_date > max_date):
st.error("Please select dates in the range of the dates in the data")
st.stop()
end_default = (
end_default if end_default > start_date + timedelta(days=1) else max_date
)
with date_columns[1]:
end_default = (
end_default
if pd.Timestamp(end_default) >= pd.Timestamp(start_date)
else start_date
)
end_date = st.date_input(
"Select End Date",
end_default,
min_value=start_date + timedelta(days=1),
max_value=max_date,
)
if (
(start_date < min_date)
or (end_date < min_date)
or (start_date > max_date)
or (end_date > max_date)
):
st.error("Please select dates in the range of the dates in the data")
st.stop()
if end_date < start_date + timedelta(days=1):
st.error("Please select end date after start date")
st.stop()
st.session_state["project_dct"]["media_performance"]["start_date"] = start_date
st.session_state["project_dct"]["media_performance"]["end_date"] = end_date
st.header("Overview of Previous Media Spend")
initialize_data_cmp(target_col, is_panel, panel_col, start_date, end_date)
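        # initialize_data_cmp populates st.session_state["scenario"] and
        # st.session_state["raw_df"] for the selected window; both are read below.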
scenario = st.session_state["scenario"]
raw_df = st.session_state["raw_df"]
columns = st.columns(2)
with columns[0]:
st.metric(
label="Media Spend",
value=format_numbers(float(scenario.actual_total_spends)),
)
with columns[1]:
st.metric(
label=sel_target_col_frmttd,
value=format_numbers(
float(scenario.actual_total_sales), include_indicator=False
),
)
actual_summary_df = create_channel_summary(scenario, sel_target_col_frmttd)
actual_summary_df["Channel"] = actual_summary_df["Channel"].apply(
channel_name_formating
)
columns = st.columns((3, 1))
with columns[0]:
with st.expander("Channel wise overview"):
st.markdown(
actual_summary_df.style.set_table_styles(
[
{
"selector": "th",
"props": [("background-color", "#f6dcc7")],
},
{
"selector": "tr:nth-child(even)",
"props": [("background-color", "#f6dcc7")],
},
]
).to_html(),
unsafe_allow_html=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
##############################
st.plotly_chart(
create_contribution_pie(scenario, sel_target_col_frmttd),
use_container_width=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
        ################################
st.plotly_chart(
create_contribuion_stacked_plot(scenario, sel_target_col_frmttd),
use_container_width=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
#######################################
selected_channel_name = st.selectbox(
"Channel",
st.session_state["channels_list"] + ["non media"],
format_func=channel_name_formating,
)
selected_channel = scenario.channels.get(selected_channel_name, None)
st.plotly_chart(
create_channel_spends_sales_plot(selected_channel, sel_target_col_frmttd),
use_container_width=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
if st.button("Save this session", use_container_width=True):
project_dct_pkl = pickle.dumps(st.session_state["project_dct"])
update_db(
st.session_state["project_number"],
"Current_Media_Performance",
"project_dct",
project_dct_pkl,
schema,
# resp_mtrc=None,
) # db
st.success("Session Saved!")
# Remove "response_metric_" from the start and "_total" from the end
if str(target_col).startswith("response_metric_"):
target_col = target_col.replace("response_metric_", "", 1)
# Remove the last 6 characters (length of "_total")
if str(target_col).endswith("_total"):
target_col = target_col[:-6]
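        # e.g. "response_metric_revenue_total" -> "revenue" (name illustrative),
        # matching the keys under project_dct["current_media_performance"]["model_outputs"].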
if (
st.session_state["project_dct"]["current_media_performance"][
"model_outputs"
][target_col]
is not None
):
if (
len(
st.session_state["project_dct"]["current_media_performance"][
"model_outputs"
][target_col]["contribution_data"]
)
> 0
):
st.download_button(
label="Download Contribution File",
data=st.session_state["project_dct"]["current_media_performance"][
"model_outputs"
][target_col]["contribution_data"].to_csv(),
file_name="contributions.csv",
key="dwnld_contr",
)
except Exception:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    error_message = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    log_message("error", f"Error: {error_message}", "Current Media Performance")
    st.warning("An error occurred, please try again", icon="⚠️")