# MediaMixOptimization/pages/7_AI_Model_Media_Performance.py
import streamlit as st
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle
from datetime import timedelta
from utilities import (
set_header,
load_local_css,
update_db,
project_selection,
retrieve_pkl_object,
)
from utilities_with_panel import (
overview_test_data_prep_panel,
overview_test_data_prep_nonpanel,
initialize_data_cmp,
create_channel_summary,
create_contribution_pie,
create_contribuion_stacked_plot,
create_channel_spends_sales_plot,
format_numbers,
channel_name_formating,
)
from log_application import log_message
import sys, traceback
from post_gres_cred import db_cred
st.set_page_config(layout="wide")
load_local_css("styles.css")
set_header()
schema = db_cred["schema"]
if "username" not in st.session_state:
st.session_state["username"] = None
if "project_name" not in st.session_state:
st.session_state["project_name"] = None
if "project_dct" not in st.session_state:
project_selection()
st.stop()
tuned_model = retrieve_pkl_object(
st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
)
if tuned_model is None:
st.error("Please save a tuned model")
st.stop()
if (
"session_state_saved" in st.session_state["project_dct"]["model_tuning"].keys()
and st.session_state["project_dct"]["model_tuning"]["session_state_saved"] != []
):
for key in ["used_response_metrics", "media_data", "bin_dict"]:
if key not in st.session_state:
st.session_state[key] = st.session_state["project_dct"]["model_tuning"][
"session_state_saved"
][key]
## DEFINE ALL FUNCTIONS
def get_random_effects(media_data, panel_col, mdf):
    """Extract the fitted random intercept of every panel from a MixedLM result."""
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
    for i, market in enumerate(media_data[panel_col].unique()):
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, "random_effect"] = intercept
        random_eff_df.loc[i, panel_col] = market
    return random_eff_df
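

# Illustrative sketch (assuming a statsmodels MixedLM result with one random
# intercept per panel): get_random_effects returns one row per panel, e.g.
#   panel     random_effect
#   north      0.12
#   south     -0.08
# (values illustrative). These offsets are added back onto the fixed-effect
# prediction in mdf_predict below.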
def process_train_and_test(train, test, features, panel_col, target_col):
    """Min-max scale the feature columns, fitting the scaler on train only."""
    X1 = train[features]
    ss = MinMaxScaler()
    X1 = pd.DataFrame(ss.fit_transform(X1), columns=X1.columns)
    X1[panel_col] = train[panel_col]
    X1[target_col] = train[target_col]
    if test is not None:
        X2 = test[features]
        X2 = pd.DataFrame(ss.transform(X2), columns=X2.columns)
        X2[panel_col] = test[panel_col]
        X2[target_col] = test[target_col]
        return X1, X2
    return X1
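

# Note: the scaler is fit on train only and merely applied to test, so test rows
# are scaled with train-set minima/maxima and no information leaks across the split.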
def mdf_predict(X_df, mdf, random_eff_df):
    """Predict with a MixedLM: fixed-effect prediction plus the panel's random intercept.

    Relies on the module-level `panel_col` assigned further down.
    """
    X = X_df.copy()
    X = pd.merge(
        X,
        random_eff_df[[panel_col, "random_effect"]],
        on=panel_col,
        how="left",
    )
    X["pred_fixed_effect"] = mdf.predict(X)
    X["pred"] = X["pred_fixed_effect"] + X["random_effect"]
    X.drop(columns=["pred_fixed_effect", "random_effect"], inplace=True)
    return X
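

# Minimal usage sketch (variable names are illustrative, not part of this app):
#   random_eff_df = get_random_effects(media_data, panel_col, model)
#   scored = mdf_predict(X_holdout, model, random_eff_df)
# For every row: scored["pred"] = fixed-effect prediction + that panel's random intercept.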
try:
if "username" in st.session_state and st.session_state["username"] is not None:
tuned_model = retrieve_pkl_object(
st.session_state["project_number"], "Model_Tuning", "tuned_model", schema
)
if tuned_model is None:
st.error("Please save a tuned model")
st.stop()
if (
"session_state_saved"
in st.session_state["project_dct"]["model_tuning"].keys()
and st.session_state["project_dct"]["model_tuning"]["session_state_saved"]
!= []
):
for key in [
"used_response_metrics",
"is_tuned_model",
"media_data",
"X_test_spends",
"spends_data",
]:
st.session_state[key] = st.session_state["project_dct"]["model_tuning"][
"session_state_saved"
][key]
elif (
"session_state_saved"
in st.session_state["project_dct"]["model_build"].keys()
and st.session_state["project_dct"]["model_build"]["session_state_saved"]
!= []
):
for key in [
"used_response_metrics",
"date",
"saved_model_names",
"media_data",
"X_test_spends",
]:
st.session_state[key] = st.session_state["project_dct"]["model_build"][
"session_state_saved"
][key]
        else:
            st.error("Please tune a model first")
            st.stop()
st.session_state["bin_dict"] = st.session_state["project_dct"]["model_build"][
"session_state_saved"
]["bin_dict"]
st.session_state["media_data"].columns = [
c.lower() for c in st.session_state["media_data"].columns
]
st.session_state["orig_media_data"] = st.session_state["project_dct"][
"data_import"
][
"imputed_tool_df"
].copy() # db
st.session_state["channels"] = st.session_state["project_dct"]["data_import"][
"group_dict"
].copy()
        # set the panel column
panel_col = "panel"
        is_panel = st.session_state["media_data"][panel_col].nunique() > 1
date_col = "date"
cols1 = st.columns([2, 1])
with cols1[0]:
st.markdown(f"**Welcome {st.session_state['username']}**")
with cols1[1]:
st.markdown(f"**Current Project: {st.session_state['project_name']}**")
st.title("AI Model Media Performance")
def remove_response_metric(name):
# Convert the name to a lowercase string and remove any leading or trailing spaces
name_str = str(name).lower().strip()
# Check if the name starts with "response metric" or "response_metric"
if name_str.startswith("response metric"):
return name[len("response metric") :].replace("_", " ").strip().title()
elif name_str.startswith("response_metric"):
return name[len("response_metric") :].replace("_", " ").strip().title()
else:
return name
sel_target_col = st.selectbox(
"Select the response metric",
st.session_state["used_response_metrics"],
format_func=remove_response_metric,
)
sel_target_col_frmttd = sel_target_col.replace("_", " ").replace("-", " ")
sel_target_col_frmttd = sel_target_col_frmttd.title()
target_col = (
sel_target_col.lower()
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
)
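        # e.g. a selection such as "Response Metric Revenue" becomes the column
        # name "response_metric_revenue" (lower-cased, spaces to underscores,
        # "-" and ":" dropped, "__" collapsed); the example value is illustrative.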
target = sel_target_col
# Contribution
if is_panel:
            # read the tuned MixedLM model for this response metric
            if st.session_state["is_tuned_model"][target_col]:
model_dict = retrieve_pkl_object(
st.session_state["project_number"],
"Model_Tuning",
"tuned_model",
schema,
) # db
saved_models = list(model_dict.keys())
required_saved_models = [
m.split("__")[0]
for m in saved_models
if m.split("__")[1] == target_col
]
                sel_model = required_saved_models[0]  # only 1 tuned model per response metric
sel_model_dict = model_dict[sel_model + "__" + target_col]
model = sel_model_dict["Model_object"]
X_train = sel_model_dict["X_train_tuned"]
X_test = sel_model_dict["X_test_tuned"]
best_feature_set = sel_model_dict["feature_set"]
# Calculate contributions
st.session_state["orig_media_data"].columns = [
col.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for col in st.session_state["orig_media_data"].columns
]
media_data = st.session_state["media_data"]
contri_df = pd.DataFrame()
y = []
y_pred = []
random_eff_df = get_random_effects(media_data, panel_col, model)
random_eff_df["fixed_effect"] = model.fe_params["Intercept"]
random_eff_df["panel_effect"] = (
random_eff_df["random_effect"] + random_eff_df["fixed_effect"]
)
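                # panel_effect = global fixed intercept + that panel's random
                # intercept; it is folded into "base_contr" further below.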
coef_df = pd.DataFrame(model.fe_params)
coef_df.reset_index(inplace=True)
coef_df.columns = ["feature", "coef"]
x_train_contribution = X_train.copy()
x_test_contribution = X_test.copy()
                # preprocessing not needed since X_train is already preprocessed
x_train_contribution = mdf_predict(
x_train_contribution, model, random_eff_df
)
x_test_contribution = mdf_predict(x_test_contribution, model, random_eff_df)
x_train_contribution = pd.merge(
x_train_contribution,
random_eff_df[[panel_col, "panel_effect"]],
on=panel_col,
how="left",
)
x_test_contribution = pd.merge(
x_test_contribution,
random_eff_df[[panel_col, "panel_effect"]],
on=panel_col,
how="left",
)
                for i in range(1, len(coef_df)):
                    coef = coef_df.loc[i, "coef"]
                    col = coef_df.loc[i, "feature"]
                    if col.lower() != "intercept":
                        x_train_contribution[str(col) + "_contr"] = (
                            coef * x_train_contribution[col]
                        )
                        x_test_contribution[str(col) + "_contr"] = (
                            coef * x_test_contribution[col]
                        )
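
                # Each "<feature>_contr" column is coefficient x (scaled) feature
                # value; e.g. a coefficient of 0.5 in a week where a channel's
                # scaled spend is 0.8 contributes 0.4 to that row's prediction
                # (numbers illustrative).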
tuning_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if c
in [
"day_of_week_contr",
"Trend_contr",
"sine_wave_contr",
"cosine_wave_contr",
]
]
flag_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if "_flag" in c
]
# add exogenous contribution to base
all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
all_exog_vars = [
var.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for var in all_exog_vars
]
                exog_cols = []
                if len(all_exog_vars) > 0:
                    for col in x_train_contribution.filter(regex="contr").columns:
                        if any(exog_var in col for exog_var in all_exog_vars):
                            exog_cols.append(col)
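                # Everything that is not a media channel (panel baseline, flag,
                # trend/seasonality and exogenous terms) is rolled up into a
                # single "base_contr" column below.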
base_cols = ["panel_effect"] + flag_cols + tuning_cols + exog_cols
x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
axis=1
)
x_train_contribution.drop(columns=base_cols, inplace=True)
x_test_contribution["base_contr"] = x_test_contribution[base_cols].sum(
axis=1
)
x_test_contribution.drop(columns=base_cols, inplace=True)
overall_contributions = pd.concat(
[x_train_contribution, x_test_contribution]
).reset_index(drop=True)
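                # Train and test contributions are stitched back together so the
                # overview covers the full modeling window.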
overview_test_data_prep_panel(
overall_contributions,
st.session_state["orig_media_data"],
st.session_state["spends_data"],
date_col,
panel_col,
target_col,
)
else: # NON PANEL
if st.session_state["is_tuned_model"][target_col] == True: # Sprint4
model_dict = retrieve_pkl_object(
st.session_state["project_number"],
"Model_Tuning",
"tuned_model",
schema,
) # db
saved_models = list(model_dict.keys())
required_saved_models = [
m.split("__")[0]
for m in saved_models
if m.split("__")[1] == target_col
]
                sel_model = required_saved_models[0]  # only 1 tuned model per response metric
sel_model_dict = model_dict[sel_model + "__" + target_col]
model = sel_model_dict["Model_object"]
X_train = sel_model_dict["X_train_tuned"]
X_test = sel_model_dict["X_test_tuned"]
best_feature_set = sel_model_dict["feature_set"]
x_train_contribution = X_train.copy()
x_test_contribution = X_test.copy()
x_train_contribution["pred"] = model.predict(
x_train_contribution[best_feature_set]
)
x_test_contribution["pred"] = model.predict(
x_test_contribution[best_feature_set]
)
coef_df = pd.DataFrame(model.params)
coef_df.reset_index(inplace=True)
coef_df.columns = ["feature", "coef"]
for i in range(len(coef_df)):
coef = coef_df.loc[i, "coef"]
col = coef_df.loc[i, "feature"]
if col != "const":
x_train_contribution[str(col) + "_contr"] = (
coef * x_train_contribution[col]
)
x_test_contribution[str(col) + "_contr"] = (
coef * x_test_contribution[col]
)
else:
x_train_contribution["const"] = coef
x_test_contribution["const"] = coef
tuning_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if c
in [
"day_of_week_contr",
"Trend_contr",
"sine_wave_contr",
"cosine_wave_contr",
]
]
flag_cols = [
c
for c in x_train_contribution.filter(regex="contr").columns
if "_flag" in c
]
# add exogenous contribution to base
all_exog_vars = st.session_state["bin_dict"]["Exogenous"]
all_exog_vars = [
var.lower()
.replace(".", "_")
.replace("@", "_")
.replace(" ", "_")
.replace("-", "")
.replace(":", "")
.replace("__", "_")
for var in all_exog_vars
]
                exog_cols = []
                if len(all_exog_vars) > 0:
                    for col in x_train_contribution.filter(regex="contr").columns:
                        if any(exog_var in col for exog_var in all_exog_vars):
                            exog_cols.append(col)
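                # As in the panel branch, all non-media terms are rolled up into
                # a single "base_contr" column below.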
base_cols = ["const"] + flag_cols + tuning_cols + exog_cols
x_train_contribution["base_contr"] = x_train_contribution[base_cols].sum(
axis=1
)
x_train_contribution.drop(columns=base_cols, inplace=True)
x_test_contribution["base_contr"] = x_test_contribution[base_cols].sum(
axis=1
)
x_test_contribution.drop(columns=base_cols, inplace=True)
overall_contributions = pd.concat(
[x_train_contribution, x_test_contribution]
).reset_index(drop=True)
overview_test_data_prep_nonpanel(
overall_contributions,
st.session_state["orig_media_data"].copy(),
st.session_state["spends_data"].copy(),
date_col,
target_col,
)
min_date = X_train[date_col].min().date()
max_date = X_test[date_col].max().date()
if "media_performance" not in st.session_state["project_dct"]:
st.session_state["project_dct"]["media_performance"] = {
"start_date": None,
"end_date": None,
}
        start_default = st.session_state["project_dct"]["media_performance"].get(
            "start_date", None
        )
        # fall back to the data's min date if unset or outside the data range
        if (
            start_default is None
            or start_default <= min_date
            or start_default >= max_date
        ):
            start_default = min_date
        end_default = st.session_state["project_dct"]["media_performance"].get(
            "end_date", None
        )
        # fall back to the data's max date if unset or outside the data range
        if (
            end_default is None
            or end_default <= min_date
            or end_default >= max_date
        ):
            end_default = max_date
st.write("Select a timeline for analysis")
date_columns = st.columns(2)
with date_columns[0]:
start_date = st.date_input(
"Select Start Date",
start_default,
min_value=min_date,
max_value=max_date,
)
if (start_date < min_date) or (start_date > max_date):
st.error("Please select dates in the range of the dates in the data")
st.stop()
end_default = (
end_default if end_default > start_date + timedelta(days=1) else max_date
)
with date_columns[1]:
end_default = (
end_default
if pd.Timestamp(end_default) >= pd.Timestamp(start_date)
else start_date
)
end_date = st.date_input(
"Select End Date",
end_default,
min_value=start_date + timedelta(days=1),
max_value=max_date,
)
if (
(start_date < min_date)
or (end_date < min_date)
or (start_date > max_date)
or (end_date > max_date)
):
st.error("Please select dates in the range of the dates in the data")
st.stop()
if end_date < start_date + timedelta(days=1):
st.error("Please select end date after start date")
st.stop()
st.session_state["project_dct"]["media_performance"]["start_date"] = start_date
st.session_state["project_dct"]["media_performance"]["end_date"] = end_date
st.header("Overview of Previous Media Spend")
initialize_data_cmp(target_col, is_panel, panel_col, start_date, end_date)
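        # initialize_data_cmp populates st.session_state["scenario"] and
        # st.session_state["raw_df"] for the selected window; both are read below.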
scenario = st.session_state["scenario"]
raw_df = st.session_state["raw_df"]
columns = st.columns(2)
with columns[0]:
st.metric(
label="Media Spend",
value=format_numbers(float(scenario.actual_total_spends)),
)
with columns[1]:
st.metric(
label=sel_target_col_frmttd,
value=format_numbers(
float(scenario.actual_total_sales), include_indicator=False
),
)
actual_summary_df = create_channel_summary(scenario, sel_target_col_frmttd)
actual_summary_df["Channel"] = actual_summary_df["Channel"].apply(
channel_name_formating
)
columns = st.columns((3, 1))
with columns[0]:
with st.expander("Channel wise overview"):
st.markdown(
actual_summary_df.style.set_table_styles(
[
{
"selector": "th",
"props": [("background-color", "#f6dcc7")],
},
{
"selector": "tr:nth-child(even)",
"props": [("background-color", "#f6dcc7")],
},
]
).to_html(),
unsafe_allow_html=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
##############################
st.plotly_chart(
create_contribution_pie(scenario, sel_target_col_frmttd),
use_container_width=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
        ################################
st.plotly_chart(
create_contribuion_stacked_plot(scenario, sel_target_col_frmttd),
use_container_width=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
#######################################
selected_channel_name = st.selectbox(
"Channel",
st.session_state["channels_list"] + ["non media"],
format_func=channel_name_formating,
)
selected_channel = scenario.channels.get(selected_channel_name, None)
st.plotly_chart(
create_channel_spends_sales_plot(selected_channel, sel_target_col_frmttd),
use_container_width=True,
)
st.markdown("<hr>", unsafe_allow_html=True)
if st.button("Save this session", use_container_width=True):
project_dct_pkl = pickle.dumps(st.session_state["project_dct"])
update_db(
st.session_state["project_number"],
"Current_Media_Performance",
"project_dct",
project_dct_pkl,
schema,
# resp_mtrc=None,
) # db
st.success("Session Saved!")
# Remove "response_metric_" from the start and "_total" from the end
if str(target_col).startswith("response_metric_"):
target_col = target_col.replace("response_metric_", "", 1)
# Remove the last 6 characters (length of "_total")
if str(target_col).endswith("_total"):
target_col = target_col[:-6]
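        # e.g. "response_metric_revenue_total" -> "revenue" (name illustrative),
        # matching the keys under project_dct["current_media_performance"]["model_outputs"].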
if (
st.session_state["project_dct"]["current_media_performance"][
"model_outputs"
][target_col]
is not None
):
if (
len(
st.session_state["project_dct"]["current_media_performance"][
"model_outputs"
][target_col]["contribution_data"]
)
> 0
):
st.download_button(
label="Download Contribution File",
data=st.session_state["project_dct"]["current_media_performance"][
"model_outputs"
][target_col]["contribution_data"].to_csv(),
file_name="contributions.csv",
key="dwnld_contr",
)
except Exception:
    exc_type, exc_value, exc_traceback = sys.exc_info()
    error_message = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    log_message("error", f"Error: {error_message}", "Current Media Performance")
    st.warning("An error occurred, please try again", icon="⚠️")