Upload 78 files

- .gitattributes +4 -5
- Data_Import (1).py +995 -0
- Data_Import .py +1019 -0
- Data_prep_functions.py +72 -59
- Model/model_0.pkl +3 -0
- Model/model_1.pkl +3 -0
- Model/model_2.pkl +3 -0
- Model/model_3.pkl +3 -0
- Model/model_4.pkl +3 -0
- Overview_data_test_panel@#app_installs.xlsx +0 -0
- Overview_data_test_panel@#revenue.xlsx +0 -0
- Overview_data_test_panelreplace_meapp_installs.xlsx +0 -0
- README.md +1 -1
- Test/merged_df_contri.csv +0 -0
- Test/output_df.csv +16 -0
- Test/scenario_test_df.csv +16 -0
- Test/x_test_contribution.csv +0 -0
- Test/x_test_to_save.csv +0 -0
- Test/x_train_contribution.csv +0 -0
- Test/x_train_to_save.csv +0 -0
- best_models.pkl +2 -2
- classes.py +130 -106
- data_import.pkl +3 -0
- data_test_overview_panel_#total_approved_accounts_revenue.xlsx +3 -0
- final_df_transformed.pkl +3 -0
- metrics_level_data/Overview_data_test_panel@#app_installs.xlsx +0 -0
- metrics_level_data/Overview_data_test_panel@#revenue.xlsx +0 -0
- model_output.csv +6 -11
- pages/10_Optimized_Result_Analysis.py +23 -77
- pages/1_Data_Validation.py +158 -188
- pages/2_Transformations.py +522 -0
- pages/4_Model_Build.py +826 -0
- pages/4_Saved_Model_Results.py +461 -267
- pages/5_Model_Tuning_with_panel.py +527 -0
- pages/6_Model_Result_Overview.py +348 -0
- pages/7_Build_Response_Curves.py +185 -0
- pages/8_Scenario_Planner.py +458 -167
- requirements.txt +94 -102
- summary_df.pkl +1 -1
- tuned_model.pkl +3 -0
- upf_data_converted_old.csv +0 -0
- upf_data_converted_old.xlsx +3 -0
- upf_data_converted_randomized_resp_metrics.csv +0 -0
- upf_data_converted_randomized_resp_metrics.xlsx +3 -0
- utilities.py +534 -263
- utilities_with_panel.py +1018 -0
.gitattributes
CHANGED
@@ -33,9 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-
-Profile_Report.html filter=lfs diff=lfs merge=lfs -text
-raw_data_nov7_combined.xlsx filter=lfs diff=lfs merge=lfs -text
-raw_data_nov7_combined1.xlsx filter=lfs diff=lfs merge=lfs -text
-upf_data_converted.xlsx filter=lfs diff=lfs merge=lfs -text
+data_test_overview_panel_\#total_approved_accounts_revenue.xlsx filter=lfs diff=lfs merge=lfs -text
 Pickle_files/main_df filter=lfs diff=lfs merge=lfs -text
+upf_data_converted_old.xlsx filter=lfs diff=lfs merge=lfs -text
+upf_data_converted_randomized_resp_metrics.xlsx filter=lfs diff=lfs merge=lfs -text
+upf_data_converted.xlsx filter=lfs diff=lfs merge=lfs -text
Data_Import (1).py
ADDED
@@ -0,0 +1,995 @@
# Importing necessary libraries
import streamlit as st

st.set_page_config(
    page_title="Data Import",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state="collapsed",
)

import pickle
import pandas as pd
from utilities import set_header, load_local_css, authentication

load_local_css("styles.css")
set_header()


# Check for authentication status
authenticator, name, authentication_status, username = authentication()
if authentication_status != True:
    st.stop()
else:
    authenticator.logout("Logout", "main")


# Function to validate date column in dataframe
def validate_date_column(df):
    try:
        # Attempt to convert the 'Date' column to datetime
        df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
        return True
    except Exception:
        return False


# Function to determine data interval
def determine_data_interval(common_freq):
    if common_freq == 1:
        return "daily"
    elif common_freq == 7:
        return "weekly"
    elif 28 <= common_freq <= 31:
        return "monthly"
    else:
        return "irregular"


# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
@st.cache_resource(show_spinner=False)
def files_to_dataframes(uploaded_files):
    df_dict = {}
    for uploaded_file in uploaded_files:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Convert all column names to lowercase
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # Check for 'Date' column
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Check for interval
        common_freq = (
            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
        )
        # Calculate the data interval (daily, weekly, monthly or irregular)
        interval = determine_data_interval(common_freq)
        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store both DataFrames in the dictionary under their respective keys
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict


# Function to adjust dataframe granularity
def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    # Set index
    df.set_index("date", inplace=True)

    # Define aggregation rules for resampling
    aggregation_rules = {
        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
        for col in df.columns
    }

    # Initialize resampled_df
    resampled_df = df
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = df.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = df.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
        expanded_data = []
        for _, row in df.iterrows():
            if current_granularity == "weekly":
                period_range = pd.date_range(start=row.name, periods=7)
            elif current_granularity == "monthly":
                period_range = pd.date_range(
                    start=row.name, periods=row.name.days_in_month
                )

            for date in period_range:
                new_row = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        if current_granularity == "weekly":
                            new_row[col] = row[col] / 7
                        elif current_granularity == "monthly":
                            new_row[col] = row[col] / row.name.days_in_month
                    else:
                        new_row[col] = row[col]
                expanded_data.append((date, new_row))

        resampled_df = pd.DataFrame(
            [data for _, data in expanded_data],
            index=[date for date, _ in expanded_data],
        )

    # Reset index
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df


# Function to clean and extract unique values of Panel_1 and Panel_2
@st.cache_resource(show_spinner=False)
def clean_and_extract_unique_values(files_dict, selections):
    all_panel1_values = set()
    all_panel2_values = set()

    for file_name, file_data in files_dict.items():
        df = file_data["df"]

        # 'Panel_1' and 'Panel_2' selections
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Clean and standardize Panel_1 column if it exists and is selected
        if (
            selected_panel1
            and selected_panel1 != "N/A"
            and selected_panel1 in df.columns
        ):
            df[selected_panel1] = (
                df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel1_values.update(df[selected_panel1].dropna().unique())

        # Clean and standardize Panel_2 column if it exists and is selected
        if (
            selected_panel2
            and selected_panel2 != "N/A"
            and selected_panel2 in df.columns
        ):
            df[selected_panel2] = (
                df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel2_values.update(df[selected_panel2].dropna().unique())

        # Update the processed DataFrame back in the dictionary
        files_dict[file_name]["df"] = df

    return all_panel1_values, all_panel2_values


# Function to format values for display
@st.cache_resource(show_spinner=False)
def format_values_for_display(values_list):
    # Capitalize the first letter of each word and replace underscores with spaces
    formatted_list = [value.replace("_", " ").title() for value in values_list]
    # Join values with commas and 'and' before the last value
    if len(formatted_list) > 1:
        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
    elif formatted_list:
        return formatted_list[0]
    return "No values available"


# Function to normalize all data within files_dict to a daily granularity
@st.cache(show_spinner=False, allow_output_mutation=True)
def standardize_data_to_daily(files_dict, selections):
    # Normalize all data to a daily granularity using a provided function
    files_dict = apply_granularity_to_all(files_dict, "daily", selections)

    # Update the "interval" attribute for each dataset to indicate the new granularity
    for files_name, files_data in files_dict.items():
        files_data["interval"] = "daily"

    return files_dict


# Function to apply granularity transformation to all DataFrames in files_dict
@st.cache_resource(show_spinner=False)
def apply_granularity_to_all(files_dict, granularity_selection, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Handling when Panel_1 or Panel_2 might be 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Correcting the segment selection logic & handling 'N/A'
        if selected_panel1 != "N/A" and selected_panel2 != "N/A":
            unique_combinations = df[
                [selected_panel1, selected_panel2]
            ].drop_duplicates()
        elif selected_panel1 != "N/A":
            unique_combinations = df[[selected_panel1]].drop_duplicates()
            selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
        elif selected_panel2 != "N/A":
            unique_combinations = df[[selected_panel2]].drop_duplicates()
            selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
        else:
            # If both are 'N/A', process the entire dataframe as is
            df = adjust_dataframe_granularity(
                df, file_data["interval"], granularity_selection
            )
            files_dict[file_name]["df"] = df
            continue  # Skip to the next file

        transformed_segments = []
        for _, combo in unique_combinations.iterrows():
            if selected_panel1 and selected_panel2:
                segment = df[
                    (df[selected_panel1] == combo[selected_panel1])
                    & (df[selected_panel2] == combo[selected_panel2])
                ]
            elif selected_panel1:
                segment = df[df[selected_panel1] == combo[selected_panel1]]
            elif selected_panel2:
                segment = df[df[selected_panel2] == combo[selected_panel2]]

            # Adjust granularity of the segment
            transformed_segment = adjust_dataframe_granularity(
                segment, file_data["interval"], granularity_selection
            )
            transformed_segments.append(transformed_segment)

        # Combine all transformed segments into a single DataFrame for this file
        transformed_df = pd.concat(transformed_segments, ignore_index=True)
        files_dict[file_name]["df"] = transformed_df

    return files_dict


# Function to create main dataframe structure
@st.cache_resource(show_spinner=False)
def create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
):
    # Determine the global start and end dates across all DataFrames
    global_start = min(df["df"]["date"].min() for df in files_dict.values())
    global_end = max(df["df"]["date"].max() for df in files_dict.values())

    # Adjust the date_range generation based on the granularity_selection
    if granularity_selection == "weekly":
        # Generate a weekly range, with weeks starting on Monday
        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
    elif granularity_selection == "monthly":
        # Generate a monthly range, starting from the first day of each month
        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
    else:  # Default to daily if not weekly or monthly
        date_range = pd.date_range(start=global_start, end=global_end, freq="D")

    # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
    all_panel1s = all_panel1_values
    all_panel2s = all_panel2_values

    # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
    dimensions, merge_keys = [], []
    if all_panel1s:
        dimensions.append(all_panel1s)
        merge_keys.append("Panel_1")
    if all_panel2s:
        dimensions.append(all_panel2s)
        merge_keys.append("Panel_2")

    dimensions.append(date_range)  # Date range is always included
    merge_keys.append("date")  # Date range is always included

    # Create a main DataFrame template with the dimensions
    main_df = pd.MultiIndex.from_product(
        dimensions,
        names=[name for name, _ in zip(merge_keys, dimensions)],
    ).to_frame(index=False)

    return main_df.reset_index(drop=True)


# Function to prepare and merge DataFrames
@st.cache_resource(show_spinner=False)
def merge_into_main_df(main_df, files_dict, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1", "N/A")
        selected_panel2 = selections[file_name].get("Panel_2", "N/A")
        if selected_panel1 != "N/A":
            df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
        if selected_panel2 != "N/A":
            df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

        # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
        merge_keys = ["date"]
        if "Panel_1" in df.columns:
            merge_keys.append("Panel_1")
        if "Panel_2" in df.columns:
            merge_keys.append("Panel_2")
        main_df = pd.merge(main_df, df, on=merge_keys, how="left")

    # After all merges, sort by 'date' and reset index for cleanliness
    sort_by = ["date"]
    if "Panel_1" in main_df.columns:
        sort_by.append("Panel_1")
    if "Panel_2" in main_df.columns:
        sort_by.append("Panel_2")
    main_df.sort_values(by=sort_by, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df


# Function to categorize column
def categorize_column(column_name):
    # Define keywords for each category
    internal_keywords = [
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    ]
    exogenous_keywords = [
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    ]

    # Check if the column name matches any of the keywords for Internal or Exogenous categories
    for keyword in internal_keywords:
        if keyword.lower() in column_name.lower():
            return "Internal"
    for keyword in exogenous_keywords:
        if keyword.lower() in column_name.lower():
            return "Exogenous"

    # Default to Media if no match found
    return "Media"


# Function to calculate missing stats and prepare for editable DataFrame
@st.cache_resource(show_spinner=False)
def prepare_missing_stats_df(df):
    missing_stats = []
    for column in df.columns:
        if (
            column == "date" or column == "Panel_2" or column == "Panel_1"
        ):  # Skip Date, Panel_1 and Panel_2 column
            continue

        missing = df[column].isnull().sum()
        pct_missing = round((missing / len(df)) * 100, 2)

        # Dynamically assign category based on column name
        category = categorize_column(column)
        # category = "Media"  # Keep default bin as Media

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": category,
            }
        )
    stats_df = pd.DataFrame(missing_stats)

    return stats_df


# Function to add API DataFrame details to the files dictionary
@st.cache_resource(show_spinner=False)
def add_api_dataframe_to_dict(main_df, files_dict):
    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": determine_data_interval(
            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
        ),
        "df": main_df,
    }

    return files_dict


# Function to read an API extract into a DataFrame, parsing specified columns as datetime
@st.cache_resource(show_spinner=False)
def read_API_data():
    return pd.read_excel(r".\upf_data_converted.xlsx", parse_dates=["Date"])


# Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
def set_Panel_1_Panel_2_Selected_false():
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Function to serialize and save the objects into a pickle file
@st.cache_resource(show_spinner=False)
def save_to_pickle(file_path, final_df, bin_dict):
    # Open the file in write-binary mode and dump the objects
    with open(file_path, "wb") as f:
        pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
    # Data is now saved to file


# Function to process the merged_df DataFrame based on operations defined in edited_df
@st.cache_resource(show_spinner=False)
def process_dataframes(merged_df, edited_df, edited_stats_df):
    # Ensure there are operations defined by the user
    if edited_df.empty:
        return merged_df, edited_stats_df  # No operations to apply

    # Perform operations as defined by the user
    for index, row in edited_df.iterrows():
        result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
        col1 = row["Column 1"]
        col2 = row["Column 2"]
        op = row["Operator"]

        # Apply the specified operation
        if op == "+":
            merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
        elif op == "-":
            merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
        elif op == "*":
            merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
        elif op == "/":
            merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
                0, 1e-9
            )

        # Add summary of operation to edited_stats_df
        new_row = {
            "Column": result_column_name,
            "Missing Values": None,
            "Missing Percentage": None,
            "Impute Method": None,
            "Category": row["Category"],
        }
        new_row_df = pd.DataFrame([new_row])

        # Use pd.concat to add the new_row_df to edited_stats_df
        edited_stats_df = pd.concat(
            [edited_stats_df, new_row_df], ignore_index=True, axis=0
        )

    # Combine column names from edited_df for cleanup
    combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))

    # Filter out rows in edited_stats_df and drop columns from merged_df
    edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
    merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)

    return merged_df, edited_stats_df


# Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
@st.cache_resource(show_spinner=False)
def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
    # Get columns categorized as 'Response Metrics'
    columns_response_metrics = edited_stats_df[
        edited_stats_df["Category"] == "Response Metrics"
    ]["Column"].tolist()

    # Filter numeric columns, excluding those categorized as 'Response Metrics'
    numeric_columns = [
        col
        for col in merged_df.select_dtypes(include=["number"]).columns
        if col not in columns_response_metrics
    ]

    # Define the structure of the empty DataFrame
    data = {
        "Column 1": pd.Series([], dtype="str"),
        "Operator": pd.Series([], dtype="str"),
        "Column 2": pd.Series([], dtype="str"),
        "Category": pd.Series([], dtype="str"),
    }
    default_df = pd.DataFrame(data)

    return numeric_columns, default_df


# Initialize 'final_df' in session state
if "final_df" not in st.session_state:
    st.session_state["final_df"] = pd.DataFrame()

# Initialize 'bin_dict' in session state
if "bin_dict" not in st.session_state:
    st.session_state["bin_dict"] = {}

# Initialize 'Panel_1_Panel_2_Selected' in session state
if "Panel_1_Panel_2_Selected" not in st.session_state:
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Page Title
st.write("")  # Top padding
st.title("Data Import")


#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_Panel_1_Panel_2_Selected_false,
)

# Custom HTML for upload instructions
recommendation_html = f"""
<div style="text-align: justify;">
<strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Desired Granularity
st.markdown("#### Choose Desired Granularity")
# Granularity Selection
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_Panel_1_Panel_2_Selected_false,
)
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API Dataframe
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until file is uploaded


# Select Panel_1 and Panel_2 columns
st.markdown("#### Select Panel columns")
selections = {}
with st.expander("Select Panel columns", expanded=False):
    count = 0  # Initialize counter to manage the visibility of labels and keys
    for file_name, file_data in files_dict.items():
        # Determine visibility of the label based on the count
        if count == 0:
            label_visibility = "visible"
        else:
            label_visibility = "collapsed"

        # Extract non-numeric columns
        non_numeric_cols = file_data["non_numeric"]

        # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
        panel1_values = non_numeric_cols + ["N/A"]
        panel2_values = non_numeric_cols + ["N/A"]

        # Skip if only one option is available
        if len(panel1_values) == 1 and len(panel2_values) == 1:
            selected_panel1, selected_panel2 = "N/A", "N/A"
            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }
            continue

        # Create layout columns for File Name, Panel_2, and Panel_1 selections
        file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display "File Name" label only for the first file
            if count == 0:
                st.write("File Name")
            else:
                st.write("")
            st.write(file_name)  # Display the file name

        with Panel_1_col:
            # Display a selectbox for Panel_1 values
            selected_panel1 = st.selectbox(
                "Select Panel Level 1",
                panel1_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_1_selectbox{count}",  # Ensure unique key for each selectbox
            )

        with Panel_2_col:
            # Display a selectbox for Panel_2 values
            selected_panel2 = st.selectbox(
                "Select Panel Level 2",
                panel2_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_2_selectbox{count}",  # Ensure unique key for each selectbox
            )

        # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
        if selected_panel2 == selected_panel1 and not (
            selected_panel2 == "N/A" and selected_panel1 == "N/A"
        ):
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
            )
            selected_panel1, selected_panel2 = "N/A", "N/A"
            st.stop()

        # Update the selections for Panel_1 and Panel_2 for the current file
        selections[file_name] = {
            "Panel_1": selected_panel1,
            "Panel_2": selected_panel2,
        }

        count += 1  # Increment the counter after processing each file

# Accept Panel_1 and Panel_2 selection
if st.button("Accept and Process", use_container_width=True):

    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
    with st.spinner("Processing..."):
        files_dict = standardize_data_to_daily(files_dict, selections)

        # Convert all data to the selected level of granularity
        files_dict = apply_granularity_to_all(
            files_dict, granularity_selection, selections
        )

        # Update the 'files_dict' in the session state
        st.session_state["files_dict"] = files_dict

        # Set a flag in the session state to indicate that selection has been made
        st.session_state["Panel_1_Panel_2_Selected"] = True


#########################################################################################################################################################
# Display unique Panel_1 and Panel_2 values
#########################################################################################################################################################


# Halt further execution until Panel_1 and Panel_2 columns are selected
if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Set to store unique values of Panel_1 and Panel_2
with st.spinner("Fetching Panel values..."):
    all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
        files_dict, selections
    )

# List of Panel_1 and Panel_2 columns unique values
list_of_all_panel1_values = list(all_panel1_values)
list_of_all_panel2_values = list(all_panel2_values)

# Format Panel_1 and Panel_2 values for display
formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

# Unique Panel_1 and Panel_2 values
st.markdown("#### Unique Panel values")
# Display Panel_1 and Panel_2 values
with st.expander("Unique Panel values"):
    st.write("")
    st.markdown(
        f"""
        <style>
        .justify-text {{
            text-align: justify;
        }}
        </style>
        <div class="justify-text">
        <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
        <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Display total Panel_1 and Panel_2
    st.write("")
    st.markdown(
        f"""
        <div style="text-align: justify;">
        <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
        <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.write("")


#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all DataFrames selected
main_df = create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
)
merged_df = merge_into_main_df(main_df, files_dict, selections)


#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# Prepare missing stats DataFrame for editing
missing_stats_df = prepare_missing_stats_df(merged_df)

edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": st.column_config.SelectboxColumn(
            options=[
                "Drop Column",
                "Fill with Mean",
                "Fill with Median",
                "Fill with 0",
            ],
            required=True,
            default="Fill with 0",
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
        ),
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on edited DataFrame
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    if row["Impute Method"] == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)

    elif row["Impute Method"] == "Fill with Mean":
        merged_df[column].fillna(merged_df[column].mean(), inplace=True)

    elif row["Impute Method"] == "Fill with Median":
        merged_df[column].fillna(merged_df[column].median(), inplace=True)

    elif row["Impute Method"] == "Fill with 0":
        merged_df[column].fillna(0, inplace=True)


#########################################################################################################################################################
# Group columns
#########################################################################################################################################################


# Display Group columns header
st.markdown("#### Feature engineering")

# Prepare the numeric columns and an empty DataFrame for user input
numeric_columns, default_df = prepare_numeric_columns_and_default_df(
    merged_df, edited_stats_df
)

# Display editable Dataframe
edited_df = st.data_editor(
    default_df,
    column_config={
        "Column 1": st.column_config.SelectboxColumn(
            options=numeric_columns,
            required=True,
            default=numeric_columns[0],
            width=400,
        ),
        "Operator": st.column_config.SelectboxColumn(
            options=["+", "-", "*", "/"],
            required=True,
            default="+",
            width=100,
        ),
        "Column 2": st.column_config.SelectboxColumn(
            options=numeric_columns,
            required=True,
            default=numeric_columns[0],
            width=400,
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
            width=200,
        ),
    },
    num_rows="dynamic",
)

# Process the DataFrame based on user inputs and operations specified in edited_df
final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)


#########################################################################################################################################################
# Display the Final DataFrame and variables
#########################################################################################################################################################


# Display the Final DataFrame and variables
st.markdown("#### Final DataFrame")
st.dataframe(final_df, hide_index=True)

# Initialize an empty dictionary to hold categories and their variables
category_dict = {}

# Iterate over each row in the edited DataFrame to populate the dictionary
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    category = row["Category"]  # The category chosen by the user for this variable

    # Check if the category already exists in the dictionary
    if category not in category_dict:
        # If not, initialize it with the current column as its first element
        category_dict[category] = [column]
    else:
        # If it exists, append the current column to the list of variables under this category
        category_dict[category].append(column)

# Add Date, Panel_1 and Panel_2 in category dictionary
category_dict.update({"Date": ["date"]})
if "Panel_1" in final_df.columns:
    category_dict["Panel Level 1"] = ["Panel_1"]
if "Panel_2" in final_df.columns:
    category_dict["Panel Level 2"] = ["Panel_2"]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Check if there are multiple variables to handle "and" insertion correctly
    if len(variables) > 1:
        # Join all but the last variable with ", ", then add " and " before the last variable
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        # If there's only one variable, no need for "and"
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Check that a Response Metrics column has been selected
st.write("")
response_metrics_col = category_dict.get("Response Metrics", [])
if len(response_metrics_col) == 0:
    st.warning("Please select Response Metrics column", icon="⚠️")
    st.stop()
# elif len(response_metrics_col) > 1:
#     st.warning("Please select only one Response Metrics column", icon="⚠️")
#     st.stop()

# Store final dataframe and bin dictionary into session state
st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

# Save the DataFrame and dictionary from the session state to the pickle file
if st.button("Accept and Save", use_container_width=True):
    save_to_pickle(
        "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
    )
    st.toast("💾 Saved Successfully!")
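Note: save_to_pickle above writes a dict with the keys "final_df" and "bin_dict" to data_import.pkl. For reference, a minimal sketch of how a downstream page could read that artifact back; the local variable names are illustrative and not taken from the other pages in this commit:

# Minimal sketch: load the artifact written by save_to_pickle above.
# Assumes the "final_df" and "bin_dict" keys used in this page.
import pickle

with open("data_import.pkl", "rb") as f:
    saved = pickle.load(f)

final_df = saved["final_df"]  # merged, imputed, feature-engineered DataFrame
bin_dict = saved["bin_dict"]  # category label -> list of column names
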
Data_Import .py
ADDED
@@ -0,0 +1,1019 @@
1 |
+
# Importing necessary libraries
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
st.set_page_config(
|
5 |
+
page_title="Data Import",
|
6 |
+
page_icon=":shark:",
|
7 |
+
layout="wide",
|
8 |
+
initial_sidebar_state="collapsed",
|
9 |
+
)
|
10 |
+
|
11 |
+
import pickle
|
12 |
+
import pandas as pd
|
13 |
+
from utilities import set_header, load_local_css
|
14 |
+
import streamlit_authenticator as stauth
|
15 |
+
import yaml
|
16 |
+
from yaml import SafeLoader
|
17 |
+
|
18 |
+
load_local_css("styles.css")
|
19 |
+
set_header()
|
20 |
+
|
21 |
+
|
22 |
+
for k, v in st.session_state.items():
|
23 |
+
if k not in ["logout", "login", "config"] and not k.startswith(
|
24 |
+
"FormSubmitter"
|
25 |
+
):
|
26 |
+
st.session_state[k] = v
|
27 |
+
with open("config.yaml") as file:
|
28 |
+
config = yaml.load(file, Loader=SafeLoader)
|
29 |
+
st.session_state["config"] = config
|
30 |
+
authenticator = stauth.Authenticate(
|
31 |
+
config["credentials"],
|
32 |
+
config["cookie"]["name"],
|
33 |
+
config["cookie"]["key"],
|
34 |
+
config["cookie"]["expiry_days"],
|
35 |
+
config["preauthorized"],
|
36 |
+
)
|
37 |
+
st.session_state["authenticator"] = authenticator
|
38 |
+
name, authentication_status, username = authenticator.login("Login", "main")
|
39 |
+
auth_status = st.session_state.get("authentication_status")
|
40 |
+
|
41 |
+
if auth_status == True:
|
42 |
+
authenticator.logout("Logout", "main")
|
43 |
+
is_state_initiaized = st.session_state.get("initialized", False)
|
44 |
+
|
45 |
+
if not is_state_initiaized:
|
46 |
+
|
47 |
+
if 'session_name' not in st.session_state:
|
48 |
+
st.session_state['session_name']=None
|
49 |
+
|
50 |
+
|
51 |
+
# Function to validate date column in dataframe
|
52 |
+
def validate_date_column(df):
|
53 |
+
try:
|
54 |
+
# Attempt to convert the 'Date' column to datetime
|
55 |
+
df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
|
56 |
+
return True
|
57 |
+
except:
|
58 |
+
return False
|
59 |
+
|
60 |
+
|
61 |
+
# Function to determine data interval
|
62 |
+
def determine_data_interval(common_freq):
|
63 |
+
if common_freq == 1:
|
64 |
+
return "daily"
|
65 |
+
elif common_freq == 7:
|
66 |
+
return "weekly"
|
67 |
+
elif 28 <= common_freq <= 31:
|
68 |
+
return "monthly"
|
69 |
+
else:
|
70 |
+
return "irregular"
|
71 |
+
|
72 |
+
|
73 |
+
# Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
@st.cache_resource(show_spinner=False)
def files_to_dataframes(uploaded_files):
    df_dict = {}
    for uploaded_file in uploaded_files:
        # Extract file name without extension
        file_name = uploaded_file.name.rsplit(".", 1)[0]

        # Check for duplicate file names
        if file_name in df_dict:
            st.warning(
                f"Duplicate File: {file_name}. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Read the file into a DataFrame
        df = pd.read_excel(uploaded_file)

        # Convert all column names to lowercase
        df.columns = df.columns.str.lower().str.strip()

        # Separate numeric and non-numeric columns
        numeric_cols = list(df.select_dtypes(include=["number"]).columns)
        non_numeric_cols = [
            col
            for col in df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ]

        # Check for a valid 'date' column and at least one numeric column
        if not (validate_date_column(df) and len(numeric_cols) > 0):
            st.warning(
                f"File Name: {file_name} ➜ Please upload data with a Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Determine the most common spacing (in days) between consecutive dates
        common_freq = (
            pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
        )
        # Calculate the data interval (daily, weekly, monthly or irregular)
        interval = determine_data_interval(common_freq)
        if interval == "irregular":
            st.warning(
                f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
                icon="⚠️",
            )
            continue

        # Store the DataFrame and its metadata in the dictionary under the file name
        df_dict[file_name] = {
            "numeric": numeric_cols,
            "non_numeric": non_numeric_cols,
            "interval": interval,
            "df": df,
        }

    return df_dict


# Function to adjust dataframe granularity
def adjust_dataframe_granularity(df, current_granularity, target_granularity):
    # Set index
    df.set_index("date", inplace=True)

    # Define aggregation rules for resampling
    aggregation_rules = {
        col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
        for col in df.columns
    }

    # Initialize resampled_df
    resampled_df = df
    if current_granularity == "daily" and target_granularity == "weekly":
        resampled_df = df.resample("W-MON", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "monthly":
        resampled_df = df.resample("MS", closed="left", label="left").agg(
            aggregation_rules
        )

    elif current_granularity == "daily" and target_granularity == "daily":
        resampled_df = df.resample("D").agg(aggregation_rules)

    elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
        # For higher to lower granularity, distribute numeric values and replicate non-numeric values equally across the new period
        expanded_data = []
        for _, row in df.iterrows():
            if current_granularity == "weekly":
                period_range = pd.date_range(start=row.name, periods=7)
            elif current_granularity == "monthly":
                period_range = pd.date_range(
                    start=row.name, periods=row.name.days_in_month
                )

            for date in period_range:
                new_row = {}
                for col in df.columns:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        if current_granularity == "weekly":
                            new_row[col] = row[col] / 7
                        elif current_granularity == "monthly":
                            new_row[col] = row[col] / row.name.days_in_month
                    else:
                        new_row[col] = row[col]
                expanded_data.append((date, new_row))

        resampled_df = pd.DataFrame(
            [data for _, data in expanded_data],
            index=[date for date, _ in expanded_data],
        )

    # Reset index
    resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})

    return resampled_df
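
# Editor's sketch of the daily→weekly path above (hypothetical data; weeks are
# anchored on Monday via "W-MON", and the function mutates its input, hence .copy()):
#
#   daily = pd.DataFrame(
#       {"date": pd.date_range("2024-01-01", periods=14, freq="D"), "spend": 1.0}
#   )
#   weekly = adjust_dataframe_granularity(daily.copy(), "daily", "weekly")
#   # -> two rows, one per week, each with spend == 7.0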


# Function to clean and extract unique values of Panel_1 and Panel_2
@st.cache_resource(show_spinner=False)
def clean_and_extract_unique_values(files_dict, selections):
    all_panel1_values = set()
    all_panel2_values = set()

    for file_name, file_data in files_dict.items():
        df = file_data["df"]

        # 'Panel_1' and 'Panel_2' selections
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Clean and standardize the Panel_1 column if it exists and is selected
        if (
            selected_panel1
            and selected_panel1 != "N/A"
            and selected_panel1 in df.columns
        ):
            df[selected_panel1] = (
                df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel1_values.update(df[selected_panel1].dropna().unique())

        # Clean and standardize the Panel_2 column if it exists and is selected
        if (
            selected_panel2
            and selected_panel2 != "N/A"
            and selected_panel2 in df.columns
        ):
            df[selected_panel2] = (
                df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
            )
            all_panel2_values.update(df[selected_panel2].dropna().unique())

        # Update the processed DataFrame back in the dictionary
        files_dict[file_name]["df"] = df

    return all_panel1_values, all_panel2_values


# Function to format values for display
@st.cache_resource(show_spinner=False)
def format_values_for_display(values_list):
    # Capitalize the first letter of each word and replace underscores with spaces
    formatted_list = [value.replace("_", " ").title() for value in values_list]
    # Join values with commas and 'and' before the last value
    if len(formatted_list) > 1:
        return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
    elif formatted_list:
        return formatted_list[0]
    return "No values available"


# Function to normalize all data within files_dict to a daily granularity
@st.cache_resource(show_spinner=False)
def standardize_data_to_daily(files_dict, selections):
    # Normalize all data to a daily granularity using a provided function
    files_dict = apply_granularity_to_all(files_dict, "daily", selections)

    # Update the "interval" attribute for each dataset to indicate the new granularity
    for files_name, files_data in files_dict.items():
        files_data["interval"] = "daily"

    return files_dict


# Function to apply granularity transformation to all DataFrames in files_dict
@st.cache_resource(show_spinner=False)
def apply_granularity_to_all(files_dict, granularity_selection, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Handling when Panel_1 or Panel_2 might be 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1")
        selected_panel2 = selections[file_name].get("Panel_2")

        # Correcting the segment selection logic & handling 'N/A'
        if selected_panel1 != "N/A" and selected_panel2 != "N/A":
            unique_combinations = df[
                [selected_panel1, selected_panel2]
            ].drop_duplicates()
        elif selected_panel1 != "N/A":
            unique_combinations = df[[selected_panel1]].drop_duplicates()
            selected_panel2 = None  # Ensure Panel_2 is ignored if N/A
        elif selected_panel2 != "N/A":
            unique_combinations = df[[selected_panel2]].drop_duplicates()
            selected_panel1 = None  # Ensure Panel_1 is ignored if N/A
        else:
            # If both are 'N/A', process the entire dataframe as is
            df = adjust_dataframe_granularity(
                df, file_data["interval"], granularity_selection
            )
            files_dict[file_name]["df"] = df
            continue  # Skip to the next file

        transformed_segments = []
        for _, combo in unique_combinations.iterrows():
            if selected_panel1 and selected_panel2:
                segment = df[
                    (df[selected_panel1] == combo[selected_panel1])
                    & (df[selected_panel2] == combo[selected_panel2])
                ]
            elif selected_panel1:
                segment = df[df[selected_panel1] == combo[selected_panel1]]
            elif selected_panel2:
                segment = df[df[selected_panel2] == combo[selected_panel2]]

            # Adjust granularity of the segment
            transformed_segment = adjust_dataframe_granularity(
                segment, file_data["interval"], granularity_selection
            )
            transformed_segments.append(transformed_segment)

        # Combine all transformed segments into a single DataFrame for this file
        transformed_df = pd.concat(transformed_segments, ignore_index=True)
        files_dict[file_name]["df"] = transformed_df

    return files_dict


# Function to create the main dataframe structure
@st.cache_resource(show_spinner=False)
def create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
):
    # Determine the global start and end dates across all DataFrames
    global_start = min(file_data["df"]["date"].min() for file_data in files_dict.values())
    global_end = max(file_data["df"]["date"].max() for file_data in files_dict.values())

    # Adjust the date_range generation based on the granularity_selection
    if granularity_selection == "weekly":
        # Generate a weekly range, with weeks starting on Monday
        date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
    elif granularity_selection == "monthly":
        # Generate a monthly range, starting from the first day of each month
        date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
    else:  # Default to daily if not weekly or monthly
        date_range = pd.date_range(start=global_start, end=global_end, freq="D")

    # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
    all_panel1s = all_panel1_values
    all_panel2s = all_panel2_values

    # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
    dimensions, merge_keys = [], []
    if all_panel1s:
        dimensions.append(all_panel1s)
        merge_keys.append("Panel_1")
    if all_panel2s:
        dimensions.append(all_panel2s)
        merge_keys.append("Panel_2")

    dimensions.append(date_range)  # Date range is always included
    merge_keys.append("date")

    # Create a main DataFrame template with the dimensions
    main_df = pd.MultiIndex.from_product(
        dimensions,
        names=[name for name, _ in zip(merge_keys, dimensions)],
    ).to_frame(index=False)

    return main_df.reset_index(drop=True)
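
# Editor's sketch of the scaffold this builds (hypothetical panels and dates):
#
#   pd.MultiIndex.from_product(
#       [{"us", "ca"}, pd.date_range("2024-01-01", periods=2, freq="D")],
#       names=["Panel_1", "date"],
#   ).to_frame(index=False)
#   # -> 4 rows: every (Panel_1, date) combination, ready for left merges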


# Function to prepare and merge DataFrames
@st.cache_resource(show_spinner=False)
def merge_into_main_df(main_df, files_dict, selections):
    for file_name, file_data in files_dict.items():
        df = file_data["df"].copy()

        # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
        selected_panel1 = selections[file_name].get("Panel_1", "N/A")
        selected_panel2 = selections[file_name].get("Panel_2", "N/A")
        if selected_panel1 != "N/A":
            df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
        if selected_panel2 != "N/A":
            df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)

        # Merge the current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
        merge_keys = ["date"]
        if "Panel_1" in df.columns:
            merge_keys.append("Panel_1")
        if "Panel_2" in df.columns:
            merge_keys.append("Panel_2")
        main_df = pd.merge(main_df, df, on=merge_keys, how="left")

    # After all merges, sort by 'date' and reset index for cleanliness
    sort_by = ["date"]
    if "Panel_1" in main_df.columns:
        sort_by.append("Panel_1")
    if "Panel_2" in main_df.columns:
        sort_by.append("Panel_2")
    main_df.sort_values(by=sort_by, inplace=True)
    main_df.reset_index(drop=True, inplace=True)

    return main_df
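
# The merge pattern in miniature (editor's illustration, hypothetical frames):
#
#   scaffold = pd.DataFrame({"date": pd.to_datetime(["2024-01-01", "2024-01-02"])})
#   media = pd.DataFrame({"date": pd.to_datetime(["2024-01-01"]), "tv_spend": [100.0]})
#   pd.merge(scaffold, media, on=["date"], how="left")
#   # -> tv_spend is 100.0 on 01-01 and NaN on 01-02; NaNs are imputed later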


# Function to categorize a column by keyword matching on its name
def categorize_column(column_name):
    # Define keywords for each category
    internal_keywords = [
        "Price",
        "Discount",
        "product_price",
        "cost",
        "margin",
        "inventory",
        "sales",
        "revenue",
        "turnover",
        "expense",
    ]
    exogenous_keywords = [
        "GDP",
        "Tax",
        "Inflation",
        "interest_rate",
        "employment_rate",
        "exchange_rate",
        "consumer_spending",
        "retail_sales",
        "oil_prices",
        "weather",
    ]

    # Check if the column name matches any of the keywords for Internal or Exogenous categories.
    # Note: internal keywords are checked first, so e.g. "retail_sales" matches "sales"
    # and is tagged Internal before the exogenous list is ever consulted.
    for keyword in internal_keywords:
        if keyword.lower() in column_name.lower():
            return "Internal"
    for keyword in exogenous_keywords:
        if keyword.lower() in column_name.lower():
            return "Exogenous"

    # Default to Media if no match is found
    return "Media"


# Function to calculate missing-value stats and prepare an editable DataFrame
@st.cache_resource(show_spinner=False)
def prepare_missing_stats_df(df):
    missing_stats = []
    for column in df.columns:
        if (
            column == "date" or column == "Panel_2" or column == "Panel_1"
        ):  # Skip the date, Panel_1 and Panel_2 columns
            continue

        missing = df[column].isnull().sum()
        pct_missing = round((missing / len(df)) * 100, 2)

        # Dynamically assign category based on column name
        category = categorize_column(column)
        # category = "Media"  # Keep default bin as Media

        missing_stats.append(
            {
                "Column": column,
                "Missing Values": missing,
                "Missing Percentage": pct_missing,
                "Impute Method": "Fill with 0",  # Default value
                "Category": category,
            }
        )
    stats_df = pd.DataFrame(missing_stats)

    return stats_df


# Function to add API DataFrame details to the files dictionary
@st.cache_resource(show_spinner=False)
def add_api_dataframe_to_dict(main_df, files_dict):
    files_dict["API"] = {
        "numeric": list(main_df.select_dtypes(include=["number"]).columns),
        "non_numeric": [
            col
            for col in main_df.select_dtypes(exclude=["number"]).columns
            if col.lower() != "date"
        ],
        "interval": determine_data_interval(
            pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
        ),
        "df": main_df,
    }

    return files_dict


# Function to read the API data into a DataFrame, parsing the specified column as datetime
@st.cache_resource(show_spinner=False)
def read_API_data():
    return pd.read_excel(
        r".\upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"]
    )


# Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
def set_Panel_1_Panel_2_Selected_false():
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Function to serialize and save the objects into a pickle file
@st.cache_resource(show_spinner=False)
def save_to_pickle(file_path, final_df, bin_dict):
    # Open the file in write-binary mode and dump the objects
    with open(file_path, "wb") as f:
        pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
    # Data is now saved to file
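
# Round-trip sketch (editor's illustration) of what save_to_pickle writes;
# note that st.cache_resource memoizes this writer, so a call with identical
# inputs within a session will not re-write the file:
#
#   with open("data_import.pkl", "rb") as f:
#       saved = pickle.load(f)
#   final_df, bin_dict = saved["final_df"], saved["bin_dict"]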


# Function to process the merged_df DataFrame based on operations defined in edited_df
@st.cache_resource(show_spinner=False)
def process_dataframes(merged_df, edited_df, edited_stats_df):
    # Ensure there are operations defined by the user
    if edited_df.empty:
        return merged_df, edited_stats_df  # No operations to apply

    # Perform operations as defined by the user
    for index, row in edited_df.iterrows():
        result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
        col1 = row["Column 1"]
        col2 = row["Column 2"]
        op = row["Operator"]

        # Apply the specified operation
        if op == "+":
            merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
        elif op == "-":
            merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
        elif op == "*":
            merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
        elif op == "/":
            # Guard against division by zero with a small epsilon
            merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
                0, 1e-9
            )

        # Add a summary of the operation to edited_stats_df
        new_row = {
            "Column": result_column_name,
            "Missing Values": None,
            "Missing Percentage": None,
            "Impute Method": None,
            "Category": row["Category"],
        }
        new_row_df = pd.DataFrame([new_row])

        # Use pd.concat to add the new_row_df to edited_stats_df
        edited_stats_df = pd.concat(
            [edited_stats_df, new_row_df], ignore_index=True, axis=0
        )

    # Combine column names from edited_df for cleanup
    combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))

    # Filter out rows in edited_stats_df and drop source columns from merged_df
    edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
    merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)

    return merged_df, edited_stats_df
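
# One row of edited_df in action (editor's illustration): combining two media
# columns into one and dropping the sources afterwards:
#
#   toy = pd.DataFrame({"fb_spend": [1.0, 2.0], "ig_spend": [3.0, 4.0]})
#   toy["fb_spend+ig_spend"] = toy["fb_spend"] + toy["ig_spend"]
#   toy.drop(columns=["fb_spend", "ig_spend"], inplace=True)
#   # -> single combined column [4.0, 6.0]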


# Function to prepare a list of numeric column names and initialize an empty DataFrame with a predefined structure
@st.cache_resource(show_spinner=False)
def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
    # Get columns categorized as 'Response Metrics'
    columns_response_metrics = edited_stats_df[
        edited_stats_df["Category"] == "Response Metrics"
    ]["Column"].tolist()

    # Filter numeric columns, excluding those categorized as 'Response Metrics'
    numeric_columns = [
        col
        for col in merged_df.select_dtypes(include=["number"]).columns
        if col not in columns_response_metrics
    ]

    # Define the structure of the empty DataFrame
    data = {
        "Column 1": pd.Series([], dtype="str"),
        "Operator": pd.Series([], dtype="str"),
        "Column 2": pd.Series([], dtype="str"),
        "Category": pd.Series([], dtype="str"),
    }
    default_df = pd.DataFrame(data)

    return numeric_columns, default_df


# Initialize 'final_df' in session state
if "final_df" not in st.session_state:
    st.session_state["final_df"] = pd.DataFrame()

# Initialize 'bin_dict' in session state
if "bin_dict" not in st.session_state:
    st.session_state["bin_dict"] = {}

# Initialize 'Panel_1_Panel_2_Selected' in session state
if "Panel_1_Panel_2_Selected" not in st.session_state:
    st.session_state["Panel_1_Panel_2_Selected"] = False


# Page Title
st.write("")  # Top padding
st.title("Data Import")


#########################################################################################################################################################
# Create a dictionary to hold all DataFrames and collect user input to specify "Panel_1" and "Panel_2" columns for each file
#########################################################################################################################################################


# Read the Excel file, parsing the 'Date' column as datetime
main_df = read_API_data()

# Convert all column names to lowercase
main_df.columns = main_df.columns.str.lower().str.strip()

# File uploader
uploaded_files = st.file_uploader(
    "Upload additional data",
    type=["xlsx"],
    accept_multiple_files=True,
    on_change=set_Panel_1_Panel_2_Selected_false,
)

# Custom HTML for upload instructions
recommendation_html = f"""
<div style="text-align: justify;">
<strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets, including panel, media, internal, and exogenous data, adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
</div>
"""
st.markdown(recommendation_html, unsafe_allow_html=True)

# Choose Desired Granularity
st.markdown("#### Choose Desired Granularity")
# Granularity Selection
granularity_selection = st.selectbox(
    "Choose Date Granularity",
    ["Daily", "Weekly", "Monthly"],
    label_visibility="collapsed",
    on_change=set_Panel_1_Panel_2_Selected_false,
)
granularity_selection = str(granularity_selection).lower()

# Convert files to dataframes
files_dict = files_to_dataframes(uploaded_files)

# Add API DataFrame
if main_df is not None:
    files_dict = add_api_dataframe_to_dict(main_df, files_dict)

# Display a warning message if no files have been uploaded and halt further execution
if not files_dict:
    st.warning(
        "Please upload at least one file to proceed.",
        icon="⚠️",
    )
    st.stop()  # Halts further execution until a file is uploaded


# Select Panel_1 and Panel_2 columns
st.markdown("#### Select Panel columns")
selections = {}
with st.expander("Select Panel columns", expanded=False):
    count = 0  # Initialize counter to manage the visibility of labels and keys
    for file_name, file_data in files_dict.items():
        # Determine visibility of the label based on the count
        if count == 0:
            label_visibility = "visible"
        else:
            label_visibility = "collapsed"

        # Extract non-numeric columns
        non_numeric_cols = file_data["non_numeric"]

        # Prepare Panel_1 and Panel_2 values for the dropdowns, adding "N/A" as an option
        panel1_values = non_numeric_cols + ["N/A"]
        panel2_values = non_numeric_cols + ["N/A"]

        # Skip if only one option is available
        if len(panel1_values) == 1 and len(panel2_values) == 1:
            selected_panel1, selected_panel2 = "N/A", "N/A"
            # Update the selections for Panel_1 and Panel_2 for the current file
            selections[file_name] = {
                "Panel_1": selected_panel1,
                "Panel_2": selected_panel2,
            }
            continue

        # Create layout columns for File Name, Panel_1, and Panel_2 selections
        file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])

        with file_name_col:
            # Display "File Name" label only for the first file
            if count == 0:
                st.write("File Name")
            else:
                st.write("")
            st.write(file_name)  # Display the file name

        with Panel_1_col:
            # Display a selectbox for Panel_1 values
            selected_panel1 = st.selectbox(
                "Select Panel Level 1",
                panel1_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_1_selectbox{count}",  # Ensure unique key for each selectbox
            )

        with Panel_2_col:
            # Display a selectbox for Panel_2 values
            selected_panel2 = st.selectbox(
                "Select Panel Level 2",
                panel2_values,
                on_change=set_Panel_1_Panel_2_Selected_false,
                label_visibility=label_visibility,  # Control visibility of the label
                key=f"Panel_2_selectbox{count}",  # Ensure unique key for each selectbox
            )

        # Halt processing if the same column is selected for both Panel_1 and Panel_2, due to potential data integrity issues
        if selected_panel2 == selected_panel1 and not (
            selected_panel2 == "N/A" and selected_panel1 == "N/A"
        ):
            st.warning(
                f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
            )
            selected_panel1, selected_panel2 = "N/A", "N/A"
            st.stop()

        # Update the selections for Panel_1 and Panel_2 for the current file
        selections[file_name] = {
            "Panel_1": selected_panel1,
            "Panel_2": selected_panel2,
        }

        count += 1  # Increment the counter after processing each file

# Accept Panel_1 and Panel_2 selection
if st.button("Accept and Process", use_container_width=True):

    # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
    with st.spinner("Processing..."):
        files_dict = standardize_data_to_daily(files_dict, selections)

        # Convert all data to the selected level of granularity
        files_dict = apply_granularity_to_all(
            files_dict, granularity_selection, selections
        )

        # Update the 'files_dict' in the session state
        st.session_state["files_dict"] = files_dict

        # Set a flag in the session state to indicate that a selection has been made
        st.session_state["Panel_1_Panel_2_Selected"] = True


#########################################################################################################################################################
# Display unique Panel_1 and Panel_2 values
#########################################################################################################################################################


# Halt further execution until Panel_1 and Panel_2 columns are selected
if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
    files_dict = st.session_state["files_dict"]
else:
    st.stop()

# Sets to store unique values of Panel_1 and Panel_2
with st.spinner("Fetching Panel values..."):
    all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
        files_dict, selections
    )

# Lists of unique Panel_1 and Panel_2 column values
list_of_all_panel1_values = list(all_panel1_values)
list_of_all_panel2_values = list(all_panel2_values)

# Format Panel_1 and Panel_2 values for display
formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)

# Unique Panel_1 and Panel_2 values
st.markdown("#### Unique Panel values")
# Display Panel_1 and Panel_2 values
with st.expander("Unique Panel values"):
    st.write("")
    st.markdown(
        f"""
        <style>
        .justify-text {{
            text-align: justify;
        }}
        </style>
        <div class="justify-text">
            <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
            <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
        </div>
        """,
        unsafe_allow_html=True,
    )

    # Display total Panel_1 and Panel_2 counts
    st.write("")
    st.markdown(
        f"""
        <div style="text-align: justify;">
            <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
            <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
        </div>
        """,
        unsafe_allow_html=True,
    )
    st.write("")


#########################################################################################################################################################
# Merge all DataFrames
#########################################################################################################################################################


# Merge all selected DataFrames
main_df = create_main_dataframe(
    files_dict, all_panel1_values, all_panel2_values, granularity_selection
)
merged_df = merge_into_main_df(main_df, files_dict, selections)


#########################################################################################################################################################
# Categorize Variables and Impute Missing Values
#########################################################################################################################################################


# Create an editable DataFrame in Streamlit
st.markdown("#### Select Variables Category & Impute Missing Values")

# Prepare the missing stats DataFrame for editing
missing_stats_df = prepare_missing_stats_df(merged_df)

edited_stats_df = st.data_editor(
    missing_stats_df,
    column_config={
        "Impute Method": st.column_config.SelectboxColumn(
            options=[
                "Drop Column",
                "Fill with Mean",
                "Fill with Median",
                "Fill with 0",
            ],
            required=True,
            default="Fill with 0",
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
        ),
    },
    disabled=["Column", "Missing Values", "Missing Percentage"],
    hide_index=True,
    use_container_width=True,
)

# Apply changes based on the edited DataFrame
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    if row["Impute Method"] == "Drop Column":
        merged_df.drop(columns=[column], inplace=True)

    elif row["Impute Method"] == "Fill with Mean":
        merged_df[column].fillna(merged_df[column].mean(), inplace=True)

    elif row["Impute Method"] == "Fill with Median":
        merged_df[column].fillna(merged_df[column].median(), inplace=True)

    elif row["Impute Method"] == "Fill with 0":
        merged_df[column].fillna(0, inplace=True)
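
# Editor's note: per-column `fillna(..., inplace=True)` as used above still works,
# but chained inplace fills are being phased out under pandas copy-on-write; an
# equivalent, future-proof form of the branches above is plain reassignment:
#
#   merged_df[column] = merged_df[column].fillna(merged_df[column].median())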


#########################################################################################################################################################
# Group columns
#########################################################################################################################################################


# Display Group columns header
st.markdown("#### Feature engineering")

# Prepare the numeric columns and an empty DataFrame for user input
numeric_columns, default_df = prepare_numeric_columns_and_default_df(
    merged_df, edited_stats_df
)

# Display editable DataFrame
edited_df = st.data_editor(
    default_df,
    column_config={
        "Column 1": st.column_config.SelectboxColumn(
            options=numeric_columns,
            required=True,
            default=numeric_columns[0],
            width=400,
        ),
        "Operator": st.column_config.SelectboxColumn(
            options=["+", "-", "*", "/"],
            required=True,
            default="+",
            width=100,
        ),
        "Column 2": st.column_config.SelectboxColumn(
            options=numeric_columns,
            required=True,
            default=numeric_columns[0],
            width=400,
        ),
        "Category": st.column_config.SelectboxColumn(
            options=[
                "Media",
                "Exogenous",
                "Internal",
                "Response Metrics",
            ],
            required=True,
            default="Media",
            width=200,
        ),
    },
    num_rows="dynamic",
)

# Process the DataFrame based on user inputs and operations specified in edited_df
final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)


#########################################################################################################################################################
# Display the Final DataFrame and variables
#########################################################################################################################################################


# Display the Final DataFrame and variables
st.markdown("#### Final DataFrame")
st.dataframe(final_df, hide_index=True)

# Initialize an empty dictionary to hold categories and their variables
category_dict = {}

# Iterate over each row in the edited DataFrame to populate the dictionary
for i, row in edited_stats_df.iterrows():
    column = row["Column"]
    category = row["Category"]  # The category chosen by the user for this variable

    # Check if the category already exists in the dictionary
    if category not in category_dict:
        # If not, initialize it with the current column as its first element
        category_dict[category] = [column]
    else:
        # If it exists, append the current column to the list of variables under this category
        category_dict[category].append(column)

# Add Date, Panel_1 and Panel_2 to the category dictionary
category_dict.update({"Date": ["date"]})
if "Panel_1" in final_df.columns:
    category_dict["Panel Level 1"] = ["Panel_1"]
if "Panel_2" in final_df.columns:
    category_dict["Panel Level 2"] = ["Panel_2"]

# Display the dictionary
st.markdown("#### Variable Category")
for category, variables in category_dict.items():
    # Check if there are multiple variables to handle "and" insertion correctly
    if len(variables) > 1:
        # Join all but the last variable with ", ", then add " and " before the last variable
        variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
    else:
        # If there's only one variable, no need for "and"
        variables_str = variables[0]

    # Display the category and its variables in the desired format
    st.markdown(
        f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
        unsafe_allow_html=True,
    )

# Check that a Response Metrics column has been selected
st.write("")
response_metrics_col = category_dict.get("Response Metrics", [])
if len(response_metrics_col) == 0:
    st.warning("Please select a Response Metrics column", icon="⚠️")
    st.stop()
# elif len(response_metrics_col) > 1:
#     st.warning("Please select only one Response Metrics column", icon="⚠️")
#     st.stop()

# Store the final dataframe and bin dictionary in session state
st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict

# Save the DataFrame and dictionary from the session state to the pickle file
if st.button("Accept and Save", use_container_width=True):
    save_to_pickle(
        "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
    )
    st.toast("💾 Saved Successfully!")

Data_prep_functions.py
CHANGED
@@ -86,76 +86,89 @@ def create_dual_axis_line_chart(date_series, promo_price_series, non_promo_price
 def to_percentage(value):
     return f'{value * 100:.1f}%'
 
+def plot_actual_vs_predicted(date, y, predicted_values, model, target_column=None, flag=None, repeat_all_years=False, is_panel=False):
+    if flag is not None:
+        fig = make_subplots(specs=[[{"secondary_y": True}]])
+    else:
+        fig = go.Figure()
+
+    if is_panel:
+        df = pd.DataFrame()
+        df['date'] = date
+        df['Actual'] = y
+        df['Predicted'] = predicted_values
+        df_agg = df.groupby('date').agg({'Actual': 'sum', 'Predicted': 'sum'}).reset_index()
+        df_agg.columns = ['date', 'Actual', 'Predicted']
+        assert len(df_agg) == pd.Series(date).nunique()
+
+        fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
+        fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))
+
+    else:
+        fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
+        fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))
+
+    line_values = []
     if flag:
+        min_date, max_date = flag[0], flag[1]
+        min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U")
+        max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U")
+        month = pd.to_datetime(min_date).month
+        day = pd.to_datetime(min_date).day
+
+        # Sprint3 change: put flags on the secondary axis, and set their y value to 1 instead of 5M
+        if repeat_all_years:
+            line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >= int(min_week)) & (pd.Timestamp(x).week <= int(max_week)) else 0))
+            assert len(line_values) == len(date)
+            fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
+        else:
+            line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0))
+            fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
+
+    # Calculate MAPE
     mape = mean_absolute_percentage_error(y, predicted_values)
+
+    # Calculate adjusted R-squared, using the fixed-effects parameter count of the model
     r2 = r2_score(y, predicted_values)
     adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1)
 
+    # Create a table to display the metrics
     metrics_table = pd.DataFrame({
+        'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
+        'Value': [mape, r2, adjr2]
     })
-    # Convert date to datetime
-    date = pd.to_datetime(date)
-    # Calculate the number of days between each tick based on the date range
-    date_range = (max(date) - min(date)).days
-    # x_axis_tick_spacing = max(1, date_range // 50)  # Divide the date range to get approximately 15 ticks
     fig.update_layout(
+        xaxis=dict(title='Date'),
+        yaxis=dict(title=target_column),
+        xaxis_tickangle=-30
     )
     fig.add_annotation(
+        text=f"MAPE: {mape*100:0.1f}%, AdjR2: {adjr2*100:.1f}%",
+        xref="paper",
+        yref="paper",
+        x=0.95,  # Adjust these values to position the annotation
+        y=1.2,
+        showarrow=False,
    )
+    return metrics_table, line_values, fig
 
 def plot_residual_predicted(actual, predicted, df):
     df_=df.copy()
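
A quick standalone check of the metrics computed above (editor's sketch; `k` stands in for len(model.fe_params)):

    import numpy as np
    from sklearn.metrics import mean_absolute_percentage_error, r2_score

    y = np.array([10.0, 12.0, 11.0, 13.0])
    yhat = np.array([9.5, 12.5, 11.2, 12.8])
    mape = mean_absolute_percentage_error(y, yhat)  # ≈ 0.031
    r2 = r2_score(y, yhat)
    k = 1  # hypothetical fixed-effects parameter count
    adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - k - 1)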

Model/model_0.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e25f247a6804043e242b2a688b9b5ca840bce3da95bfd52863f33cd1a83ce2e2
+size 3160085

Model/model_1.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8179ad18c0894ab80fc5bc7daf85da4c29a0d79989a04fdfb3fe448bae00c582
+size 3160085

Model/model_2.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fc881f53a6a3dbca759c116f200606d946a48a1342dbabf75c84802df9cacd0d
+size 3160100

Model/model_3.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e70ce7759767772d382d41e509022338fb35efc361367d488d876494ff0a915e
+size 3160100

Model/model_4.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4227165221399f4430d82db0afdb68af986789d38efee3cabbba07db2b286759
+size 3160079

Overview_data_test_panel@#app_installs.xlsx
ADDED
Binary file (28.1 kB)

Overview_data_test_panel@#revenue.xlsx
ADDED
Binary file (28.1 kB)

Overview_data_test_panelreplace_meapp_installs.xlsx
ADDED
Binary file (28.1 kB)
README.md
CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.32.1
-app_file:
+app_file: app.py
 pinned: false
 ---
 
Test/merged_df_contri.csv
ADDED
The diff for this file is too large to render.

Test/output_df.csv
ADDED
@@ -0,0 +1,16 @@
+Date,ga_app,kwai,fb_level_achieved_tier_1,fb_level_achieved_tier_2,paid_search,programmatic,digital_tactic_others,const
+2023-08-28,-0.35397136801899115,66.71033836253882,174.81135610042415,3594.2897643195533,150.11933145463811,297.6127449335578,3.5410008622400904,17088.40000435223
+2023-09-04,-0.27103315177908194,72.74975974406102,194.2113426820265,3860.2811580984967,158.86380529049313,335.6495710921645,3.925829952759959,18800.68227530415
+2023-09-11,-0.18972320624535735,73.56577516680593,203.22388059872446,3779.8710519990336,154.802475164174,311.6912405544196,4.084632969053109,18923.5133215299
+2023-09-18,-0.13280624437175015,72.56271222233786,200.12443813391138,3681.8929038825913,155.7098287205689,311.05112450305245,4.051292175357099,19046.344367755646
+2023-09-25,-0.09296437106022509,73.70727325917034,205.74779198138953,3858.65253006403,155.02816278138727,312.86845990465827,3.9265582664040735,19169.175413981393
+2023-10-02,-0.06507505974215756,72.34256892327214,205.58713073299748,3726.8536377627233,159.8242700571235,315.5994755570924,3.955202754813552,19292.006460207143
+2023-10-09,-0.045552541819510295,74.38740927114137,207.0439308259877,3845.7054965140105,166.1387318784968,318.3770263805087,3.9670100767811185,19414.83750643289
+2023-10-16,-0.031886779273657205,73.92804257031634,209.0350517896794,3749.259107713571,158.5179131618084,308.27664915352324,3.935545074442725,19537.66855265864
+2023-10-23,-0.02232074549156004,74.2265721786869,214.96921278574305,3766.838626589657,155.11867956784573,298.7838125908522,3.8717920437881834,19660.499598884388
+2023-10-30,-0.015624521844092026,73.13776666139266,215.11994117361186,3861.8716038759217,150.99199274844668,305.8173177680258,3.8593412414854895,19783.330645110138
+2023-11-06,-0.010937165290864418,73.92209125196376,208.19044332496705,3939.163063071122,155.63698971642444,320.41327017703395,3.844088730158042,19906.161691335885
+2023-11-13,-0.007656015703605092,75.65843124761166,208.86440994169482,3793.1062744683286,156.5242431409553,320.3204189984107,4.021312960163909,20028.99273756163
+2023-11-20,-0.005359210992523565,73.88051276100926,218.40774072300528,3684.900260569517,163.258344706366,322.7402649826382,4.0473156754345965,20151.823783787382
+2023-11-27,-0.0037514476947664945,72.1846283175467,213.20545855013495,3856.792298375503,167.13396999671053,332.60329700992924,3.949159871187085,20274.65483001313
+2023-12-04,-0.002626013386336546,72.23564873518644,203.08444230779233,3848.078121929866,167.24638929455608,325.2003051931162,3.9989148636147225,20397.485876238876

Test/scenario_test_df.csv
ADDED
@@ -0,0 +1,16 @@
+other_contributions,correction,sales
+17088.04603298421,-215.4682810582599,4502.552817091212
+18800.41124215237,74.62945753481836,4551.0520093251835
+18923.323598323655,-24.472395662971394,4551.711452115181
+19046.211561511274,-125.71083540064501,4551.1031350384665
+19169.082449610334,59.723662814169074,4550.207113442869
+19291.941385147402,-62.72601966545335,4546.888305453476
+19414.791953891072,67.80597281407609,4547.8136321328475
+19537.636665879367,-49.327276753389015,4552.279586216728
+19660.477278138897,-34.96735624499706,4548.776052001568
+19783.315020588292,63.3505618488889,4547.4474016199965
+19906.150754170594,157.53118273497603,4543.63876353669
+20028.98508154593,8.48155599979873,4550.013534757365
+20151.81842457639,-76.79487376436737,4544.029313182335
+20274.651078565435,90.96984069810424,4554.898971422908
+20397.48325022549,65.02213269566346,4554.821689628467

Test/x_test_contribution.csv
ADDED
The diff for this file is too large to render.

Test/x_test_to_save.csv
CHANGED
The diff for this file is too large to render.

Test/x_train_contribution.csv
ADDED
The diff for this file is too large to render.

Test/x_train_to_save.csv
CHANGED
The diff for this file is too large to render.

best_models.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:63e3de089d3f2a199a396228c6c0cf7f5db60c36fe3b7a6fb5cf3e74a92ae304
+size 4095026
classes.py
CHANGED
@@ -16,21 +16,15 @@ def class_to_dict(class_instance):
         attr_dict["modified_spends"] = class_instance.modified_spends
         attr_dict["modified_sales"] = class_instance.modified_sales
         attr_dict["response_curve_type"] = class_instance.response_curve_type
-        attr_dict["response_curve_params"] = (
-            class_instance.response_curve_params
-        )
         attr_dict["penalty"] = class_instance.penalty
        attr_dict["bounds"] = class_instance.bounds
        attr_dict["actual_total_spends"] = class_instance.actual_total_spends
        attr_dict["actual_total_sales"] = class_instance.actual_total_sales
-        attr_dict["modified_total_spends"] = (
-            class_instance.modified_total_spends
-        )
        attr_dict["modified_total_sales"] = class_instance.modified_total_sales
        attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
-        attr_dict["modified_mroi"] = class_instance.get_marginal_roi(
-            "modified"
-        )
 
     elif isinstance(class_instance, Scenario):
         attr_dict["type"] = "Scenario"
@@ -43,9 +37,7 @@ def class_to_dict(class_instance):
         attr_dict["correction"] = class_instance.correction
         attr_dict["actual_total_spends"] = class_instance.actual_total_spends
         attr_dict["actual_total_sales"] = class_instance.actual_total_sales
-        attr_dict["modified_total_spends"] = (
-            class_instance.modified_total_spends
-        )
         attr_dict["modified_total_sales"] = class_instance.modified_total_sales
 
     return attr_dict
@@ -95,9 +87,7 @@ class Channel:
         self.modified_sales = self.calculate_sales()
         self.modified_total_spends = self.modified_spends.sum()
         self.modified_total_sales = self.modified_sales.sum()
-        self.delta_spends = (
-            self.modified_total_spends - self.actual_total_spends
-        )
         self.delta_sales = self.modified_total_sales - self.actual_total_sales
 
     def update_penalty(self, penalty):
@@ -119,8 +109,7 @@ class Channel:
         x = np.where(
             x < self.upper_limit,
             x,
-            self.upper_limit
-            + (x - self.upper_limit) * self.upper_limit / x,
         )
         if self.response_curve_type == "s-curve":
             if self.power >= 0:
@@ -169,9 +158,7 @@ class Channel:
         self.modified_sales = self.calculate_sales()
         self.modified_total_spends = self.modified_spends.sum()
         self.modified_total_sales = self.modified_sales.sum()
-        self.delta_spends = (
-            self.modified_total_spends - self.actual_total_spends
-        )
         self.delta_sales = self.modified_total_sales - self.actual_total_sales
 
     def intialize(self):
@@ -208,9 +195,7 @@ class Scenario:
         self.actual_total_sales = self.calculate_actual_total_sales()
         self.modified_total_sales = self.calculate_modified_total_sales()
        self.modified_total_spends = self.calculate_modified_total_spends()
-        self.delta_spends = (
-            self.modified_total_spends - self.actual_total_spends
-        )
        self.delta_sales = self.modified_total_sales - self.actual_total_sales
 
    def update_penalty(self, value):
@@ -220,9 +205,7 @@ class Scenario:
    def calculate_modified_total_spends(self):
        total_actual_spends = 0.0
        for channel in self.channels.values():
-            total_actual_spends += (
-                channel.actual_total_spends * channel.conversion_rate
-            )
        return total_actual_spends
 
    def calculate_modified_total_spends(self):
@@ -251,12 +234,47 @@ class Scenario:
        self.channels[channel_name].update(modified_spends)
        self.modified_total_sales = self.calculate_modified_total_sales()
        self.modified_total_spends = self.calculate_modified_total_spends()
-        self.delta_spends = (
-            self.modified_total_spends - self.actual_total_spends
-        )
        self.delta_sales = self.modified_total_sales - self.actual_total_sales
 
-    def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
        desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
 
        def constraint(x):
@@ -285,7 +303,7 @@ class Scenario:
            x0=initial_point,
            constraints=constraints,
            method=algo,
-            options={"maxiter": int(2e7), "
        )
 
        for channel_name, modified_spends in zip(channels_list, res.x):
@@ -317,14 +335,11 @@ class Scenario:
        for channel_name in channels_list:
            _channel_class = self.channels[channel_name]
            channel_bounds = _channel_class.bounds
-            channel_actual_total_spends = (
-                _channel_class.actual_total_spends
-                * ((1 + spends_percent / 100))
            )
            old_spends.append(channel_actual_total_spends)
-            bounds.append(
-                (1 + channel_bounds / 100) * channel_actual_total_spends
-            )
 
        def objective_function(x):
            for channel_name, modified_spends in zip(channels_list, x):
@@ -332,12 +347,12 @@ class Scenario:
            return -1 * self.modified_total_sales
 
        res = minimize(
-            lambda x
            method="trust-constr",
            x0=old_spends,
            constraints=constraint,
            bounds=bounds,
-            options={"maxiter": int(1e7),
        )
        # res = dual_annealing(
        #     objective_function,
@@ -361,81 +376,91 @@ class Scenario:
        channel_data = []
 
        summary_rows = []
-        actual_list.append(
        for channel in self.channels.values():
            name_mod = channel.name.replace("_", " ")
            if name_mod.lower().endswith(" imp"):
                name_mod = name_mod.replace("Imp", " Impressions")
-            summary_rows.append(
-                channel.actual_total_sales / channel.actual_total_spends, 2
-            ])
            data[channel.name] = channel.modified_spends
            data["Date"] = channel.dates
            data["Sales"] = (
                data.get("Sales", np.zeros((len(channel.dates),)))
                + channel.modified_sales
            )
-            actual_list.append(
-            channel.
        details["Actual"] = actual_list
        details["Modified"] = modified_list
        columns_index = pd.MultiIndex.from_product(
@@ -467,8 +492,7 @@ class Scenario:
    def from_dict(cls, attr_dict):
        channels_list = attr_dict["channels"]
        channels = {
-            channel["name"]: class_from_dict(channel)
-            for channel in channels_list
        }
        return Scenario(
            name=attr_dict["name"],
@@ -16,21 +16,15 @@ def class_to_dict(class_instance):
         attr_dict["modified_spends"] = class_instance.modified_spends
         attr_dict["modified_sales"] = class_instance.modified_sales
         attr_dict["response_curve_type"] = class_instance.response_curve_type
+        attr_dict["response_curve_params"] = class_instance.response_curve_params
         attr_dict["penalty"] = class_instance.penalty
         attr_dict["bounds"] = class_instance.bounds
         attr_dict["actual_total_spends"] = class_instance.actual_total_spends
         attr_dict["actual_total_sales"] = class_instance.actual_total_sales
+        attr_dict["modified_total_spends"] = class_instance.modified_total_spends
         attr_dict["modified_total_sales"] = class_instance.modified_total_sales
         attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
+        attr_dict["modified_mroi"] = class_instance.get_marginal_roi("modified")
 
     elif isinstance(class_instance, Scenario):
         attr_dict["type"] = "Scenario"
@@ -43,9 +37,7 @@ def class_to_dict(class_instance):
         attr_dict["correction"] = class_instance.correction
         attr_dict["actual_total_spends"] = class_instance.actual_total_spends
         attr_dict["actual_total_sales"] = class_instance.actual_total_sales
+        attr_dict["modified_total_spends"] = class_instance.modified_total_spends
         attr_dict["modified_total_sales"] = class_instance.modified_total_sales
 
     return attr_dict
@@ -95,9 +87,7 @@ class Channel:
         self.modified_sales = self.calculate_sales()
         self.modified_total_spends = self.modified_spends.sum()
         self.modified_total_sales = self.modified_sales.sum()
+        self.delta_spends = self.modified_total_spends - self.actual_total_spends
         self.delta_sales = self.modified_total_sales - self.actual_total_sales
 
     def update_penalty(self, penalty):
@@ -119,8 +109,7 @@ class Channel:
         x = np.where(
             x < self.upper_limit,
             x,
+            self.upper_limit + (x - self.upper_limit) * self.upper_limit / x,
         )
         if self.response_curve_type == "s-curve":
             if self.power >= 0:
|
|
158 |
self.modified_sales = self.calculate_sales()
|
159 |
self.modified_total_spends = self.modified_spends.sum()
|
160 |
self.modified_total_sales = self.modified_sales.sum()
|
161 |
+
self.delta_spends = self.modified_total_spends - self.actual_total_spends
|
|
|
|
|
162 |
self.delta_sales = self.modified_total_sales - self.actual_total_sales
|
163 |
|
164 |
def intialize(self):
|
|
|
195 |
self.actual_total_sales = self.calculate_actual_total_sales()
|
196 |
self.modified_total_sales = self.calculate_modified_total_sales()
|
197 |
self.modified_total_spends = self.calculate_modified_total_spends()
|
198 |
+
self.delta_spends = self.modified_total_spends - self.actual_total_spends
|
|
|
|
|
199 |
self.delta_sales = self.modified_total_sales - self.actual_total_sales
|
200 |
|
201 |
def update_penalty(self, value):
|
|
|
205 |
def calculate_modified_total_spends(self):
|
206 |
total_actual_spends = 0.0
|
207 |
for channel in self.channels.values():
|
208 |
+
total_actual_spends += channel.actual_total_spends * channel.conversion_rate
|
|
|
|
|
209 |
return total_actual_spends
|
210 |
|
211 |
def calculate_modified_total_spends(self):
|
|
|
234 |
self.channels[channel_name].update(modified_spends)
|
235 |
self.modified_total_sales = self.calculate_modified_total_sales()
|
236 |
self.modified_total_spends = self.calculate_modified_total_spends()
|
237 |
+
self.delta_spends = self.modified_total_spends - self.actual_total_spends
|
|
|
|
|
238 |
self.delta_sales = self.modified_total_sales - self.actual_total_sales
|
239 |
|
240 |
+
# def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
|
241 |
+
# desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
|
242 |
+
|
243 |
+
# def constraint(x):
|
244 |
+
# for ch, spends in zip(channels_list, x):
|
245 |
+
# self.update(ch, spends)
|
246 |
+
# return self.modified_total_sales - desired_sales
|
247 |
+
|
248 |
+
# bounds = []
|
249 |
+
# for ch in channels_list:
|
250 |
+
# bounds.append(
|
251 |
+
# (1 + np.array([-50.0, 100.0]) / 100.0)
|
252 |
+
# * self.channels[ch].actual_total_spends
|
253 |
+
# )
|
254 |
+
|
255 |
+
# initial_point = []
|
256 |
+
# for bound in bounds:
|
257 |
+
# initial_point.append(bound[0])
|
258 |
+
|
259 |
+
# power = np.ceil(np.log(sum(initial_point)) / np.log(10))
|
260 |
+
|
261 |
+
# constraints = [NonlinearConstraint(constraint, -1.0, 1.0)]
|
262 |
+
|
263 |
+
# res = minimize(
|
264 |
+
# lambda x: sum(x) / 10 ** (power),
|
265 |
+
# bounds=bounds,
|
266 |
+
# x0=initial_point,
|
267 |
+
# constraints=constraints,
|
268 |
+
# method=algo,
|
269 |
+
# options={"maxiter": int(2e7), "catol": 1},
|
270 |
+
# )
|
271 |
+
|
272 |
+
# for channel_name, modified_spends in zip(channels_list, res.x):
|
273 |
+
# self.update(channel_name, modified_spends)
|
274 |
+
|
275 |
+
# return zip(channels_list, res.x)
|
276 |
+
|
277 |
+
def optimize_spends(self, sales_percent, channels_list, algo="trust-constr"):
|
278 |
desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
|
279 |
|
280 |
def constraint(x):
|
|
|
303 |
x0=initial_point,
|
304 |
constraints=constraints,
|
305 |
method=algo,
|
306 |
+
options={"maxiter": int(2e7), "xtol": 100},
|
307 |
)
|
308 |
|
309 |
for channel_name, modified_spends in zip(channels_list, res.x):
|
|
|
335 |
for channel_name in channels_list:
|
336 |
_channel_class = self.channels[channel_name]
|
337 |
channel_bounds = _channel_class.bounds
|
338 |
+
channel_actual_total_spends = _channel_class.actual_total_spends * (
|
339 |
+
(1 + spends_percent / 100)
|
|
|
340 |
)
|
341 |
old_spends.append(channel_actual_total_spends)
|
342 |
+
bounds.append((1 + channel_bounds / 100) * channel_actual_total_spends)
|
|
|
|
|
343 |
|
344 |
def objective_function(x):
|
345 |
for channel_name, modified_spends in zip(channels_list, x):
|
|
|
347 |
return -1 * self.modified_total_sales
|
348 |
|
349 |
res = minimize(
|
350 |
+
lambda x: objective_function(x) / 1e8,
|
351 |
method="trust-constr",
|
352 |
x0=old_spends,
|
353 |
constraints=constraint,
|
354 |
bounds=bounds,
|
355 |
+
options={"maxiter": int(1e7), "xtol": 100},
|
356 |
)
|
357 |
# res = dual_annealing(
|
358 |
# objective_function,
|
|
|
376 |
channel_data = []
|
377 |
|
378 |
summary_rows = []
|
379 |
+
actual_list.append(
|
380 |
+
{
|
381 |
+
"name": "Total",
|
382 |
+
"Spends": self.actual_total_spends,
|
383 |
+
"Sales": self.actual_total_sales,
|
384 |
+
}
|
385 |
+
)
|
386 |
+
modified_list.append(
|
387 |
+
{
|
388 |
+
"name": "Total",
|
389 |
+
"Spends": self.modified_total_spends,
|
390 |
+
"Sales": self.modified_total_sales,
|
391 |
+
}
|
392 |
+
)
|
393 |
for channel in self.channels.values():
|
394 |
name_mod = channel.name.replace("_", " ")
|
395 |
if name_mod.lower().endswith(" imp"):
|
396 |
name_mod = name_mod.replace("Imp", " Impressions")
|
397 |
+
summary_rows.append(
|
398 |
+
[
|
399 |
+
name_mod,
|
400 |
+
channel.actual_total_spends,
|
401 |
+
channel.modified_total_spends,
|
402 |
+
channel.actual_total_sales,
|
403 |
+
channel.modified_total_sales,
|
404 |
+
round(channel.actual_total_sales / channel.actual_total_spends, 2),
|
405 |
+
round(
|
406 |
+
channel.modified_total_sales / channel.modified_total_spends,
|
407 |
+
2,
|
408 |
+
),
|
409 |
+
channel.get_marginal_roi("actual"),
|
410 |
+
channel.get_marginal_roi("modified"),
|
411 |
+
]
|
412 |
+
)
|
|
|
413 |
data[channel.name] = channel.modified_spends
|
414 |
data["Date"] = channel.dates
|
415 |
data["Sales"] = (
|
416 |
data.get("Sales", np.zeros((len(channel.dates),)))
|
417 |
+ channel.modified_sales
|
418 |
)
|
419 |
+
actual_list.append(
|
420 |
+
{
|
421 |
+
"name": channel.name,
|
422 |
+
"Spends": channel.actual_total_spends,
|
423 |
+
"Sales": channel.actual_total_sales,
|
424 |
+
"ROI": round(
|
425 |
+
channel.actual_total_sales / channel.actual_total_spends, 2
|
426 |
+
),
|
427 |
+
}
|
428 |
+
)
|
429 |
+
modified_list.append(
|
430 |
+
{
|
431 |
+
"name": channel.name,
|
432 |
+
"Spends": channel.modified_total_spends,
|
433 |
+
"Sales": channel.modified_total_sales,
|
434 |
+
"ROI": round(
|
435 |
+
channel.modified_total_sales / channel.modified_total_spends,
|
436 |
+
2,
|
437 |
+
),
|
438 |
+
"Marginal ROI": channel.get_marginal_roi("modified"),
|
439 |
+
}
|
440 |
+
)
|
441 |
+
|
442 |
+
channel_data.append(
|
443 |
+
{
|
444 |
+
"channel": channel.name,
|
445 |
+
"spends_act": channel.actual_total_spends,
|
446 |
+
"spends_mod": channel.modified_total_spends,
|
447 |
+
"sales_act": channel.actual_total_sales,
|
448 |
+
"sales_mod": channel.modified_total_sales,
|
449 |
+
}
|
450 |
+
)
|
451 |
+
summary_rows.append(
|
452 |
+
[
|
453 |
+
"Total",
|
454 |
+
self.actual_total_spends,
|
455 |
+
self.modified_total_spends,
|
456 |
+
self.actual_total_sales,
|
457 |
+
self.modified_total_sales,
|
458 |
+
round(self.actual_total_sales / self.actual_total_spends, 2),
|
459 |
+
round(self.modified_total_sales / self.modified_total_spends, 2),
|
460 |
+
0.0,
|
461 |
+
0.0,
|
462 |
+
]
|
463 |
+
)
|
464 |
details["Actual"] = actual_list
|
465 |
details["Modified"] = modified_list
|
466 |
columns_index = pd.MultiIndex.from_product(
|
|
|
492 |
def from_dict(cls, attr_dict):
|
493 |
channels_list = attr_dict["channels"]
|
494 |
channels = {
|
495 |
+
channel["name"]: class_from_dict(channel) for channel in channels_list
|
|
|
496 |
}
|
497 |
return Scenario(
|
498 |
name=attr_dict["name"],
|
data_import.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d759e0caf40a5cf6ddfe5c391289fa964363652dba2ffe919fa1ab7c6b4399ec
+size 2246178

data_test_overview_panel_#total_approved_accounts_revenue.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:763047805d36dca3502a6ed9c6dcee9a0c99c945ee92bb61a7c0f6647486a96c
+size 1637428

final_df_transformed.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d775eda5ee0172e1511622b69b301023cdf2c5dbe74bb62d79264fe926eee1b
+size 19479046

metrics_level_data/Overview_data_test_panel@#app_installs.xlsx
ADDED
Binary file (28.1 kB)

metrics_level_data/Overview_data_test_panel@#revenue.xlsx
ADDED
Binary file (28.1 kB)
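
The three text stubs above are Git LFS pointer files: only the `version`/`oid`/`size` metadata is committed, so the real payloads must be fetched with `git lfs pull` before the app can read them. For reference, a short sketch of how the pages below consume `data_import.pkl` once it is materialized; the `final_df` and `bin_dict` keys are the ones those pages actually read.

```python
# Sketch: reading the data_import.pkl artifact the way the Streamlit pages do.
# Requires the real file (run `git lfs pull` first); a bare 3-line LFS pointer
# is not a valid pickle.
import pickle

with open("data_import.pkl", "rb") as f:
    data = pickle.load(f)

final_df = data["final_df"]   # cleaned, model-ready DataFrame
bin_dict = data["bin_dict"]   # column categories: Media, Internal, Exogenous, ...
print(final_df.shape, list(bin_dict.keys()))
```
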
model_output.csv
CHANGED
@@ -1,11 +1,6 @@
-,Model_object,Model_iteration,Feature_set,MAPE,R2,ADJR2
-0,Model/model_0.pkl,0,"['
-1,Model/model_1.pkl,1,"['
-2,Model/model_2.pkl,2,"['
-3,Model/model_3.pkl,3,"['
-4,Model/model_4.pkl,4,"['
-5,Model/model_5.pkl,5,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_clicks_lag_3']",0.21221059311555665,0.8436849097221487,0.843547997105539
-6,Model/model_6.pkl,6,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_impressions']",0.21023311688137142,0.8421414101917525,0.8420031456611397
-7,Model/model_7.pkl,7,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_impressions_lag_3']",0.21230002407340917,0.8438639613954715,0.843727205605903
-8,Model/model_8.pkl,8,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_impressions_lag_2', 'programmatic_clicks']",0.21138525009178905,0.8446253227642725,0.8444892338327598
-9,Model/model_9.pkl,9,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_impressions_lag_2', 'programmatic_clicks_lag_3']",0.2123701406564611,0.8464957579981922,0.8463613073357782
+,Model_object,Model_iteration,Feature_set,MAPE,R2,ADJR2,pos_count
+0,Model/model_0.pkl,0,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_clicks_adstock_0_7_lag_2']",0.217990735975396,0.8737098317237447,0.8735992172119913,8
+1,Model/model_1.pkl,1,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_clicks_adstock_0_7_lag_1']",0.2179731139181846,0.873704484501189,0.8735938653059323,8
+2,Model/model_2.pkl,2,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_impressions_adstock_0_7_lag_2']",0.22282859947602898,0.8741134168513375,0.8740031558300612,7
+3,Model/model_3.pkl,3,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_impressions_adstock_0_7_lag_1']",0.22288787053617995,0.8740146663445868,0.8739043188301239,8
+4,Model/model_4.pkl,4,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_cost_adstock_0_7_lag_2']",0.21714189338473494,0.8736897844153089,0.8735791523446015,8
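
Each row of the rewritten `model_output.csv` points at a pickled model plus its feature set and fit metrics, now including a `pos_count` column. A small sketch of ranking the candidates and loading the winner; the column names come from the header above, but the ranking rule itself is only illustrative.

```python
# Sketch: rank the candidate models from model_output.csv and load the best one.
# Column names are taken from the CSV header above; the sort rule is a choice,
# not something the repo prescribes.
import pickle
import pandas as pd

models = pd.read_csv("model_output.csv", index_col=0)

# e.g. prefer high adjusted R2, break ties with low MAPE
best = models.sort_values(["ADJR2", "MAPE"], ascending=[False, True]).iloc[0]
print(best["Model_object"], best["MAPE"], best["ADJR2"])

with open(best["Model_object"], "rb") as f:  # e.g. Model/model_2.pkl
    model = pickle.load(f)
```
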
pages/10_Optimized_Result_Analysis.py
CHANGED
@@ -14,15 +14,7 @@ import plotly.express as px
 import numpy as np
 import plotly.graph_objects as go
 import pandas as pd
-from plotly.subplots import make_subplots

-def format_number(x):
-    if x >= 1_000_000:
-        return f'{x / 1_000_000:.2f}M'
-    elif x >= 1_000:
-        return f'{x / 1_000:.2f}K'
-    else:
-        return f'{x:.2f}'

 def summary_plot(data, x, y, title, text_column, color, format_as_percent=False, format_as_decimal=False):
     fig = px.bar(data, x=x, y=y, orientation='h',
@@ -104,13 +96,11 @@ spends_data=pd.read_excel('Overview_data_test.xlsx')

 with open('summary_df.pkl', 'rb') as file:
     summary_df_sorted = pickle.load(file)
-#st.write(summary_df_sorted)

 selected_scenario= st.selectbox('Select Saved Scenarios',['S1','S2'])

 st.header('Optimized Spends Overview')
 ___columns=st.columns(3)
-summary_df_sorted=summary_df_sorted.sort_values(by=['Optimized_spend'],ascending=False)
 with ___columns[2]:
     fig=summary_plot(summary_df_sorted, x='Delta_percent', y='Channel_name', title='Delta', text_column='Delta_percent',color='Channel_name')
     st.plotly_chart(fig,use_container_width=True)
@@ -344,75 +334,31 @@ with st.expander("Return Forecast by Media Channel"):

 summary_df_sorted=summary_df_sorted.merge(effectiveness_df,left_on="Channel_name",right_on='Channel')

-#
-summary_df_sorted['Efficiency']
-
-
-channel_colors = px.colors.qualitative.Plotly
-
-fig = make_subplots(rows=1, cols=3, subplot_titles=('Optimized Spends', 'Effectiveness', 'Efficiency'), horizontal_spacing=0.05)
-
-for i, channel in enumerate(summary_df_sorted['Channel_name'].unique()):
-    channel_df = summary_df_sorted[summary_df_sorted['Channel_name'] == channel]
-    channel_color = channel_colors[i % len(channel_colors)]
-
-    fig.add_trace(go.Bar(x=channel_df['Optimized_spend'],
-                         y=channel_df['Channel_name'],
-                         text=channel_df['Optimized_spend'].apply(format_number),
-                         marker_color=channel_color,
-                         orientation='h'), row=1, col=1)
-
-    fig.add_trace(go.Bar(x=channel_df['ResponseMetricValue'],
-                         y=channel_df['Channel_name'],
-                         text=channel_df['ResponseMetricValue'].apply(format_number),
-                         marker_color=channel_color,
-                         orientation='h', showlegend=False), row=1, col=2)
-
-    fig.add_trace(go.Bar(x=channel_df['Efficiency'],
-                         y=channel_df['Channel_name'],
-                         text=channel_df['Efficiency'].apply(format_number),
-                         marker_color=channel_color,
-                         orientation='h', showlegend=False), row=1, col=3)
-
-fig.update_layout(
-    height=600,
-    width=900,
-    title='Media Channel Performance',
-    showlegend=False
-)
-
-fig.update_yaxes(showticklabels=False ,row=1, col=2 )
-fig.update_yaxes(showticklabels=False, row=1, col=3)
-
-fig.update_xaxes(showticklabels=False, row=1, col=1)
-fig.update_xaxes(showticklabels=False, row=1, col=2)
-fig.update_xaxes(showticklabels=False, row=1, col=3)
-
-st.plotly_chart(fig, use_container_width=True)
-
-# with columns[1]:
+# st.dataframe(summary_df_sorted.head(2))
+summary_df_sorted['Efficiency']=summary_df_sorted['ResponseMetricValue']/summary_df_sorted['Optimized_spend']
+# # # st.dataframe(summary_df_sorted.head(2))
+# st.dataframe(summary_df_sorted.head(2))
+
+columns= st.columns(3)
+with columns[0]:
+    fig=summary_plot(summary_df_sorted, x='Optimized_spend', y='Channel_name', title='', text_column='Optimized_spend',color='Channel_name')
+    st.plotly_chart(fig,use_container_width=True)
+with columns[1]:
+
+    # effectiveness=(selected_metric.groupby(by=['MediaChannelName'])['ResponseMetricValue'].sum()).values
+    # effectiveness_df=pd.DataFrame({'Channel':st.session_state['raw_data']['MediaChannelName'].unique(),"ResponseMetricValue":effectiveness})
+    # # effectiveness.reset_index(inplace=True)
+    # # st.dataframe(effectiveness.head())
+    fig=summary_plot(summary_df_sorted, x='ResponseMetricValue', y='Channel_name', title='Effectiveness', text_column='ResponseMetricValue',color='Channel_name')
+    st.plotly_chart(fig,use_container_width=True)
+
+with columns[2]:
+    fig=summary_plot(summary_df_sorted, x='Efficiency', y='Channel_name', title='Efficiency', text_column='Efficiency',color='Channel_name',format_as_decimal=True)
+    st.plotly_chart(fig,use_container_width=True)
+
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots

 # Create figure with subplots
 # fig = make_subplots(rows=1, cols=2)
pages/1_Data_Validation.py
CHANGED
@@ -9,7 +9,7 @@ from streamlit_pandas_profiling import st_profile_report
 import streamlit as st
 import streamlit.components.v1 as components
 import sweetviz as sv
-from utilities import set_header,
+from utilities import set_header,load_local_css
 from st_aggrid import GridOptionsBuilder,GridUpdateMode
 from st_aggrid import GridOptionsBuilder
 from st_aggrid import AgGrid
@@ -17,8 +17,7 @@ import base64
 import os
 import tempfile
 from ydata_profiling import ProfileReport
-
-from streamlit_pandas_profiling import st_profile_report
+import re

 st.set_page_config(
     page_title="Data Validation",
@@ -31,68 +30,52 @@ set_header()


-#preprocessing
-# with open('Categorised_data.pkl', 'rb') as file:
-#     Categorised_data = pickle.load(file)
-# with open("edited_dataframe.pkl", 'rb') as file:
-
-
-#     df = pickle.load(file)
-#     date=df.index
-#     df.reset_index(inplace=True)
-#     df['date'] = pd.to_datetime(date)
-
-
-#prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects')
-#spends=pd.read_excel('EDA_Data.xlsx',sheet_name='SPEND INPUT')
-#spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)']
-#df=pd.concat([df,spends],axis=1)

-#df['date'] =pd.to_datetime(df['date']).dt.strftime('%m/%d/%Y')
-#df['Prospects']=prospects['Prospects']
-#df.drop(['Week'],axis=1,inplace=True)

-# Deserialize and load the objects from the pickle file
-# Deserialize and load the objects from the pickle file
 with open('data_import.pkl', 'rb') as f:
     data = pickle.load(f)

-# Accessing the loaded objects
 st.session_state['cleaned_data']= data['final_df']
 st.session_state['category_dict'] = data['bin_dict']

 st.title('Data Validation and Insights')


-# with open("Pickle_files/main_df",'rb') as f:
-#     st.session_state['cleaned_data']= pickle.load(f)
-# with open("Pickle_files/category_dict",'rb') as c:
-#     st.session_state['category_dict']=pickle.load(c)

-# st.write(st.session_state['cleaned_data'])

 target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']

-
 target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
 st.session_state['target_column']=target_column
+panels=st.session_state['category_dict']['Panel Level 1'][0]
+selected_panels=st.multiselect('Please choose the panels you wish to analyze.If no panels are selected, insights will be derived from the overall data.',st.session_state['cleaned_data'][panels].unique())
+aggregation_dict = {item: 'sum' if key == 'Media' else 'mean' for key, value in st.session_state['category_dict'].items() for item in value if item not in ['date','Panel_1']}
+
+with st.expander('**Reponse Metric Analysis**'):
+
+    if len(selected_panels)>0:
+        st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'][st.session_state['cleaned_data']['Panel_1'].isin(selected_panels)]
+
+        st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].groupby(by='date').agg(aggregation_dict)
+        st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
+    else:
+        st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'].groupby(by='date').agg(aggregation_dict)
+        st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()


-fig=line_plot_target(st.session_state['
-st.plotly_chart(fig, use_container_width=True)
+    fig=line_plot_target(st.session_state['Cleaned_data_panel'], target=target_column, title=f'{target_column} Over Time')
+    st.plotly_chart(fig, use_container_width=True)

-media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
-# st.write(media_channel)
+    media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
+    # st.write(media_channel)

+    Non_media_variables=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Exogenous' or key=='Internal'])

-st.markdown('### Annual Data Summary')
-st.dataframe(summary(st.session_state['
+    st.markdown('### Annual Data Summary')
+    st.dataframe(summary(st.session_state['Cleaned_data_panel'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)

-if st.checkbox('Show raw data'):
-
+    if st.checkbox('Show raw data'):
+        st.write(pd.concat([pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.strftime('%m/%d/%Y'),st.session_state['Cleaned_data_panel'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
 col1 = st.columns(1)

 if "selected_feature" not in st.session_state:
@@ -114,29 +97,30 @@ def generate_profile_report(df):
     return report_path


-st.header(
-...
+#st.header()
+with st.expander('Univariate and Bivariate Report'):
+    eda_columns=st.columns(2)
+    with eda_columns[0]:
+        if st.button('Generate Profile Report',help='Univariate report which inlcudes all statistical analysis'):
+            with st.spinner('Generating Report'):
+                report_file = generate_profile_report(st.session_state['Cleaned_data_panel'])
+
+            if os.path.exists(report_file):
+                with open(report_file, 'rb') as f:
+                    st.success('Report Generated')
+                    st.download_button(
+                        label="Download EDA Report",
+                        data=f.read(),
+                        file_name="pandas_profiling_report.html",
+                        mime="text/html"
+                    )
+            else:
+                st.warning("Report generation failed. Unable to find the report file.")

 with eda_columns[1]:
-    if st.button('Generate Sweetviz Report'):
+    if st.button('Generate Sweetviz Report',help='Bivariate report for selected response metric'):
         with st.spinner('Generating Report'):
-            report_file = generate_report_with_target(st.session_state['
+            report_file = generate_report_with_target(st.session_state['Cleaned_data_panel'], target_column)

         if os.path.exists(report_file):
             with open(report_file, 'rb') as f:
@@ -152,130 +136,116 @@ with eda_columns[1]:



-st.warning('Work in Progress')
-
-# ...
-
-# tick=True
-# selected_options = []
-# for row in range(num_rows):
-#     cols = st.columns(num_columns)
-#     for col in cols:
-#         if options:
-#             option = options.pop(0)
-#             selected = col.checkbox(option,value=tick)
-#             if selected:
-#                 selected_options.append(option)
-# # Display selected options
-# #st.write('You selected:', selected_options)
-# st.pyplot(correlation_plot(df,selected_options,target_column))
+#st.warning('Work in Progress')
+with st.expander('Media Variables Analysis'):
+    # Get the selected feature
+    st.session_state["selected_feature"]= st.selectbox('Select media', [col for col in media_channel if 'cost' not in col.lower() and 'spend' not in col.lower()])
+
+    # Filter spends features based on the selected feature
+    spends_features = [col for col in st.session_state['Cleaned_data_panel'].columns if any(keyword in col.lower() for keyword in ['cost', 'spend'])]
+    spends_feature = [col for col in spends_features if re.split(r'_cost|_spend', col.lower())[0] in st.session_state["selected_feature"]]
+
+    if 'validation' not in st.session_state:
+        st.session_state['validation']=[]
+
+
+    val_variables=[col for col in media_channel if col!='date']
+    if len(spends_feature)==0:
+        st.warning('No spends varaible available for the selected metric in data')
+
+    else:
+        fig_row1 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
+        st.plotly_chart(fig_row1, use_container_width=True)
+        st.markdown('### Summary')
+        st.dataframe(summary(st.session_state['cleaned_data'],[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
+
+        cols2=st.columns(2)
+        with cols2[0]:
+            if st.button('Validate'):
+                st.session_state['validation'].append(st.session_state["selected_feature"])
+        with cols2[1]:
+            if st.checkbox('Validate all'):
+                st.session_state['validation'].extend(val_variables)
+                st.success('All media variables are validated ✅')
+
+        if len(set(st.session_state['validation']).intersection(val_variables))!=len(val_variables):
+            validation_data=pd.DataFrame({'Validate':[True if col in st.session_state['validation'] else False for col in val_variables],
+                                          'Variables':val_variables
+                                          })
+            cols3=st.columns([1,30])
+            with cols3[1]:
+                validation_df=st.data_editor(validation_data,
+                                             # column_config={
+                                             #     'Validate':st.column_config.CheckboxColumn(wi)
+
+                                             # },
+                                             column_config={
+                                                 "Validate": st.column_config.CheckboxColumn(
+                                                     default=False,
+                                                     width=100,
+                                                 ),
+                                                 'Variables':st.column_config.TextColumn(
+                                                     width=1000
+
+                                                 )
+                                             },hide_index=True)
+
+                selected_rows = validation_df[validation_df['Validate']==True]['Variables']
+
+                #st.write(selected_rows)
+
+                st.session_state['validation'].extend(selected_rows)
+
+                not_validated_variables = [col for col in val_variables if col not in st.session_state["validation"]]
+                if not_validated_variables:
+                    not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
+                    st.warning(not_validated_message)
+
+
+
+with st.expander('Non Media Variables Analysis'):
+    selected_columns_row4 = st.selectbox('Select Channel',Non_media_variables,index=1)
+    # # Create the dual-axis line plot
+    fig_row4 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
+    st.plotly_chart(fig_row4, use_container_width=True)
+    selected_non_media=selected_columns_row4
+    sum_df = st.session_state['Cleaned_data_panel'][['date', selected_non_media,target_column]]
+    sum_df['Year']=pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.year
+    #st.dataframe(df)
+    #st.dataframe(sum_df.head(2))
+    sum_df=sum_df.groupby('Year').agg('sum')
+    sum_df.loc['Grand Total']=sum_df.sum()
+    sum_df=sum_df.applymap(format_numbers)
+    sum_df.fillna('-',inplace=True)
+    sum_df=sum_df.replace({"0.0":'-','nan':'-'})
+    st.markdown('### Summary')
+    st.dataframe(sum_df,use_container_width=True)
+
+
+with st.expander('Correlation Analysis'):
+    options = list(st.session_state['Cleaned_data_panel'].select_dtypes(np.number).columns)
+
+    # selected_options = []
+    # num_columns = 4
+    # num_rows = -(-len(options) // num_columns)  # Ceiling division to calculate rows
+
+    # # Create a grid of checkboxes
+    # st.header('Select Features for Correlation Plot')
+    # tick=False
+    # if st.checkbox('Select all'):
+    #     tick=True
+    # selected_options = []
+    # for row in range(num_rows):
+    #     cols = st.columns(num_columns)
+    #     for col in cols:
+    #         if options:
+    #             option = options.pop(0)
+    #             selected = col.checkbox(option,value=tick)
+    #             if selected:
+    #                 selected_options.append(option)
+    # # Display selected options
+
+    selected_options=st.multiselect('Select Variables For correlation plot',[var for var in options if var!= target_column],default=options[3])
+
+    st.pyplot(correlation_plot(st.session_state['Cleaned_data_panel'],selected_options,target_column))
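
The heart of the new panel logic in this page is the `aggregation_dict` comprehension: media columns are summed across the selected panels while everything else is averaged, then the frame is re-indexed by date. A compact sketch of just that step with toy data; the column names here are invented stand-ins.

```python
# Sketch of the panel aggregation used above: sum Media columns, average the
# rest, grouped by date. The category_dict and DataFrame below are toy examples.
import pandas as pd

category_dict = {"Media": ["tv_spend"], "Exogenous": ["temperature"]}
df = pd.DataFrame({
    "date": ["2023-01-01", "2023-01-01", "2023-01-08", "2023-01-08"],
    "Panel_1": ["A", "B", "A", "B"],
    "tv_spend": [100.0, 50.0, 120.0, 60.0],
    "temperature": [10.0, 20.0, 12.0, 22.0],
})

aggregation_dict = {
    item: "sum" if key == "Media" else "mean"
    for key, value in category_dict.items()
    for item in value
    if item not in ["date", "Panel_1"]
}

selected_panels = ["A", "B"]  # empty list -> aggregate over all panels
if selected_panels:
    df = df[df["Panel_1"].isin(selected_panels)]
agg = df.groupby("date").agg(aggregation_dict).reset_index()
print(agg)  # tv_spend summed, temperature averaged, one row per date
```
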
pages/2_Transformations.py
ADDED
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Importing necessary libraries
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
st.set_page_config(
|
5 |
+
page_title="Transformations",
|
6 |
+
page_icon=":shark:",
|
7 |
+
layout="wide",
|
8 |
+
initial_sidebar_state="collapsed",
|
9 |
+
)
|
10 |
+
|
11 |
+
import pickle
|
12 |
+
import numpy as np
|
13 |
+
import pandas as pd
|
14 |
+
from utilities import set_header, load_local_css
|
15 |
+
import streamlit_authenticator as stauth
|
16 |
+
import yaml
|
17 |
+
from yaml import SafeLoader
|
18 |
+
|
19 |
+
load_local_css("styles.css")
|
20 |
+
set_header()
|
21 |
+
|
22 |
+
# Check for authentication status
|
23 |
+
for k, v in st.session_state.items():
|
24 |
+
if k not in ["logout", "login", "config"] and not k.startswith(
|
25 |
+
"FormSubmitter"
|
26 |
+
):
|
27 |
+
st.session_state[k] = v
|
28 |
+
with open("config.yaml") as file:
|
29 |
+
config = yaml.load(file, Loader=SafeLoader)
|
30 |
+
st.session_state["config"] = config
|
31 |
+
authenticator = stauth.Authenticate(
|
32 |
+
config["credentials"],
|
33 |
+
config["cookie"]["name"],
|
34 |
+
config["cookie"]["key"],
|
35 |
+
config["cookie"]["expiry_days"],
|
36 |
+
config["preauthorized"],
|
37 |
+
)
|
38 |
+
st.session_state["authenticator"] = authenticator
|
39 |
+
name, authentication_status, username = authenticator.login("Login", "main")
|
40 |
+
auth_status = st.session_state.get("authentication_status")
|
41 |
+
|
42 |
+
if auth_status == True:
|
43 |
+
authenticator.logout("Logout", "main")
|
44 |
+
is_state_initiaized = st.session_state.get("initialized", False)
|
45 |
+
|
46 |
+
if not is_state_initiaized:
|
47 |
+
|
48 |
+
if 'session_name' not in st.session_state:
|
49 |
+
st.session_state['session_name']=None
|
50 |
+
|
51 |
+
|
52 |
+
# Deserialize and load the objects from the pickle file
|
53 |
+
with open("data_import.pkl", "rb") as f:
|
54 |
+
data = pickle.load(f)
|
55 |
+
|
56 |
+
# Accessing the loaded objects
|
57 |
+
final_df_loaded = data["final_df"]
|
58 |
+
bin_dict_loaded = data["bin_dict"]
|
59 |
+
|
60 |
+
# Initialize session state
|
61 |
+
if "transformed_columns_dict" not in st.session_state:
|
62 |
+
st.session_state["transformed_columns_dict"] = {} # Default empty dictionary
|
63 |
+
|
64 |
+
if "final_df" not in st.session_state:
|
65 |
+
st.session_state["final_df"] = final_df_loaded # Default as original dataframe
|
66 |
+
|
67 |
+
if "summary_string" not in st.session_state:
|
68 |
+
st.session_state["summary_string"] = None # Default as None
|
69 |
+
|
70 |
+
# Extract original columns for specified categories
|
71 |
+
original_columns = {
|
72 |
+
category: bin_dict_loaded[category]
|
73 |
+
for category in ["Media", "Internal", "Exogenous"]
|
74 |
+
if category in bin_dict_loaded
|
75 |
+
}
|
76 |
+
|
77 |
+
# Retrive Panel columns
|
78 |
+
panel_1 = bin_dict_loaded.get("Panel Level 1")
|
79 |
+
panel_2 = bin_dict_loaded.get("Panel Level 2")
|
80 |
+
|
81 |
+
# # For testing on non panel level
|
82 |
+
# final_df_loaded = final_df_loaded.drop("Panel_1", axis=1)
|
83 |
+
# final_df_loaded = final_df_loaded.groupby("date").mean().reset_index()
|
84 |
+
# panel_1 = None
|
85 |
+
|
86 |
+
# Apply transformations on panel level
|
87 |
+
st.write("")
|
88 |
+
if panel_1:
|
89 |
+
panel = panel_1 + panel_2 if panel_2 else panel_1
|
90 |
+
else:
|
91 |
+
panel = []
|
92 |
+
|
93 |
+
|
94 |
+
# Function to build transformation widgets
|
95 |
+
def transformation_widgets(category, transform_params, date_granularity):
|
96 |
+
# Transformation Options
|
97 |
+
transformation_options = {
|
98 |
+
"Media": ["Lag", "Moving Average", "Saturation", "Power", "Adstock"],
|
99 |
+
"Internal": ["Lead", "Lag", "Moving Average"],
|
100 |
+
"Exogenous": ["Lead", "Lag", "Moving Average"],
|
101 |
+
}
|
102 |
+
|
103 |
+
with st.expander(f"{category} Transformations"):
|
104 |
+
|
105 |
+
# Let users select which transformations to apply
|
106 |
+
transformations_to_apply = st.multiselect(
|
107 |
+
"Select transformations to apply",
|
108 |
+
options=transformation_options[category],
|
109 |
+
default=[],
|
110 |
+
key=f"transformation_{category}",
|
111 |
+
)
|
112 |
+
|
113 |
+
# Determine the number of transformations to put in each column
|
114 |
+
transformations_per_column = (
|
115 |
+
len(transformations_to_apply) // 2 + len(transformations_to_apply) % 2
|
116 |
+
)
|
117 |
+
|
118 |
+
# Create two columns
|
119 |
+
col1, col2 = st.columns(2)
|
120 |
+
|
121 |
+
# Assign transformations to each column
|
122 |
+
transformations_col1 = transformations_to_apply[:transformations_per_column]
|
123 |
+
transformations_col2 = transformations_to_apply[transformations_per_column:]
|
124 |
+
|
125 |
+
# Define a helper function to create widgets for each transformation
|
126 |
+
def create_transformation_widgets(column, transformations):
|
127 |
+
with column:
|
128 |
+
for transformation in transformations:
|
129 |
+
# Conditionally create widgets for selected transformations
|
130 |
+
if transformation == "Lead":
|
131 |
+
st.markdown(f"**Lead ({date_granularity})**")
|
132 |
+
lead = st.slider(
|
133 |
+
"Lead periods",
|
134 |
+
1,
|
135 |
+
10,
|
136 |
+
(1, 2),
|
137 |
+
1,
|
138 |
+
key=f"lead_{category}",
|
139 |
+
label_visibility="collapsed",
|
140 |
+
)
|
141 |
+
start = lead[0]
|
142 |
+
end = lead[1]
|
143 |
+
step = 1
|
144 |
+
transform_params[category]["Lead"] = np.arange(
|
145 |
+
start, end + step, step
|
146 |
+
)
|
147 |
+
|
148 |
+
if transformation == "Lag":
|
149 |
+
st.markdown(f"**Lag ({date_granularity})**")
|
150 |
+
lag = st.slider(
|
151 |
+
"Lag periods",
|
152 |
+
1,
|
153 |
+
10,
|
154 |
+
(1, 2),
|
155 |
+
1,
|
156 |
+
key=f"lag_{category}",
|
157 |
+
label_visibility="collapsed",
|
158 |
+
)
|
159 |
+
start = lag[0]
|
160 |
+
end = lag[1]
|
161 |
+
step = 1
|
162 |
+
transform_params[category]["Lag"] = np.arange(
|
163 |
+
start, end + step, step
|
164 |
+
)
|
165 |
+
|
166 |
+
if transformation == "Moving Average":
|
167 |
+
st.markdown(f"**Moving Average ({date_granularity})**")
|
168 |
+
window = st.slider(
|
169 |
+
"Window size for Moving Average",
|
170 |
+
1,
|
171 |
+
10,
|
172 |
+
(1, 2),
|
173 |
+
1,
|
174 |
+
key=f"ma_{category}",
|
175 |
+
label_visibility="collapsed",
|
176 |
+
)
|
177 |
+
start = window[0]
|
178 |
+
end = window[1]
|
179 |
+
step = 1
|
180 |
+
transform_params[category]["Moving Average"] = np.arange(
|
181 |
+
start, end + step, step
|
182 |
+
)
|
183 |
+
|
184 |
+
if transformation == "Saturation":
|
185 |
+
st.markdown("**Saturation (%)**")
|
186 |
+
saturation_point = st.slider(
|
187 |
+
f"Saturation Percentage",
|
188 |
+
0,
|
189 |
+
100,
|
190 |
+
(10, 20),
|
191 |
+
10,
|
192 |
+
key=f"sat_{category}",
|
193 |
+
label_visibility="collapsed",
|
194 |
+
)
|
195 |
+
start = saturation_point[0]
|
196 |
+
end = saturation_point[1]
|
197 |
+
step = 10
|
198 |
+
transform_params[category]["Saturation"] = np.arange(
|
199 |
+
start, end + step, step
|
200 |
+
)
|
201 |
+
|
202 |
+
if transformation == "Power":
|
203 |
+
st.markdown("**Power**")
|
204 |
+
power = st.slider(
|
205 |
+
f"Power",
|
206 |
+
0,
|
207 |
+
10,
|
208 |
+
(2, 4),
|
209 |
+
1,
|
210 |
+
key=f"power_{category}",
|
211 |
+
label_visibility="collapsed",
|
212 |
+
)
|
213 |
+
start = power[0]
|
214 |
+
end = power[1]
|
215 |
+
step = 1
|
216 |
+
transform_params[category]["Power"] = np.arange(
|
217 |
+
start, end + step, step
|
218 |
+
)
|
219 |
+
|
220 |
+
if transformation == "Adstock":
|
221 |
+
st.markdown("**Adstock**")
|
222 |
+
rate = st.slider(
|
223 |
+
f"Factor ({category})",
|
224 |
+
0.0,
|
225 |
+
1.0,
|
226 |
+
(0.5, 0.7),
|
227 |
+
0.05,
|
228 |
+
key=f"adstock_{category}",
|
229 |
+
label_visibility="collapsed",
|
230 |
+
)
|
231 |
+
start = rate[0]
|
232 |
+
end = rate[1]
|
233 |
+
step = 0.05
|
234 |
+
adstock_range = [
|
235 |
+
round(a, 3) for a in np.arange(start, end + step, step)
|
236 |
+
]
|
237 |
+
transform_params[category]["Adstock"] = adstock_range
|
238 |
+
|
239 |
+
# Create widgets in each column
|
240 |
+
create_transformation_widgets(col1, transformations_col1)
|
241 |
+
create_transformation_widgets(col2, transformations_col2)
|
242 |
+
|
243 |
+
|
244 |
+
# Function to apply Lag transformation
|
245 |
+
def apply_lag(df, lag):
|
246 |
+
return df.shift(lag)
|
247 |
+
|
248 |
+
|
249 |
+
# Function to apply Lead transformation
|
250 |
+
def apply_lead(df, lead):
|
251 |
+
return df.shift(-lead)
|
252 |
+
|
253 |
+
|
254 |
+
# Function to apply Moving Average transformation
|
255 |
+
def apply_moving_average(df, window_size):
|
256 |
+
return df.rolling(window=window_size).mean()
|
257 |
+
|
258 |
+
|
259 |
+
# Function to apply Saturation transformation
|
260 |
+
def apply_saturation(df, saturation_percent_100):
|
261 |
+
# Convert saturation percentage from 100-based to fraction
|
262 |
+
saturation_percent = saturation_percent_100 / 100.0
|
263 |
+
|
264 |
+
# Calculate saturation point and steepness
|
265 |
+
column_max = df.max()
|
266 |
+
column_min = df.min()
|
267 |
+
saturation_point = (column_min + column_max) / 2
|
268 |
+
|
269 |
+
numerator = np.log(
|
270 |
+
(1 / (saturation_percent if saturation_percent != 1 else 1 - 1e-9)) - 1
|
271 |
+
)
|
272 |
+
denominator = np.log(saturation_point / max(column_max, 1e-9))
|
273 |
+
|
274 |
+
steepness = numerator / max(
|
275 |
+
denominator, 1e-9
|
276 |
+
) # Avoid division by zero with a small constant
|
277 |
+
|
278 |
+
# Apply the saturation transformation
|
279 |
+
transformed_series = df.apply(
|
280 |
+
lambda x: (1 / (1 + (saturation_point / x) ** steepness)) * x
|
281 |
+
)
|
282 |
+
|
283 |
+
return transformed_series
|
284 |
+
|
285 |
+
|
286 |
+
# Function to apply Power transformation
|
287 |
+
def apply_power(df, power):
|
288 |
+
return df**power
|
289 |
+
|
290 |
+
|
291 |
+
# Function to apply Adstock transformation
|
292 |
+
def apply_adstock(df, factor):
|
293 |
+
x = 0
|
294 |
+
# Use the walrus operator to update x iteratively with the Adstock formula
|
295 |
+
adstock_var = [x := x * factor + v for v in df]
|
296 |
+
ans = pd.Series(adstock_var, index=df.index)
|
297 |
+
return ans
|
298 |
+
|
299 |
+
|
300 |
+
# Function to generate transformed columns names
|
301 |
+
@st.cache_resource(show_spinner=False)
|
302 |
+
def generate_transformed_columns(original_columns, transform_params):
|
303 |
+
transformed_columns, summary = {}, {}
|
304 |
+
|
305 |
+
for category, columns in original_columns.items():
|
306 |
+
for column in columns:
|
307 |
+
transformed_columns[column] = []
|
308 |
+
summary_details = (
|
309 |
+
[]
|
310 |
+
) # List to hold transformation details for the current column
|
311 |
+
|
312 |
+
if category in transform_params:
|
313 |
+
for transformation, values in transform_params[category].items():
|
314 |
+
# Generate transformed column names for each value
|
315 |
+
for value in values:
|
316 |
+
transformed_name = f"{column}@{transformation}_{value}"
|
317 |
+
transformed_columns[column].append(transformed_name)
|
318 |
+
|
319 |
+
# Format the values list as a string with commas and "and" before the last item
|
320 |
+
if len(values) > 1:
|
321 |
+
formatted_values = (
|
322 |
+
", ".join(map(str, values[:-1])) + " and " + str(values[-1])
|
323 |
+
)
|
324 |
+
else:
|
325 |
+
formatted_values = str(values[0])
|
326 |
+
|
327 |
+
# Add transformation details
|
328 |
+
summary_details.append(f"{transformation} ({formatted_values})")
|
329 |
+
|
330 |
+
# Only add to summary if there are transformation details for the column
|
331 |
+
if summary_details:
|
332 |
+
formatted_summary = "⮕ ".join(summary_details)
|
333 |
+
# Use <strong> tags to make the column name bold
|
334 |
+
summary[column] = f"<strong>{column}</strong>: {formatted_summary}"
|
335 |
+
|
336 |
+
# Generate a comprehensive summary string for all columns
|
337 |
+
summary_items = [
|
338 |
+
f"{idx + 1}. {details}" for idx, details in enumerate(summary.values())
|
339 |
+
]
|
340 |
+
|
341 |
+
summary_string = "\n".join(summary_items)
|
342 |
+
|
343 |
+
return transformed_columns, summary_string
|
344 |
+
|
345 |
+
|
346 |
+
# Function to apply transformations to DataFrame slices based on specified categories and parameters
|
347 |
+
@st.cache_resource(show_spinner=False)
|
348 |
+
def apply_category_transformations(df, bin_dict, transform_params, panel):
|
349 |
+
# Dictionary for function mapping
|
350 |
+
transformation_functions = {
|
351 |
+
"Lead": apply_lead,
|
352 |
+
"Lag": apply_lag,
|
353 |
+
"Moving Average": apply_moving_average,
|
354 |
+
"Saturation": apply_saturation,
|
355 |
+
"Power": apply_power,
|
356 |
+
"Adstock": apply_adstock,
|
357 |
+
}
|
358 |
+
|
359 |
+
# Initialize category_df as an empty DataFrame
|
360 |
+
category_df = pd.DataFrame()
|
361 |
+
|
362 |
+
# Iterate through each category specified in transform_params
|
363 |
+
for category in ["Media", "Internal", "Exogenous"]:
|
364 |
+
if (
|
365 |
+
category not in transform_params
|
366 |
+
or category not in bin_dict
|
367 |
+
or not transform_params[category]
|
368 |
+
):
|
369 |
+
continue # Skip categories without transformations
|
370 |
+
|
371 |
+
# Slice the DataFrame based on the columns specified in bin_dict for the current category
|
372 |
+
df_slice = df[bin_dict[category] + panel]
|
373 |
+
|
374 |
+
# Iterate through each transformation and its parameters for the current category
|
375 |
+
for transformation, parameters in transform_params[category].items():
|
376 |
+
transformation_function = transformation_functions[transformation]
|
377 |
+
|
378 |
+
# Check if there is panel data to group by
|
379 |
+
if len(panel) > 0:
|
380 |
+
# Apply the transformation to each group
|
381 |
+
category_df = pd.concat(
|
382 |
+
[
|
383 |
+
df_slice.groupby(panel)
|
384 |
+
.transform(transformation_function, p)
|
385 |
+
.add_suffix(f"@{transformation}_{p}")
|
386 |
+
for p in parameters
|
387 |
+
],
|
388 |
+
axis=1,
|
389 |
+
)
|
390 |
+
|
391 |
+
# Replace all NaN or null values in category_df with 0
|
392 |
+
category_df.fillna(0, inplace=True)
|
393 |
+
|
394 |
+
# Update df_slice
|
395 |
+
df_slice = pd.concat(
|
396 |
+
[df[panel], category_df],
|
397 |
+
axis=1,
|
398 |
+
)
|
399 |
+
|
400 |
+
else:
|
401 |
+
for p in parameters:
|
402 |
+
# Apply the transformation function to each column
|
403 |
+
temp_df = df_slice.apply(
|
404 |
+
lambda x: transformation_function(x, p), axis=0
|
405 |
+
).rename(lambda x: f"{x}@{transformation}_{p}", axis="columns")
|
406 |
+
# Concatenate the transformed DataFrame slice to the category DataFrame
|
407 |
+
category_df = pd.concat([category_df, temp_df], axis=1)
|
408 |
+
|
409 |
+
# Replace all NaN or null values in category_df with 0
|
410 |
+
category_df.fillna(0, inplace=True)
|
411 |
+
|
412 |
+
# Update df_slice
|
413 |
+
df_slice = pd.concat(
|
414 |
+
[df[panel], category_df],
|
415 |
+
axis=1,
|
416 |
+
)
|
417 |
+
|
418 |
+
# If category_df has been modified, concatenate it with the panel and response metrics from the original DataFrame
|
419 |
+
if not category_df.empty:
|
420 |
+
final_df = pd.concat([df, category_df], axis=1)
|
421 |
+
else:
|
422 |
+
    # If no transformations were applied, use the original DataFrame
    final_df = df

    return final_df


# Function to infer the granularity of the date column in a DataFrame
@st.cache_resource(show_spinner=False)
def infer_date_granularity(df):
    # Find the most common difference (in days) between consecutive unique dates
    common_freq = pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]

    # Map the most common difference to a granularity
    if common_freq == 1:
        return "daily"
    elif common_freq == 7:
        return "weekly"
    elif 28 <= common_freq <= 31:
        return "monthly"
    else:
        return "irregular"


#########################################################################################################################################################
# User input for transformations
#########################################################################################################################################################


# Infer date granularity
date_granularity = infer_date_granularity(final_df_loaded)

# Initialize the main dictionary to store the transformation parameters for each category
transform_params = {"Media": {}, "Internal": {}, "Exogenous": {}}

# User input for transformations
st.markdown("### Select Transformations to Apply")
for category in ["Media", "Internal", "Exogenous"]:
    # Skip Internal
    if category == "Internal":
        continue

    transformation_widgets(category, transform_params, date_granularity)


#########################################################################################################################################################
# Apply transformations
#########################################################################################################################################################


# Apply category-based transformations to the DataFrame
if st.button("Accept and Proceed", use_container_width=True):
    with st.spinner("Applying transformations..."):
        final_df = apply_category_transformations(
            final_df_loaded, bin_dict_loaded, transform_params, panel
        )

    # Generate a dictionary mapping original column names to lists of transformed column names
    transformed_columns_dict, summary_string = generate_transformed_columns(
        original_columns, transform_params
    )

    # Store the transformed DataFrame and summary in session state
    st.session_state["final_df"] = final_df
    st.session_state["summary_string"] = summary_string


#########################################################################################################################################################
# Display the transformed DataFrame and summary
#########################################################################################################################################################


# Display the transformed DataFrame in the Streamlit app
st.markdown("### Transformed DataFrame")
st.dataframe(st.session_state["final_df"], hide_index=True)

# Total rows and columns
total_rows, total_columns = st.session_state["final_df"].shape
st.markdown(
    f"<p style='text-align: justify;'>The transformed DataFrame contains <strong>{total_rows}</strong> rows and <strong>{total_columns}</strong> columns.</p>",
    unsafe_allow_html=True,
)

# Display the summary of transformations as markdown
if st.session_state["summary_string"]:
    with st.expander("Summary of Transformations"):
        st.markdown("### Summary of Transformations")
        st.markdown(st.session_state["summary_string"], unsafe_allow_html=True)


@st.cache_resource(show_spinner=False)
def save_to_pickle(file_path, final_df):
    # Open the file in write-binary mode and dump the objects
    with open(file_path, "wb") as f:
        pickle.dump({"final_df_transformed": final_df}, f)
    # Data is now saved to file


if st.button("Accept and Save", use_container_width=True):
    save_to_pickle("final_df_transformed.pkl", st.session_state["final_df"])
    st.toast("💾 Saved Successfully!")
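
# Illustrative sketch (hypothetical helper, not part of the original page): the
# pickle written by save_to_pickle above is read back on later pages via the
# same "final_df_transformed" key -- pages/4_Model_Build.py below does exactly that.
# def load_from_pickle(file_path="final_df_transformed.pkl"):
#     with open(file_path, "rb") as f:
#         return pickle.load(f)["final_df_transformed"]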
pages/4_Model_Build.py
ADDED
@@ -0,0 +1,826 @@
'''
MMO Build Sprint 3
additions : adding more variables to session state for saved model : random effect, predicted train & test

MMO Build Sprint 4
additions : ability to run models for different response metrics
'''

import os
import pickle
import re
import time
import itertools
from datetime import datetime

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode

from Eda_functions import format_numbers
from utilities import set_header, load_local_css
from Data_prep_functions import *

st.set_option('deprecation.showPyplotGlobalUse', False)


def get_random_effects(media_data, panel_col, mdf):
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])

    for i, market in enumerate(media_data[panel_col].unique()):
        print(i, end='\r')
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, 'random_effect'] = intercept
        random_eff_df.loc[i, panel_col] = market

    return random_eff_df


def mdf_predict(X_df, mdf, random_eff_df):
    # note: relies on the module-level panel_col defined below
    X = X_df.copy()
    X['fixed_effect'] = mdf.predict(X)
    X = pd.merge(X, random_eff_df, on=panel_col, how='left')
    X['pred'] = X['fixed_effect'] + X['random_effect']
    # X.to_csv('Test/megred_df.csv', index=False)
    X.drop(columns=['fixed_effect', 'random_effect'], inplace=True)
    return X['pred']

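
# Illustrative note: for a statsmodels MixedLM result, mdf_predict composes
#     prediction = fixed-effect part (mdf.predict) + the panel's random intercept
# so, with re_df = get_random_effects(media_data, panel_col, mdf), a call like
#     preds = mdf_predict(X_test, mdf, re_df)
# assumes every panel in X_test was seen during fitting; otherwise the left
# merge above yields NaN random effects (hence the isna() check further down).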

st.set_page_config(
    page_title="Model Build",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)

load_local_css('styles.css')
set_header()

st.title('1. Build Your Model')

with open("data_import.pkl", "rb") as f:
    data = pickle.load(f)

st.session_state['bin_dict'] = data["bin_dict"]

# st.write(data["bin_dict"])

with open("final_df_transformed.pkl", "rb") as f:
    data = pickle.load(f)

# Accessing the loaded objects
media_data = data["final_df_transformed"]

# Sprint4 - available response metrics is a list of all response metrics in the data
## these will be put in a drop down

st.session_state['media_data'] = media_data

if 'available_response_metrics' not in st.session_state:
    # st.session_state['available_response_metrics'] = ['Total Approved Accounts - Revenue',
    #                                                    'Total Approved Accounts - Appsflyer',
    #                                                    'Account Requests - Appsflyer',
    #                                                    'App Installs - Appsflyer']
    st.session_state['available_response_metrics'] = st.session_state['bin_dict']["Response Metrics"]

# Sprint4
if "is_tuned_model" not in st.session_state:
    st.session_state["is_tuned_model"] = {}
    for resp_metric in st.session_state['available_response_metrics']:
        resp_metric = resp_metric.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
        st.session_state["is_tuned_model"][resp_metric] = False

# Sprint4 - used_response_metrics is a list of response metrics for which the user has created & saved a model
if 'used_response_metrics' not in st.session_state:
    st.session_state['used_response_metrics'] = []

# Sprint4 - saved_model_names
if 'saved_model_names' not in st.session_state:
    st.session_state['saved_model_names'] = []

# if "model_save_flag" not in st.session_state:
#     st.session_state["model_save_flag"]=False
# def reset_save():
#     st.session_state["model_save_flag"]=False
# def set_save():
#     st.session_state["model_save_flag"]=True

# Sprint4 - select a response metric
sel_target_col = st.selectbox("Select the response metric",
                              st.session_state['available_response_metrics'])
# , on_change=reset_save())
target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")

new_name_dct = {col: col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
                for col in media_data.columns}

media_data.columns = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
                      for col in media_data.columns]

# st.write(st.session_state['bin_dict'])
panel_col = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
             for col in st.session_state['bin_dict']['Panel Level 1']][0]  # set the panel column
date_col = 'date'

# st.write(media_data)

is_panel = len(panel_col) > 0  # panel_col is a string; non-empty means panel-level data

if 'is_panel' not in st.session_state:
    st.session_state['is_panel'] = False

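
# The same rename chain is applied above to the target, the panel column and
# media_data.columns. An equivalent helper (hypothetical, for illustration):
# def standardize_name(col):
#     return (col.lower().replace('.', '_').replace('@', '_').replace(' ', '_')
#             .replace('-', '').replace(':', '').replace('__', '_'))
# e.g. standardize_name('FB: Level Achieved - Tier 1 Impressions')
#      -> 'fb_level_achieved_tier_1_impressions'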

# if st.toggle('Apply Transformations on DMA/Panel Level'):
#     media_data = pd.read_csv(r'C:\Users\SrishtiVerma\Mastercard\Sprint2\upf_data_converted_randomized_resp_metrics.csv')
#     media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
#                           media_data.columns]
#     dma = st.selectbox('Select the Level of data ',
#                        [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
#     # is_panel = True
#     # st.session_state['is_panel']=True
#
# else:
#     # """ code to aggregate data on date """
#     media_data = pd.read_excel(r'C:\Users\SrishtiVerma\Mastercard\Sprint1\Tactic Level Models\Tactic_level_data_imp_clicks_spends.xlsx')
#     media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
#                           media_data.columns]
#     dma = None
#     # is_panel = False
#     # st.session_state['is_panel']=False

# media_data = st.session_state["final_df"]

# st.write(media_data.columns)

media_data.sort_values(date_col, inplace=True)
media_data.reset_index(drop=True, inplace=True)

date = media_data[date_col]
st.session_state['date'] = date
# revenue=media_data[target_col]
y = media_data[target_col]

if is_panel:
    spends_data = media_data[
        [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()] + [date_col, panel_col]]
    # Sprint3 - spends for resp curves
else:
    spends_data = media_data[
        [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()] + [date_col]]

y = media_data[target_col]
# media_data.drop([target_col],axis=1,inplace=True)
media_data.drop([date_col], axis=1, inplace=True)
media_data.reset_index(drop=True, inplace=True)

# dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()}

# st.markdown('## Select the Range of Transformations')
columns = st.columns(2)

old_shape = media_data.shape

if "old_shape" not in st.session_state:
    st.session_state['old_shape'] = old_shape

# with columns[0]:
#     slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1,
#                                      format="%.2f")
# with columns[1]:
#     slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3),
#                                  step=1)

# with columns[2]:
#     slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)

# with columns[1]:
#     st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
#     st.number_input('Select the range of ')

# Section 1 - Transformations Functions
# def lag(data, features, lags, dma=None):
#     if dma:
#
#         transformed_data = pd.concat(
#             [data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
#         # transformed_data = transformed_data.fillna(method='bfill')
#         transformed_data = transformed_data.bfill()  # Sprint4 - fillna getting deprecated
#         return pd.concat([transformed_data, data], axis=1)
#
#     else:
#
#         # ''' data should be aggregated on date '''
#
#         transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
#         # transformed_data = transformed_data.fillna(method='bfill')
#         transformed_data = transformed_data.bfill()
#
#         return pd.concat([transformed_data, data], axis=1)
#
#
# # adstock
# def adstock(df, alphas, cutoff, features, dma=None):
#     if dma:
#         transformed_data = pd.DataFrame()
#         for d in df[dma].unique():
#             dma_sub_df = df[df[dma] == d]
#             n = len(dma_sub_df)
#
#             weights = np.array(
#                 [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)]
#                  for alpha in alphas])
#             X = dma_sub_df[features].to_numpy()
#
#             res = pd.DataFrame(np.hstack(weights @ X),
#                                columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
#
#             transformed_data = pd.concat([transformed_data, res], axis=0)
#         transformed_data.reset_index(drop=True, inplace=True)
#         return pd.concat([transformed_data, df], axis=1)
#
#     else:
#
#         n = len(df)
#
#         weights = np.array(
#             [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)]
#              for alpha in alphas])
#
#         X = df[features].to_numpy()
#         res = pd.DataFrame(np.hstack(weights @ X),
#                            columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
#         return pd.concat([res, df], axis=1)


# Section 2 - Begin Transformations

if 'media_data' not in st.session_state:
    st.session_state['media_data'] = pd.DataFrame()

# Sprint3
if "orig_media_data" not in st.session_state:
    st.session_state['orig_media_data'] = pd.DataFrame()

# Sprint3 additions
if 'random_effects' not in st.session_state:
    st.session_state['random_effects'] = pd.DataFrame()
if 'pred_train' not in st.session_state:
    st.session_state['pred_train'] = []
if 'pred_test' not in st.session_state:
    st.session_state['pred_test'] = []
# end of Sprint3 additions

# variables_to_be_transformed=[col for col in media_data.columns if col.lower() not in ['dma','panel'] ]  # change for buckets
# variables_to_be_transformed = [col for col in media_data.columns if
#                                '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
#
# with columns[0]:
#     if st.button('Apply Transformations'):
#         with st.spinner('Applying Transformations'):
#             transformed_data_lag = lag(media_data, features=variables_to_be_transformed,
#                                        lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
#
#             # variables_to_be_transformed=[col for col in list(transformed_data_lag.columns) if col not in ['Date','DMA','Panel']]  # change for buckets
#             variables_to_be_transformed = [col for col in media_data.columns if
#                                            '_clicks' in col.lower() or '_impress' in col.lower()]  # srishti - change
#
#             transformed_data_adstock = adstock(df=transformed_data_lag,
#                                                alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1),
#                                                cutoff=8, features=variables_to_be_transformed, dma=dma)
#
#             # st.success('Done')
#             st.success("Transformations complete!")
#
#             st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
#
#             transformed_data_adstock.columns = [c.replace(".", "_") for c in
#                                                 transformed_data_adstock.columns]  # srishti
#             st.session_state['media_data'] = transformed_data_adstock  # srishti
#             # Sprint3
#             orig_media_data = media_data.copy()
#             orig_media_data[date_col] = date
#             orig_media_data[target_col] = y
#             st.session_state['orig_media_data'] = orig_media_data  # srishti
#
#             # with st.spinner('Applying Transformations'):
#             #     time.sleep(2)
#             #     st.success("Transformations complete!")
#
#             # if st.session_state['media_data'].shape[1]>old_shape[1]:
#             #     with columns[0]:
#             #         st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
#             #         st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')

# Section 3 - Create combinations

# bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions',
#         ' FB: Level Achieved - Tier 2 Impressions','paid_social_others',
#         ' GA App: Will And Cid Pequena Baixo Risco Clicks',
#         'digital_tactic_others',"programmatic"
#         ]

# srishti - bucket names changed
bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
          'fb_level_achieved_tier_1', 'paid_social_others',
          'ga_app',
          'digital_tactic_others', "programmatic"
          ]

with columns[0]:
    if st.button('Create Combinations of Variables'):

        top_3_correlated_features = []
        # # for col in st.session_state['media_data'].columns[:19]:
        # original_cols = [c for c in st.session_state['media_data'].columns if
        #                  "_clicks" in c.lower() or "_impressions" in c.lower()]
        # original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()]

        original_cols = st.session_state['bin_dict']['Media'] + st.session_state['bin_dict']['Internal']

        original_cols = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
                         for col in original_cols]

        # st.write(original_cols)
        # for col in st.session_state['media_data'].columns[:19]:
        for col in original_cols:  # srishti - new
            corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
                                 y], axis=1).corr()[target_col].iloc[:-1]
            # note: despite the list's name, only the top 2 correlated variants per original column are kept
            top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
        flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
        # all_features_set={var:[col for col in flattened_list if var in col] for var in bucket}
        all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket if
                            len([col for col in flattened_list if var in col]) > 0}  # srishti

        channels_all = [values for values in all_features_set.values()]
        st.session_state['combinations'] = list(itertools.product(*channels_all))
        # if 'combinations' not in st.session_state:
        #     st.session_state['combinations']=combinations_all

        st.session_state['final_selection'] = st.session_state['combinations']
        st.success('Done')

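
# Illustrative note on the combination step above: itertools.product takes one
# candidate list per bucket and yields every cross-bucket feature set, e.g.
# (hypothetical values)
#     channels_all = [['paid_search_clicks'],
#                     ['kwai_clicks_adst.3', 'kwai_impressions_adst.2']]
#     list(itertools.product(*channels_all))
#     # [('paid_search_clicks', 'kwai_clicks_adst.3'),
#     #  ('paid_search_clicks', 'kwai_impressions_adst.2')]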

# revenue.reset_index(drop=True,inplace=True)
y.reset_index(drop=True, inplace=True)
if 'Model_results' not in st.session_state:
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': [],
                                         'pos_count': []
                                         }


def reset_model_result_dct():
    st.session_state['Model_results'] = {'Model_object': [],
                                         'Model_iteration': [],
                                         'Feature_set': [],
                                         'MAPE': [],
                                         'R2': [],
                                         'ADJR2': [],
                                         'pos_count': []
                                         }


# if st.button('Build Model'):

if 'iterations' not in st.session_state:
    st.session_state['iterations'] = 0

if 'final_selection' not in st.session_state:
    st.session_state['final_selection'] = False

save_path = r"Model/"
with columns[1]:
    if st.session_state['final_selection']:
        st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')

    if st.checkbox('Build all iterations'):
        iterations = len(st.session_state['final_selection'])
    else:
        iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=100,
                                     value=st.session_state['iterations'], on_change=reset_model_result_dct)
        # st.write("iterations=", iterations)


if st.button('Build Model', on_click=reset_model_result_dct):
    st.session_state['iterations'] = iterations

    # Section 4 - Model
    # st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
    st.session_state['media_data'] = st.session_state['media_data'].ffill()
    st.markdown(
        'Data Split -- Training Period: May 9th, 2023 - October 5th, 2023; Testing Period: October 6th, 2023 - November 7th, 2023')
    progress_bar = st.progress(0)  # Initialize the progress bar
    # time_remaining_text = st.empty()  # Create an empty space for time remaining text
    start_time = time.time()  # Record the start time
    progress_text = st.empty()

    # time_elapsed_text = st.empty()
    # for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
    # st.write(st.session_state["final_selection"])
    # for i, selected_features in enumerate(st.session_state["final_selection"]):

    if is_panel == True:
        for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]):  # srishti
            df = st.session_state['media_data']

            fet = [var for var in selected_features if len(var) > 0]
            inp_vars_str = " + ".join(fet)  # new

            X = df[fet]
            y = df[target_col]
            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

            X[target_col] = y  # Sprint2
            X[panel_col] = df[panel_col]  # Sprint2

            X_train = X.iloc[:8000]
            X_test = X.iloc[8000:]
            y_train = y.iloc[:8000]
            y_test = y.iloc[8000:]

            print(X_train.shape)
            # model = sm.OLS(y_train, X_train).fit()
            md_str = target_col + " ~ " + inp_vars_str
            # md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
            #                  data=X_train[[target_col] + fet],
            #                  groups=X_train[panel_col])
            md = smf.mixedlm(md_str,
                             data=X_train[[target_col] + fet],
                             groups=X_train[panel_col])
            mdf = md.fit()
            predicted_values = mdf.fittedvalues

            coefficients = mdf.fe_params.to_dict()
            model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]

            pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]

            if (len(model_positive) / len(selected_features)) > 0 and (
                    len(pvalues) / len(selected_features)) >= 0:  # srishti - changed just for testing, revert later
                # predicted_values = model.predict(X_train)
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                r2 = r2_score(y_train, predicted_values)
                adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)

                filename = os.path.join(save_path, f"model_{i}.pkl")
                with open(filename, "wb") as f:
                    pickle.dump(mdf, f)
                # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
                #     model = pickle.load(file)

                st.session_state['Model_results']['Model_object'].append(filename)
                st.session_state['Model_results']['Model_iteration'].append(i)
                st.session_state['Model_results']['Feature_set'].append(fet)
                st.session_state['Model_results']['MAPE'].append(mape)
                st.session_state['Model_results']['R2'].append(r2)
                st.session_state['Model_results']['pos_count'].append(len(model_positive))
                st.session_state['Model_results']['ADJR2'].append(adjr2)

            current_time = time.time()
            time_taken = current_time - start_time
            time_elapsed_minutes = time_taken / 60
            completed_iterations_text = f"{i + 1}/{iterations}"
            progress_bar.progress((i + 1) / int(iterations))
            progress_text.text(
                f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
        st.write(
            f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')

    else:

        for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]):  # srishti
            df = st.session_state['media_data']

            fet = [var for var in selected_features if len(var) > 0]
            inp_vars_str = " + ".join(fet)

            X = df[fet]
            y = df[target_col]
            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
            X = sm.add_constant(X)
            X_train = X.iloc[:130]
            X_test = X.iloc[130:]
            y_train = y.iloc[:130]
            y_test = y.iloc[130:]

            model = sm.OLS(y_train, X_train).fit()

            coefficients = model.params.to_list()
            model_positive = [coef for coef in coefficients if coef > 0]
            predicted_values = model.predict(X_train)
            pvalues = [var for var in list(model.pvalues) if var <= 0.06]

            # if (len(model_positive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
            if (len(model_positive) / len(selected_features)) > 0 and (len(pvalues) / len(
                    selected_features)) >= 0.5:  # srishti - changed just for testing, revert later VALID MODEL CRITERIA
                # predicted_values = model.predict(X_train)
                mape = mean_absolute_percentage_error(y_train, predicted_values)
                adjr2 = model.rsquared_adj
                r2 = model.rsquared

                filename = os.path.join(save_path, f"model_{i}.pkl")
                with open(filename, "wb") as f:
                    pickle.dump(model, f)
                # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
                #     model = pickle.load(file)

                st.session_state['Model_results']['Model_object'].append(filename)
                st.session_state['Model_results']['Model_iteration'].append(i)
                st.session_state['Model_results']['Feature_set'].append(fet)
                st.session_state['Model_results']['MAPE'].append(mape)
                st.session_state['Model_results']['R2'].append(r2)
                st.session_state['Model_results']['ADJR2'].append(adjr2)
                st.session_state['Model_results']['pos_count'].append(len(model_positive))

            current_time = time.time()
            time_taken = current_time - start_time
            time_elapsed_minutes = time_taken / 60
            completed_iterations_text = f"{i + 1}/{iterations}"
            progress_bar.progress((i + 1) / int(iterations))
            progress_text.text(
                f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
        st.write(
            f'Out of {st.session_state["iterations"]} iterations : {len(st.session_state["Model_results"]["Model_object"])} valid models')

    pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')

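
# Minimal sketch of the panel branch above (toy data, hypothetical names): a
# random-intercept model per panel via the statsmodels formula API.
# toy = pd.DataFrame({'y': np.random.rand(40), 'x1': np.random.rand(40),
#                     'panel': ['a', 'b'] * 20})
# toy_fit = smf.mixedlm('y ~ x1', data=toy, groups=toy['panel']).fit()
# toy_fit.fe_params        # fixed-effect coefficients (used for pos_count)
# toy_fit.random_effects   # per-panel intercepts (used in get_random_effects)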

def to_percentage(value):
    return f'{value * 100:.1f}%'


## Section 5 - Select Model
st.title('2. Select Models')
if 'tick' not in st.session_state:
    st.session_state['tick'] = False
if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
    st.session_state['tick'] = True
    st.write('Select one model iteration to generate performance metrics for it:')
    data = pd.DataFrame(st.session_state['Model_results'])
    # Sprint4 -- Srishti -- only show models with the highest number of positive coefficients
    data = data[data['pos_count'] == data['pos_count'].max()].reset_index(drop=True)
    data.sort_values(by=['ADJR2'], ascending=False, inplace=True)
    data.drop_duplicates(subset='Model_iteration', inplace=True)
    top_10 = data.head(10)
    top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
    top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
    top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
    # top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']]
    gd = GridOptionsBuilder.from_dataframe(top_10_table)
    gd.configure_pagination(enabled=True)

    gd.configure_selection(
        use_checkbox=True,
        selection_mode="single",
        pre_select_all_rows=False,
        pre_selected_rows=[1],
    )

    gridoptions = gd.build()

    table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)

    selected_rows = table.selected_rows
    # if st.session_state["selected_rows"] != selected_rows:
    #     st.session_state["build_rc_cb"] = False
    st.session_state["selected_rows"] = selected_rows
    if 'Model' not in st.session_state:
        st.session_state['Model'] = {}

    # Section 6 - Display Results

    if len(selected_rows) > 0:
        st.header('2.1 Results Summary')

        model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
        features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']

        with open(str(model_object.values[0]), 'rb') as file:
            # print(file)
            model = pickle.load(file)
        st.write(model.summary())
        st.header('2.2 Actual vs. Predicted Plot')

        if is_panel:
            df = st.session_state['media_data']
            X = df[features_set.values[0]]
            y = df[target_col]

            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)

            # Sprint2 changes
            X[target_col] = y  # new
            X[panel_col] = df[panel_col]
            X[date_col] = date

            X_train = X.iloc[:8000]
            X_test = X.iloc[8000:].reset_index(drop=True)
            y_train = y.iloc[:8000]
            y_test = y.iloc[8000:].reset_index(drop=True)

            test_spends = spends_data[8000:]  # Sprint3 - test spends for resp curves
            random_eff_df = get_random_effects(media_data, panel_col, model)
            train_pred = model.fittedvalues
            test_pred = mdf_predict(X_test, model, random_eff_df)
            print("__" * 20, test_pred.isna().sum())

        else:
            df = st.session_state['media_data']
            X = df[features_set.values[0]]
            y = df[target_col]

            ss = MinMaxScaler()
            X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
            X = sm.add_constant(X)

            X[date_col] = date

            X_train = X.iloc[:130]
            X_test = X.iloc[130:].reset_index(drop=True)
            y_train = y.iloc[:130]
            y_test = y.iloc[130:].reset_index(drop=True)

            test_spends = spends_data[130:]  # Sprint3 - test spends for resp curves
            train_pred = model.predict(X_train[features_set.values[0] + ['const']])
            test_pred = model.predict(X_test[features_set.values[0] + ['const']])

        # save x test to test - srishti
        x_test_to_save = X_test.copy()
        x_test_to_save['Actuals'] = y_test
        x_test_to_save['Predictions'] = test_pred

        x_train_to_save = X_train.copy()
        x_train_to_save['Actuals'] = y_train
        x_train_to_save['Predictions'] = train_pred

        x_train_to_save.to_csv('Test/x_train_to_save.csv', index=False)
        x_test_to_save.to_csv('Test/x_test_to_save.csv', index=False)

        st.session_state['X'] = X_train
        st.session_state['features_set'] = features_set.values[0]
        print("**" * 20, "selected model features : ", features_set.values[0])
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
            X_train[date_col], y_train, train_pred, model,
            target_column=sel_target_col, is_panel=is_panel)  # Sprint2

        st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

        st.markdown('## 2.3 Residual Analysis')
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, train_pred, X_train)  # Sprint2
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, train_pred)  # Sprint2
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, train_pred)  # Sprint2
            st.pyplot(fig)

        vif_data = pd.DataFrame()
        # X=X.drop('const',axis=1)
        X_train_orig = X_train.copy()  # Sprint2 -- keep a copy; panel, target & date are dropped from X_train below
        del_col_list = list(set([target_col, panel_col, date_col]).intersection(list(X_train.columns)))
        X_train.drop(columns=del_col_list, inplace=True)  # Sprint2

        vif_data["Variable"] = X_train.columns
        vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
        vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
        vif_data = np.round(vif_data)
        vif_data['VIF'] = vif_data['VIF'].astype(float)
        st.header('2.4 Variance Inflation Factor (VIF)')
        # st.dataframe(vif_data)
        color_mapping = {
            'darkgreen': (vif_data['VIF'] < 3),
            'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
            'darkred': (vif_data['VIF'] > 10)
        }

        # Create a horizontal bar plot
        fig, ax = plt.subplots()
        fig.set_figwidth(10)  # Adjust the width of the figure as needed

        # Sort the bars by descending VIF values
        vif_data = vif_data.sort_values(by='VIF', ascending=False)

        # Iterate through the color mapping and plot bars with corresponding colors
        for color, condition in color_mapping.items():
            subset = vif_data[condition]
            bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)

            # Add text annotations on top of the bars
            for bar in bars:
                width = bar.get_width()
                ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
                            textcoords='offset points', va='center')

        # Customize the plot
        ax.set_xlabel('VIF Values')
        # ax.set_title('2.4 Variance Inflation Factor (VIF)')
        # ax.legend(loc='upper right')

        # Display the plot in Streamlit
        st.pyplot(fig)

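
        # Note on the chart above: VIF_i = 1 / (1 - R_i^2), where R_i^2 comes
        # from regressing feature i on the remaining features. A hand-rolled
        # sketch (illustrative; like variance_inflation_factor, it regresses
        # on the other columns as given, without adding an intercept):
        # def vif_by_hand(X, i):
        #     others = X.drop(columns=[X.columns[i]])
        #     r2_i = sm.OLS(X.iloc[:, i], others).fit().rsquared
        #     return 1.0 / (1.0 - r2_i)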

        with st.expander('Results Summary Test data'):
            # ss = MinMaxScaler()
            # X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
            st.header('2.2 Actual vs. Predicted Plot')

            metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(
                X_test[date_col], y_test, test_pred, model,
                target_column=sel_target_col, is_panel=is_panel)  # Sprint2

            st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

            st.markdown('## 2.3 Residual Analysis')
            columns = st.columns(2)
            with columns[0]:
                fig = plot_residual_predicted(y, test_pred, X_test)  # Sprint2
                st.plotly_chart(fig)

            with columns[1]:
                st.empty()
                fig = qqplot(y, test_pred)  # Sprint2
                st.plotly_chart(fig)

            with columns[0]:
                fig = residual_distribution(y, test_pred)  # Sprint2
                st.pyplot(fig)

        value = False
        save_button_model = st.checkbox('Save this model to tune', key='build_rc_cb')  # , on_click=set_save())

        if save_button_model:
            mod_name = st.text_input('Enter model name')
            if len(mod_name) > 0:
                mod_name = mod_name + "__" + target_col  # Sprint4 - adding target col to model name
                if is_panel:
                    pred_train = model.fittedvalues
                    pred_test = mdf_predict(X_test, model, random_eff_df)
                else:
                    st.session_state['features_set'] = st.session_state['features_set'] + ['const']
                    pred_train = model.predict(X_train_orig[st.session_state['features_set']])
                    pred_test = model.predict(X_test[st.session_state['features_set']])

                st.session_state['Model'][mod_name] = {"Model_object": model,
                                                       'feature_set': st.session_state['features_set'],
                                                       'X_train': X_train_orig,
                                                       'X_test': X_test,
                                                       'y_train': y_train,
                                                       'y_test': y_test,
                                                       'pred_train': pred_train,
                                                       'pred_test': pred_test
                                                       }
                st.session_state['X_train'] = X_train_orig
                # st.session_state['X_test'] = X_test
                # st.session_state['y_train'] = y_train
                # st.session_state['y_test'] = y_test
                st.session_state['X_test_spends'] = test_spends
                # st.session_state['base_model'] = model
                # st.session_state['base_model_feature_set'] = st.session_state['features_set']
                st.session_state['saved_model_names'].append(mod_name)
                # Sprint3 additions
                if is_panel:
                    random_eff_df = get_random_effects(media_data, panel_col, model)
                    st.session_state['random_effects'] = random_eff_df

                # st.session_state['pred_train'] = model.fittedvalues
                # st.session_state['pred_test'] = mdf_predict(X_test, model, random_eff_df)
                # # End of Sprint3 additions

                with open("best_models.pkl", "wb") as f:
                    pickle.dump(st.session_state['Model'], f)
                st.success(mod_name + ' model saved! Proceed to the next page to tune the model')
                # Sprint4 - add the formatted name of the target col to used response metrics
                urm = st.session_state['used_response_metrics']
                urm.append(sel_target_col)
                st.session_state['used_response_metrics'] = list(set(urm))
                mod_name = ""
                value = False
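
# Illustrative sketch (hypothetical usage, mirrors the dump above): downstream
# pages can restore the saved models with
# with open("best_models.pkl", "rb") as f:
#     saved_models = pickle.load(f)  # {model_name: {"Model_object": ..., ...}}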
pages/4_Saved_Model_Results.py
CHANGED
@@ -7,16 +7,14 @@ import statsmodels.api as sm
|
|
7 |
from sklearn.metrics import mean_absolute_percentage_error
|
8 |
import sys
|
9 |
import os
|
10 |
-
from utilities import
|
11 |
-
load_local_css,
|
12 |
-
load_authenticator)
|
13 |
import seaborn as sns
|
14 |
import matplotlib.pyplot as plt
|
15 |
import sweetviz as sv
|
16 |
import tempfile
|
17 |
from sklearn.preprocessing import MinMaxScaler
|
18 |
from st_aggrid import AgGrid
|
19 |
-
from st_aggrid import GridOptionsBuilder,GridUpdateMode
|
20 |
from st_aggrid import GridOptionsBuilder
|
21 |
import sys
|
22 |
import re
|
@@ -24,390 +22,586 @@ import re
|
|
24 |
sys.setrecursionlimit(10**6)
|
25 |
|
26 |
original_stdout = sys.stdout
|
27 |
-
sys.stdout = open(
|
28 |
sys.stdout.close()
|
29 |
sys.stdout = original_stdout
|
30 |
|
31 |
-
st.set_page_config(layout=
|
32 |
-
load_local_css(
|
33 |
set_header()
|
34 |
|
35 |
for k, v in st.session_state.items():
|
36 |
-
if k not in [
|
37 |
st.session_state[k] = v
|
38 |
|
39 |
-
authenticator = st.session_state.get(
|
40 |
if authenticator is None:
|
41 |
authenticator = load_authenticator()
|
42 |
|
43 |
-
name, authentication_status, username = authenticator.login(
|
44 |
-
auth_status = st.session_state.get(
|
45 |
|
46 |
if auth_status == True:
|
47 |
-
is_state_initiaized = st.session_state.get(
|
48 |
if not is_state_initiaized:
|
49 |
-
a=1
|
50 |
-
|
51 |
|
52 |
def plot_residual_predicted(actual, predicted, df_):
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
def residual_distribution(actual, predicted):
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
def qqplot(actual, predicted):
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
diagonal_line = go.Scatter(
|
102 |
-
x=[-2, 2], # Adjust the x values as needed to fit the range of your data
|
103 |
-
y=[-2, 2], # Adjust the y values accordingly
|
104 |
-
mode='lines',
|
105 |
-
line=dict(color='red'), # Customize the line color and style
|
106 |
-
name=' '
|
107 |
)
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
|
|
116 |
|
117 |
def plot_actual_vs_predicted(date, y, predicted_values, model):
|
118 |
|
119 |
fig = go.Figure()
|
120 |
|
121 |
-
fig.add_trace(
|
122 |
-
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
# Calculate MAPE
|
125 |
-
mape = mean_absolute_percentage_error(y, predicted_values)*100
|
126 |
-
|
127 |
# Calculate R-squared
|
128 |
rss = np.sum((y - predicted_values) ** 2)
|
129 |
tss = np.sum((y - np.mean(y)) ** 2)
|
130 |
r_squared = 1 - (rss / tss)
|
131 |
-
|
132 |
# Get the number of predictors
|
133 |
num_predictors = model.df_model
|
134 |
-
|
135 |
# Get the number of samples
|
136 |
num_samples = len(y)
|
137 |
-
|
138 |
# Calculate Adjusted R-squared
|
139 |
-
adj_r_squared = 1 - (
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
143 |
fig.update_layout(
|
144 |
-
xaxis=dict(title=
|
145 |
-
yaxis=dict(title=
|
146 |
-
title=f
|
147 |
-
xaxis_tickangle=-30
|
148 |
)
|
149 |
|
150 |
-
return metrics_table,fig
|
|
|
151 |
def contributions(X, model):
|
152 |
X1 = X.copy()
|
153 |
for j, col in enumerate(X1.columns):
|
154 |
X1[col] = X1[col] * model.params.values[j]
|
155 |
|
156 |
-
return np.round(
|
|
|
|
|
157 |
|
158 |
-
transformed_data=pd.read_csv(
|
159 |
|
160 |
# hard coded for now, need to get features set from model
|
161 |
|
162 |
-
feature_set_dct={
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
217 |
X1 = X.copy()
|
218 |
for j, col in enumerate(X1.columns):
|
219 |
X1[col] = X1[col] * model.params.values[j]
|
220 |
-
|
221 |
-
contributions= np.round(
|
222 |
-
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
return contributions
|
226 |
-
|
227 |
|
228 |
-
def model_fit(features_set,target):
|
229 |
X = transformed_data[features_set]
|
230 |
-
y=
|
231 |
ss = MinMaxScaler()
|
232 |
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
|
233 |
X = sm.add_constant(X)
|
234 |
-
X_train=X.iloc[:150]
|
235 |
-
X_test=X.iloc[150:]
|
236 |
-
y_train=y.iloc[:150]
|
237 |
-
y_test=y.iloc[150:]
|
238 |
model = sm.OLS(y_train, X_train).fit()
|
239 |
predicted_values_train = model.predict(X_train)
|
240 |
r2 = model.rsquared
|
241 |
adjr2 = model.rsquared_adj
|
242 |
train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
|
243 |
-
test_mape=mean_absolute_percentage_error(y_test, model.predict(X_test))
|
244 |
-
summary=model.summary()
|
245 |
-
train_contributions=contributions(X_train,model,[target])
|
246 |
-
return
|
247 |
-
|
248 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
-
metrics_table=pd.DataFrame()
|
251 |
|
252 |
-
if
|
253 |
-
st.session_state["contribution_df"]=pd.DataFrame()
|
254 |
|
255 |
-
for target,feature_set in feature_set_dct.items():
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
# st.write(st.session_state["contribution_df"])
|
263 |
-
|
264 |
-
|
265 |
-
metrics_table.reset_index(drop=True,inplace=True)
|
266 |
-
|
267 |
|
|
|
268 |
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
eda_columns=st.columns(2)
|
275 |
with eda_columns[1]:
|
276 |
-
eda=st.button(
|
277 |
-
|
278 |
-
|
|
|
279 |
|
280 |
# st.markdown('Model Metrics')
|
281 |
-
|
282 |
-
st.title('Contribution Overview')
|
283 |
|
284 |
-
|
285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
|
287 |
for selection in contribution_selections:
|
288 |
|
289 |
-
trace=go.Bar(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
290 |
trace_data.append(trace)
|
291 |
|
292 |
layout = go.Layout(
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
fig = go.Figure(data=trace_data, layout=layout)
|
299 |
-
st.plotly_chart(fig,use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
300 |
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
|
309 |
-
|
310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
# table=metrics_table.iloc[:,:-2]
|
312 |
# table.insert(0, "Select", False)
|
313 |
# selection_table=st.data_editor(table,column_config={"Select": st.column_config.CheckboxColumn(required=True)})
|
314 |
-
|
315 |
-
|
316 |
|
317 |
-
if len(table.selected_rows)==0:
|
318 |
-
st.warning(
|
|
|
|
|
319 |
st.stop()
|
320 |
-
else:
|
321 |
-
target_column=table.selected_rows[0][
|
322 |
-
feature_set=feature_set_dct[target_column]
|
323 |
|
324 |
with eda_columns[1]:
|
325 |
if eda:
|
|
|
326 |
def generate_report_with_target(channel_data, target_feature):
|
327 |
-
report = sv.analyze(
|
|
|
|
|
328 |
temp_dir = tempfile.mkdtemp()
|
329 |
report_path = os.path.join(temp_dir, "report.html")
|
330 |
-
report.show_html(
|
|
|
|
|
331 |
return report_path
|
332 |
-
|
333 |
-
report_data=transformed_data[feature_set]
|
334 |
-
report_data[target_column]=transformed_data[target_column]
|
335 |
report_file = generate_report_with_target(report_data, target_column)
|
336 |
-
|
337 |
if os.path.exists(report_file):
|
338 |
-
with open(report_file,
|
339 |
st.download_button(
|
340 |
label="Download EDA Report",
|
341 |
data=f.read(),
|
342 |
file_name="report.html",
|
343 |
-
mime="text/html"
|
344 |
)
|
345 |
else:
|
346 |
st.warning("Report generation failed. Unable to find the report file.")
|
347 |
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
st.header(
|
352 |
st.write(model.summary())
|
353 |
-
X=transformed_data[feature_set]
|
354 |
-
ss=MinMaxScaler()
|
355 |
-
X=pd.DataFrame(ss.fit_transform(X),columns=X.columns)
|
356 |
-
X=sm.add_constant(X)
|
357 |
-
y=transformed_data[target_column]
|
358 |
-
X_train=X.iloc[:150]
|
359 |
-
X_test=X.iloc[150:]
|
360 |
-
y_train=y.iloc[:150]
|
361 |
-
y_test=y.iloc[150:]
|
362 |
-
X.index=transformed_data[
|
363 |
-
y.index=transformed_data[
|
364 |
-
|
365 |
-
metrics_table_train,fig_train= plot_actual_vs_predicted(
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
st.
|
380 |
-
|
381 |
-
st.
|
382 |
-
|
383 |
-
st.
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
|
|
|
|
|
|
388 |
with columns[0]:
|
389 |
-
fig=plot_residual_predicted(y_train,model.predict(Xtrain1),Xtrain1)
|
390 |
st.plotly_chart(fig)
|
391 |
|
392 |
with columns[1]:
|
393 |
st.empty()
|
394 |
-
fig = qqplot(y_train,model.predict(X_train))
|
395 |
st.plotly_chart(fig)
|
396 |
|
397 |
with columns[0]:
|
398 |
-
fig=residual_distribution(y_train,model.predict(X_train))
|
399 |
st.pyplot(fig)
|
400 |
|
401 |
|
402 |
-
|
403 |
elif auth_status == False:
|
404 |
-
st.error(
|
405 |
try:
|
406 |
-
username_forgot_pw, email_forgot_password, random_password =
|
|
|
|
|
407 |
if username_forgot_pw:
|
408 |
-
st.success(
|
409 |
# Random password to be transferred to the user securely
|
410 |
elif username_forgot_pw == False:
|
411 |
-
st.error(
|
412 |
except Exception as e:
|
413 |
st.error(e)
|
|
|
7 |
from sklearn.metrics import mean_absolute_percentage_error
|
8 |
import sys
|
9 |
import os
|
10 |
+
from utilities import set_header, load_local_css, load_authenticator
|
|
|
|
|
11 |
import seaborn as sns
|
12 |
import matplotlib.pyplot as plt
|
13 |
import sweetviz as sv
|
14 |
import tempfile
|
15 |
from sklearn.preprocessing import MinMaxScaler
|
16 |
from st_aggrid import AgGrid
|
17 |
+
from st_aggrid import GridOptionsBuilder, GridUpdateMode
|
18 |
from st_aggrid import GridOptionsBuilder
|
19 |
import sys
|
20 |
import re
|
|
|
22 |
sys.setrecursionlimit(10**6)
|
23 |
|
24 |
original_stdout = sys.stdout
|
25 |
+
sys.stdout = open("temp_stdout.txt", "w")
|
26 |
sys.stdout.close()
|
27 |
sys.stdout = original_stdout
|
28 |
|
29 |
+
st.set_page_config(layout="wide")
|
30 |
+
load_local_css("styles.css")
|
31 |
set_header()
|
32 |
|
33 |
for k, v in st.session_state.items():
|
34 |
+
if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
|
35 |
st.session_state[k] = v
|
36 |
|
37 |
+
authenticator = st.session_state.get("authenticator")
|
38 |
if authenticator is None:
|
39 |
authenticator = load_authenticator()
|
40 |
|
41 |
+
name, authentication_status, username = authenticator.login("Login", "main")
|
42 |
+
auth_status = st.session_state.get("authentication_status")
|
43 |
|
44 |
if auth_status == True:
|
45 |
+
is_state_initiaized = st.session_state.get("initialized", False)
|
46 |
if not is_state_initiaized:
|
47 |
+
a = 1
|
|
|
48 |
|
49 |
def plot_residual_predicted(actual, predicted, df_):
|
50 |
+
df_["Residuals"] = actual - pd.Series(predicted)
|
51 |
+
df_["StdResidual"] = (df_["Residuals"] - df_["Residuals"].mean()) / df_[
|
52 |
+
"Residuals"
|
53 |
+
].std()
|
54 |
+
|
55 |
+
# Create a Plotly scatter plot
|
56 |
+
fig = px.scatter(
|
57 |
+
df_,
|
58 |
+
x=predicted,
|
59 |
+
y="StdResidual",
|
60 |
+
opacity=0.5,
|
61 |
+
color_discrete_sequence=["#11B6BD"],
|
62 |
+
)
|
63 |
+
|
64 |
+
# Add horizontal lines
|
65 |
+
fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
|
66 |
+
fig.add_hline(y=2, line_color="red")
|
67 |
+
fig.add_hline(y=-2, line_color="red")
|
68 |
+
|
69 |
+
fig.update_xaxes(title="Predicted")
|
70 |
+
fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")
|
71 |
+
|
72 |
+
# Set the same width and height for both figures
|
73 |
+
fig.update_layout(
|
74 |
+
title="Residuals over Predicted Values",
|
75 |
+
autosize=False,
|
76 |
+
width=600,
|
77 |
+
height=400,
|
78 |
+
)
|
79 |
+
|
80 |
+
return fig
|
81 |
|
82 |
def residual_distribution(actual, predicted):
|
83 |
+
Residuals = actual - pd.Series(predicted)
|
84 |
+
|
85 |
+
# Create a Seaborn distribution plot
|
86 |
+
sns.set(style="whitegrid")
|
87 |
+
plt.figure(figsize=(6, 4))
|
88 |
+
sns.histplot(Residuals, kde=True, color="#11B6BD")
|
89 |
+
|
90 |
+
plt.title(" Distribution of Residuals")
|
91 |
+
plt.xlabel("Residuals")
|
92 |
+
plt.ylabel("Probability Density")
|
93 |
+
|
94 |
+
return plt
|
95 |
+
|
|
|
96 |
def qqplot(actual, predicted):
|
97 |
+
Residuals = actual - pd.Series(predicted)
|
98 |
+
Residuals = pd.Series(Residuals)
|
99 |
+
Resud_std = (Residuals - Residuals.mean()) / Residuals.std()
|
100 |
+
|
101 |
+
# Create a QQ plot using Plotly with custom colors
|
102 |
+
fig = go.Figure()
|
103 |
+
fig.add_trace(
|
104 |
+
go.Scatter(
|
105 |
+
x=sm.ProbPlot(Resud_std).theoretical_quantiles,
|
106 |
+
y=sm.ProbPlot(Resud_std).sample_quantiles,
|
107 |
+
mode="markers",
|
108 |
+
marker=dict(size=5, color="#11B6BD"),
|
109 |
+
name="QQ Plot",
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
)
|
111 |
+
)
|
112 |
+
|
113 |
+
# Add the 45-degree reference line
|
114 |
+
diagonal_line = go.Scatter(
|
115 |
+
x=[-2, 2], # Adjust the x values as needed to fit the range of your data
|
116 |
+
y=[-2, 2], # Adjust the y values accordingly
|
117 |
+
mode="lines",
|
118 |
+
line=dict(color="red"), # Customize the line color and style
|
119 |
+
name=" ",
|
120 |
+
)
|
121 |
+
fig.add_trace(diagonal_line)
|
122 |
+
|
123 |
+
# Customize the layout
|
124 |
+
fig.update_layout(
|
125 |
+
title="QQ Plot of Residuals",
|
126 |
+
title_x=0.5,
|
127 |
+
autosize=False,
|
128 |
+
width=600,
|
129 |
+
height=400,
|
130 |
+
xaxis_title="Theoretical Quantiles",
|
131 |
+
yaxis_title="Sample Quantiles",
|
132 |
+
)
|
133 |
|
134 |
+
return fig
|
135 |
|
136 |
def plot_actual_vs_predicted(date, y, predicted_values, model):

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(x=date, y=y, mode="lines", name="Actual", line=dict(color="blue"))
    )
    fig.add_trace(
        go.Scatter(
            x=date,
            y=predicted_values,
            mode="lines",
            name="Predicted",
            line=dict(color="orange"),
        )
    )

    # Calculate MAPE
    mape = mean_absolute_percentage_error(y, predicted_values) * 100

    # Calculate R-squared
    rss = np.sum((y - predicted_values) ** 2)
    tss = np.sum((y - np.mean(y)) ** 2)
    r_squared = 1 - (rss / tss)

    # Get the number of predictors
    num_predictors = model.df_model

    # Get the number of samples
    num_samples = len(y)

    # Calculate Adjusted R-squared
    adj_r_squared = 1 - (
        (1 - r_squared) * ((num_samples - 1) / (num_samples - num_predictors - 1))
    )
    metrics_table = pd.DataFrame(
        {
            "Metric": ["MAPE", "R-squared", "AdjR-squared"],
            "Value": [mape, r_squared, adj_r_squared],
        }
    )
    fig.update_layout(
        xaxis=dict(title="Date"),
        yaxis=dict(title="Value"),
        title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
        xaxis_tickangle=-30,
    )

    return metrics_table, fig
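# Adjusted R-squared above penalizes R-squared for model size:
# adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), with n samples and p
# predictors (model.df_model). A hedged, self-contained check with made-up
# numbers (illustrative only; not called anywhere in this page):
def _adj_r2_demo(r_squared=0.90, num_samples=150, num_predictors=11):
    return 1 - (1 - r_squared) * ((num_samples - 1) / (num_samples - num_predictors - 1))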
def contributions(X, model):
    # NOTE: superseded by the three-argument contributions(X, model, target) defined below.
    X1 = X.copy()
    for j, col in enumerate(X1.columns):
        X1[col] = X1[col] * model.params.values[j]

    return np.round(
        (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
    )
transformed_data = pd.read_csv("transformed_data.csv")

# hard coded for now, need to get features set from model
feature_set_dct = {
    "app_installs_-_appsflyer": [
        "paid_search_clicks",
        "fb:_level_achieved_-_tier_1_impressions_lag2",
        "fb:_level_achieved_-_tier_2_clicks_lag2",
        "paid_social_others_impressions_adst.1",
        "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
        "digital_tactic_others_clicks",
        "kwai_clicks_adst.3",
        "programmaticclicks",
        "indicacao_clicks_adst.1",
        "infleux_clicks_adst.4",
        "influencer_clicks",
    ],
    "account_requests_-_appsflyer": [
        "paid_search_impressions",
        "fb:_level_achieved_-_tier_1_clicks_adst.1",
        "fb:_level_achieved_-_tier_2_clicks_adst.1",
        "paid_social_others_clicks_lag2",
        "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
        "digital_tactic_others_clicks_adst.1",
        "kwai_clicks_adst.2",
        "programmaticimpressions_lag4_adst.1",
        "indicacao_clicks",
        "infleux_clicks_adst.2",
        "influencer_clicks",
    ],
    "total_approved_accounts_-_appsflyer": [
        "paid_search_clicks",
        "fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
        "fb:_level_achieved_-_tier_2_impressions_lag2",
        "paid_social_others_clicks_lag2_adst.2",
        "ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
        "digital_tactic_others_clicks",
        "kwai_impressions_adst.2",
        "programmaticclicks_adst.5",
        "indicacao_clicks_adst.1",
        "infleux_clicks_adst.3",
        "influencer_clicks",
    ],
    "total_approved_accounts_-_revenue": [
        "paid_search_impressions_adst.5",
        "kwai_impressions_lag2_adst.3",
        "indicacao_clicks_adst.3",
        "infleux_clicks_adst.3",
        "programmaticclicks_adst.4",
        "influencer_clicks_adst.3",
        "fb:_level_achieved_-_tier_1_impressions_adst.2",
        "fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
        "paid_social_others_impressions_adst.3",
        "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
        "digital_tactic_others_clicks_adst.2",
    ],
}

# """ the above part should be modified so that we are fetching features set from the saved model"""
def contributions(X, model, target):
    X1 = X.copy()
    for j, col in enumerate(X1.columns):
        X1[col] = X1[col] * model.params.values[j]

    contributions = np.round(
        (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
    )
    contributions = (
        pd.DataFrame(contributions, columns=target)
        .reset_index()
        .rename(columns={"index": "Channel"})
    )
    contributions["Channel"] = [
        re.split(r"_imp|_cli", col)[0] for col in contributions["Channel"]
    ]

    return contributions
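# The contribution logic above multiplies each feature column by its fitted
# coefficient and reports each channel's summed effect as a share of the total.
# A toy sketch with hypothetical channel names and betas (illustrative only;
# not called anywhere in this page):
def _contribution_share_demo():
    X_demo = pd.DataFrame({"tv_impressions": [10, 20], "search_clicks": [5, 5]})
    betas = np.array([2.0, 1.0])  # hypothetical fitted coefficients
    effects = (X_demo * betas).sum()  # per-channel total effect
    return np.round(effects / effects.sum() * 100, 2)  # percentage contribution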
def model_fit(features_set, target):
    X = transformed_data[features_set]
    y = transformed_data[target]
    ss = MinMaxScaler()
    X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
    X = sm.add_constant(X)
    X_train = X.iloc[:150]
    X_test = X.iloc[150:]
    y_train = y.iloc[:150]
    y_test = y.iloc[150:]
    model = sm.OLS(y_train, X_train).fit()
    predicted_values_train = model.predict(X_train)
    r2 = model.rsquared
    adjr2 = model.rsquared_adj
    train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
    test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
    summary = model.summary()
    train_contributions = contributions(X_train, model, [target])
    return (
        pd.DataFrame(
            {
                "Model": target,
                "R2": np.round(r2, 2),
                "ADJr2": np.round(adjr2, 2),
                "Train Mape": np.round(train_mape, 2),
                "Test Mape": np.round(test_mape, 2),
                "Summary": summary,
                "Model_object": model,
            },
            index=[0],
        ),
        train_contributions,
    )
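# model_fit ties the pieces together: scale the feature set to [0, 1], add an
# intercept, fit OLS on the first 150 rows, and score MAPE on the holdout rows.
# A hedged usage sketch for a single response metric (keys taken from
# feature_set_dct above; illustrative only):
# summary_row, contrib = model_fit(
#     features_set=feature_set_dct["app_installs_-_appsflyer"],
#     target="app_installs_-_appsflyer",
# )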
metrics_table = pd.DataFrame()

if "contribution_df" not in st.session_state:
    st.session_state["contribution_df"] = pd.DataFrame()

for target, feature_set in feature_set_dct.items():
    metrics_table = pd.concat(
        [metrics_table, model_fit(features_set=feature_set, target=target)[0]]
    )
    if st.session_state["contribution_df"].empty:
        st.session_state["contribution_df"] = model_fit(
            features_set=feature_set, target=target
        )[1]
    else:
        st.session_state["contribution_df"] = pd.merge(
            st.session_state["contribution_df"],
            model_fit(features_set=feature_set, target=target)[1],
        )

# st.write(st.session_state["contribution_df"])

metrics_table.reset_index(drop=True, inplace=True)

eda_columns = st.columns(2)
with eda_columns[1]:
    eda = st.button(
        "Generate EDA Report",
        help="Click to generate a bivariate report for the selected response metric from the table below.",
    )

# st.markdown('Model Metrics')
st.title("Contribution Overview")
|
344 |
+
|
345 |
+
contribution_selections = st.multiselect(
|
346 |
+
"Select the models to compare contributions",
|
347 |
+
[
|
348 |
+
col
|
349 |
+
for col in st.session_state["contribution_df"].columns
|
350 |
+
if col.lower() != "channel"
|
351 |
+
],
|
352 |
+
default=[
|
353 |
+
col
|
354 |
+
for col in st.session_state["contribution_df"].columns
|
355 |
+
if col.lower() != "channel"
|
356 |
+
][-1],
|
357 |
+
)
|
358 |
+
trace_data = []
|
359 |
|
360 |
for selection in contribution_selections:
|
361 |
|
362 |
+
trace = go.Bar(
|
363 |
+
x=st.session_state["contribution_df"]["Channel"],
|
364 |
+
y=st.session_state["contribution_df"][selection],
|
365 |
+
name=selection,
|
366 |
+
text=np.round(st.session_state["contribution_df"][selection], 0)
|
367 |
+
.astype(int)
|
368 |
+
.astype(str)
|
369 |
+
+ "%",
|
370 |
+
textposition="outside",
|
371 |
+
)
|
372 |
trace_data.append(trace)
|
373 |
|
374 |
layout = go.Layout(
|
375 |
+
title="Metrics Contribution by Channel",
|
376 |
+
xaxis=dict(title="Channel Name"),
|
377 |
+
yaxis=dict(title="Metrics Contribution"),
|
378 |
+
barmode="group",
|
379 |
+
)
|
380 |
fig = go.Figure(data=trace_data, layout=layout)
|
381 |
+
st.plotly_chart(fig, use_container_width=True)
|
382 |
+
|
383 |
+
############################################ Waterfall Chart ############################################
# import plotly.graph_objects as go

# # Initialize a Plotly figure
# fig = go.Figure()

# for selection in contribution_selections:
#     # Ensure y_values are numeric
#     y_values = st.session_state["contribution_df"][selection].values.astype(float)

#     # Generating text labels for each bar, ensuring operations are compatible with string formats
#     text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]

#     fig.add_trace(
#         go.Waterfall(
#             name=selection,
#             orientation="v",
#             measure=["relative"]
#             * len(y_values),  # Adjust if you have absolute values at certain points
#             x=st.session_state["contribution_df"]["Channel"].tolist(),
#             text=text_values,
#             textposition="outside",
#             y=y_values,
#             increasing={"marker": {"color": "green"}},
#             decreasing={"marker": {"color": "red"}},
#             totals={"marker": {"color": "blue"}},
#         )
#     )

# fig.update_layout(
#     title="Metrics Contribution by Channel",
#     xaxis={"title": "Channel Name"},
#     yaxis={"title": "Metrics Contribution"},
#     height=600,
# )

# # Displaying the waterfall chart in Streamlit
# st.plotly_chart(fig, use_container_width=True)
import plotly.graph_objects as go

# Initialize a Plotly figure
fig = go.Figure()

for selection in contribution_selections:
    # Ensure contributions are numeric
    contributions = (
        st.session_state["contribution_df"][selection].values.astype(float).tolist()
    )
    channel_names = st.session_state["contribution_df"]["Channel"].tolist()

    display_name, display_contribution, base_contribution = [], [], 0
    for channel_name, contribution in zip(channel_names, contributions):
        if channel_name != "const":
            display_name.append(channel_name)
            display_contribution.append(contribution)
        else:
            base_contribution = contribution

    display_name = ["Base Sales"] + display_name
    display_contribution = [base_contribution] + display_contribution

    # Generating text labels for each bar, ensuring operations are compatible with string formats
    text_values = [
        f"{val}%" for val in np.round(display_contribution, 0).astype(int)
    ]

    fig.add_trace(
        go.Waterfall(
            orientation="v",
            measure=["relative"]
            * len(
                display_contribution
            ),  # Adjust if you have absolute values at certain points
            x=display_name,
            text=text_values,
            textposition="outside",
            y=display_contribution,
            increasing={"marker": {"color": "green"}},
            decreasing={"marker": {"color": "red"}},
            totals={"marker": {"color": "blue"}},
        )
    )

fig.update_layout(
    title="Metrics Contribution by Channel",
    xaxis={"title": "Channel Name"},
    yaxis={"title": "Metrics Contribution"},
    height=600,
)

# Displaying the waterfall chart in Streamlit
st.plotly_chart(fig, use_container_width=True)

############################################ Waterfall Chart ############################################
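# How the waterfall reads: every bar is a "relative" measure, so Plotly stacks
# the bars cumulatively from left to right; starting the sequence with "Base
# Sales" (the intercept's share) means the running total ends at roughly 100%
# once all channel contributions are added. A minimal sketch with hypothetical
# figures (illustrative only; not rendered anywhere in this page):
def _waterfall_demo():
    demo_fig = go.Figure(
        go.Waterfall(
            orientation="v",
            measure=["relative"] * 3,
            x=["Base Sales", "channel_a", "channel_b"],
            y=[40.0, 35.0, 25.0],  # hypothetical % contributions summing to 100
        )
    )
    return demo_fig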
st.title("Analysis of Models Result")
|
480 |
+
# st.markdown()
|
481 |
+
gd_table = metrics_table.iloc[:, :-2]
|
482 |
+
|
483 |
+
gd = GridOptionsBuilder.from_dataframe(gd_table)
|
484 |
+
# gd.configure_pagination(enabled=True)
|
485 |
+
gd.configure_selection(
|
486 |
+
use_checkbox=True,
|
487 |
+
selection_mode="single",
|
488 |
+
pre_select_all_rows=False,
|
489 |
+
pre_selected_rows=[1],
|
490 |
+
)
|
491 |
+
|
492 |
+
gridoptions = gd.build()
|
493 |
+
table = AgGrid(
|
494 |
+
gd_table, gridOptions=gridoptions, fit_columns_on_grid_load=True, height=200
|
495 |
+
)
|
496 |
# table=metrics_table.iloc[:,:-2]
|
497 |
# table.insert(0, "Select", False)
|
498 |
# selection_table=st.data_editor(table,column_config={"Select": st.column_config.CheckboxColumn(required=True)})
|
|
|
|
|
499 |
|
500 |
+
if len(table.selected_rows) == 0:
    st.warning(
        "Click on the checkbox to view comprehensive results of the selected model."
    )
    st.stop()
else:
    target_column = table.selected_rows[0]["Model"]
    feature_set = feature_set_dct[target_column]
with eda_columns[1]:
    if eda:

        def generate_report_with_target(channel_data, target_feature):
            report = sv.analyze(
                [channel_data, "Dataset"], target_feat=target_feature, verbose=False
            )
            temp_dir = tempfile.mkdtemp()
            report_path = os.path.join(temp_dir, "report.html")
            report.show_html(
                filepath=report_path, open_browser=False
            )  # Generate the report as an HTML file
            return report_path

        report_data = transformed_data[feature_set]
        report_data[target_column] = transformed_data[target_column]
        report_file = generate_report_with_target(report_data, target_column)

        if os.path.exists(report_file):
            with open(report_file, "rb") as f:
                st.download_button(
                    label="Download EDA Report",
                    data=f.read(),
                    file_name="report.html",
                    mime="text/html",
                )
        else:
            st.warning("Report generation failed. Unable to find the report file.")
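# The report above uses Sweetviz's target-aware mode: target_feat associates
# every feature's distribution with the selected response metric, and
# show_html(..., open_browser=False) writes a standalone HTML file that the
# download button then serves. A hedged sketch of the same calls (hypothetical
# frame and column names, illustrative only):
# import sweetviz as sv
# report = sv.analyze([some_df, "Dataset"], target_feat="some_target", verbose=False)
# report.show_html(filepath="report.html", open_browser=False)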
model = metrics_table[metrics_table["Model"] == target_column]["Model_object"].iloc[
    0
]
st.header("Model Summary")
st.write(model.summary())
X = transformed_data[feature_set]
ss = MinMaxScaler()
X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
X = sm.add_constant(X)
y = transformed_data[target_column]
X_train = X.iloc[:150]
X_test = X.iloc[150:]
y_train = y.iloc[:150]
y_test = y.iloc[150:]
X.index = transformed_data["date"]
y.index = transformed_data["date"]

metrics_table_train, fig_train = plot_actual_vs_predicted(
    X_train.index, y_train, model.predict(X_train), model
)
metrics_table_test, fig_test = plot_actual_vs_predicted(
    X_test.index, y_test, model.predict(X_test), model
)

metrics_table_train = metrics_table_train.set_index("Metric").transpose()
metrics_table_train.index = ["Train"]
metrics_table_test = metrics_table_test.set_index("Metric").transpose()
metrics_table_test.index = ["Test"]
metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)

st.markdown("Result Overview")
st.dataframe(np.round(metrics_table, 2), use_container_width=True)

st.subheader("Actual vs Predicted Plot Train")

st.plotly_chart(fig_train, use_container_width=True)
st.subheader("Actual vs Predicted Plot Test")
st.plotly_chart(fig_test, use_container_width=True)

st.markdown("## Residual Analysis")
columns = st.columns(2)

Xtrain1 = X_train.copy()
with columns[0]:
    fig = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
    st.plotly_chart(fig)

with columns[1]:
    st.empty()
    fig = qqplot(y_train, model.predict(X_train))
    st.plotly_chart(fig)

with columns[0]:
    fig = residual_distribution(y_train, model.predict(X_train))
    st.pyplot(fig)
elif auth_status == False:
    st.error("Username/Password is incorrect")
    try:
        username_forgot_pw, email_forgot_password, random_password = (
            authenticator.forgot_password("Forgot password")
        )
        if username_forgot_pw:
            st.success("New password sent securely")
            # Random password to be transferred to the user securely
        elif username_forgot_pw == False:
            st.error("Username not found")
    except Exception as e:
        st.error(e)
pages/5_Model_Tuning_with_panel.py
ADDED
@@ -0,0 +1,527 @@
'''
MMO Build Sprint 3
date :
changes : capability to tune MixedLM as well as simple LR in the same page
'''

import streamlit as st
import pandas as pd
from Eda_functions import format_numbers
import pickle
from utilities import set_header, load_local_css
import statsmodels.api as sm
import re
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor

st.set_option('deprecation.showPyplotGlobalUse', False)
import statsmodels.formula.api as smf
from Data_prep_functions import *

# for i in ["model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features", "tuned_model", "tuned_model_dict"] :

st.set_page_config(
    page_title="Model Tuning",
    page_icon=":shark:",
    layout="wide",
    initial_sidebar_state='collapsed'
)
load_local_css('styles.css')
set_header()

# Sprint3
# is_panel = st.session_state['is_panel']
# panel_col = 'markets' # set the panel column
date_col = 'date'

panel_col = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1']][0]  # set the panel column
# NOTE: len(panel_col) measures the selected column-name string, so this is True
# whenever a panel column name exists; an empty 'Panel Level 1' list would
# already have failed on the [0] above.
is_panel = True if len(panel_col) > 0 else False


# flag indicating there is no tuned model till now

# Sprint4 - model tuned dict
if 'Model_Tuned' not in st.session_state:
    st.session_state['Model_Tuned'] = {}

st.title('1. Model Tuning')
# st.write(st.session_state['base_model_feature_set'])

if "X_train" not in st.session_state:
    st.error(
        "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
    st.stop()
# X_train=st.session_state['X_train']
# X_test=st.session_state['X_test']
# y_train=st.session_state['y_train']
# y_test=st.session_state['y_test']
# df=st.session_state['media_data']


# st.write(X_train.columns)
# st.write(X_test.columns)
if "is_tuned_model" not in st.session_state:
    st.session_state["is_tuned_model"] = {}
# Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics'] != []:
    sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
    target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")

else:
    sel_target_col = 'Total Approved Accounts - Revenue'
    target_col = 'total_approved_accounts_revenue'

# Sprint4 - Look through all saved models, only show saved models of the sel resp metric (target_col)
saved_models = st.session_state['saved_model_names']
required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
sel_model = st.selectbox("Select the model to tune", required_saved_models)

with open("best_models.pkl", 'rb') as file:
    model_dict = pickle.load(file)

sel_model_dict = model_dict[sel_model + "__" + target_col]  # Sprint4 - get the model obj of the selected model
# st.write(sel_model_dict)

X_train = sel_model_dict['X_train']
X_test = sel_model_dict['X_test']
y_train = sel_model_dict['y_train']
y_test = sel_model_dict['y_test']
df = st.session_state['media_data']

if 'selected_model' not in st.session_state:
    st.session_state['selected_model'] = 0

# st.write(model_dict[st.session_state["selected_model"]]['X_train'].columns)
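# Saved models are keyed as "<model name>__<response metric>", so filtering on
# the suffix after "__" recovers every model built for the selected target. A
# tiny sketch of the same convention (hypothetical keys, illustrative only;
# not called anywhere in this page):
def _filter_models_demo():
    saved = ["Model_1__revenue", "Model_2__app_installs", "Model_3__revenue"]
    return [m.split("__")[0] for m in saved if m.split("__")[1] == "revenue"]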
st.markdown('### 1.1 Event Flags')
st.markdown('Helps in quantifying the impact of specific occurrences of events')
with st.expander('Apply Event Flags'):
    # st.session_state["selected_model"]=st.selectbox('Select Model to apply flags',model_dict.keys())
    model = sel_model_dict['Model_object']
    date = st.session_state['date']
    date = pd.to_datetime(date)
    X_train = sel_model_dict['X_train']

    # features_set= model_dict[st.session_state["selected_model"]]['feature_set']
    features_set = sel_model_dict["feature_set"]

    col = st.columns(3)
    min_date = min(date)
    max_date = max(date)
    with col[0]:
        start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
    with col[1]:
        end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
    with col[2]:
        repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
    if repeat == 'Yes':
        repeat = True
    else:
        repeat = False

    if 'Flags' not in st.session_state:
        st.session_state['Flags'] = {}
    # print("**"*50)
    # print(y_train)
    # print("**"*50)
    # print(model.fittedvalues)
    if is_panel:  # Sprint3
        met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                              model.fittedvalues, model,
                                                              target_column=sel_target_col,
                                                              flag=(start_date, end_date),
                                                              repeat_all_years=repeat, is_panel=True)
        st.plotly_chart(fig_flag, use_container_width=True)

        # create flag on test
        met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
                                                                   sel_model_dict['pred_test'], model,
                                                                   target_column=sel_target_col,
                                                                   flag=(start_date, end_date),
                                                                   repeat_all_years=repeat, is_panel=True)

    else:
        pred_train = model.predict(X_train[features_set])
        met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train, pred_train, model,
                                                              flag=(start_date, end_date), repeat_all_years=repeat, is_panel=False)
        st.plotly_chart(fig_flag, use_container_width=True)

        pred_test = model.predict(X_test[features_set])
        met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test, pred_test, model,
                                                                   flag=(start_date, end_date), repeat_all_years=repeat, is_panel=False)
    flag_name = 'f1_flag'  # default placeholder, immediately overwritten by the text input below
    flag_name = st.text_input('Enter Flag Name')
    # Sprint4 - add selected target col to flag name
    if st.button('Update flag'):
        st.session_state['Flags'][flag_name + '__' + target_col] = {}
        st.session_state['Flags'][flag_name + '__' + target_col]['train'] = line_values
        st.session_state['Flags'][flag_name + '__' + target_col]['test'] = test_line_values
        # st.write(st.session_state['Flags'][flag_name])
        st.success(f'{flag_name + "__" + target_col} stored')

    # Sprint4 - only show flags created for the particular target col
    st.write(st.session_state['Flags'].keys())
    target_model_flags = [f.split("__")[0] for f in st.session_state['Flags'].keys() if f.split("__")[1] == target_col]
    options = list(target_model_flags)
    selected_options = []
    num_columns = 4
    num_rows = -(-len(options) // num_columns)

    tick = False
    if st.checkbox('Select all'):
        tick = True
        selected_options = []
    for row in range(num_rows):
        cols = st.columns(num_columns)
        for col in cols:
            if options:
                option = options.pop(0)
                selected = col.checkbox(option, value=tick)
                if selected:
                    selected_options.append(option)

st.markdown('### 1.2 Select Parameters to Apply')
parameters = st.columns(3)
with parameters[0]:
    Trend = st.checkbox("**Trend**")
    st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
with parameters[1]:
    week_number = st.checkbox('**Week_number**')
    st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
with parameters[2]:
    sine_cosine = st.checkbox('**Sine and Cosine Waves**')
    st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
#
# def get_tuned_model():
#     st.session_state['build_tuned_model']=True

if st.button('Build model with Selected Parameters and Flags', key='build_tuned_model'):
    new_features = features_set
    st.header('2.1 Results Summary')
    # date=list(df.index)
    # df = df.reset_index(drop=True)
    # st.write(df.head(2))
    # X_train=df[features_set]
    ss = MinMaxScaler()
    if is_panel == True:
        X_train_tuned = X_train[features_set]
        # X_train_tuned = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
        X_train_tuned[target_col] = X_train[target_col]
        X_train_tuned[date_col] = X_train[date_col]
        X_train_tuned[panel_col] = X_train[panel_col]

        X_test_tuned = X_test[features_set]
        # X_test_tuned = pd.DataFrame(ss.transform(X), columns=X.columns)
        X_test_tuned[target_col] = X_test[target_col]
        X_test_tuned[date_col] = X_test[date_col]
        X_test_tuned[panel_col] = X_test[panel_col]

    else:
        X_train_tuned = X_train[features_set]
        # X_train_tuned = pd.DataFrame(ss.fit_transform(X_train_tuned), columns=X_train_tuned.columns)

        X_test_tuned = X_test[features_set]
        # X_test_tuned = pd.DataFrame(ss.transform(X_test_tuned), columns=X_test_tuned.columns)

    for flag in selected_options:
        # Sprint4 - added target_col in flag name
        X_train_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['train']
        X_test_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['test']

    # test
    # X_train_tuned.to_csv("Test/X_train_tuned_flag.csv",index=False)
    # X_test_tuned.to_csv("Test/X_test_tuned_flag.csv",index=False)

    # print("()()"*20,flag, len(st.session_state['Flags'][flag]))
    if Trend:
        # Sprint3 - group by panel, calculate the trend of each panel separately, and add Trend to the new feature set
        if is_panel:
            newdata = pd.DataFrame()
            panel_wise_end_point_train = {}
            for panel, groupdf in X_train_tuned.groupby(panel_col):
                groupdf.sort_values(date_col, inplace=True)
                groupdf['Trend'] = np.arange(1, len(groupdf) + 1, 1)
                newdata = pd.concat([newdata, groupdf])
                panel_wise_end_point_train[panel] = len(groupdf)
            X_train_tuned = newdata.copy()

            test_newdata = pd.DataFrame()
            for panel, test_groupdf in X_test_tuned.groupby(panel_col):
                test_groupdf.sort_values(date_col, inplace=True)
                start = panel_wise_end_point_train[panel] + 1
                end = start + len(test_groupdf)  # should be + 1? - Sprint4
                # print("??"*20, panel, len(test_groupdf), len(np.arange(start, end, 1)), start)
                test_groupdf['Trend'] = np.arange(start, end, 1)
                test_newdata = pd.concat([test_newdata, test_groupdf])
            X_test_tuned = test_newdata.copy()

            new_features = new_features + ['Trend']

        else:
            X_train_tuned['Trend'] = np.arange(1, len(X_train_tuned) + 1, 1)
            X_test_tuned['Trend'] = np.arange(len(X_train_tuned) + 1, len(X_train_tuned) + len(X_test_tuned) + 1, 1)
            new_features = new_features + ['Trend']
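    # The panel-wise trend above gives each panel its own 1..n counter on train
    # and resumes the count per panel on test; pandas' groupby().cumcount()
    # reproduces the train half in one line. A hedged sketch with toy panel
    # names (illustrative only; not called anywhere in this page):
    def _panel_trend_demo():
        demo = pd.DataFrame({'market': ['a', 'a', 'b', 'b', 'b']})
        demo['Trend'] = demo.groupby('market').cumcount() + 1
        return demo  # Trend: 1, 2, 1, 2, 3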
    if week_number:
        # Sprint3 - create week number from the date column in X_train_tuned; add it to the new feature set
        # NOTE: dt.day_of_week returns the day of the week (0-6), not the calendar week number
        if is_panel:
            X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
            X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
            if X_train_tuned['Week_number'].nunique() == 1:
                st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
            else:
                X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
                X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
                new_features = new_features + ['Week_number']

        else:
            date = pd.to_datetime(date.values)
            X_train_tuned['Week_number'] = pd.to_datetime(X_train[date_col]).dt.day_of_week
            X_test_tuned['Week_number'] = pd.to_datetime(X_test[date_col]).dt.day_of_week
            new_features = new_features + ['Week_number']
    if sine_cosine:
        # Sprint3 - create panel-wise sine and cosine waves in X_train_tuned; add them to the new feature set
        if is_panel:
            new_features = new_features + ['sine_wave', 'cosine_wave']
            newdata = pd.DataFrame()
            newdata_test = pd.DataFrame()
            groups = X_train_tuned.groupby(panel_col)
            frequency = 2 * np.pi / 365  # Adjust the frequency as needed

            train_panel_wise_end_point = {}
            for panel, groupdf in groups:
                num_samples = len(groupdf)
                train_panel_wise_end_point[panel] = num_samples
                days_since_start = np.arange(num_samples)
                sine_wave = np.sin(frequency * days_since_start)
                cosine_wave = np.cos(frequency * days_since_start)
                sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
                assert len(sine_cosine_df) == len(groupdf)
                # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
                groupdf['sine_wave'] = sine_wave
                groupdf['cosine_wave'] = cosine_wave
                newdata = pd.concat([newdata, groupdf])

            X_train_tuned = newdata.copy()

            test_groups = X_test_tuned.groupby(panel_col)
            for panel, test_groupdf in test_groups:
                num_samples = len(test_groupdf)
                start = train_panel_wise_end_point[panel]
                days_since_start = np.arange(start, start + num_samples, 1)
                # print("##", panel, num_samples, start, len(np.arange(start, start+num_samples, 1)))
                sine_wave = np.sin(frequency * days_since_start)
                cosine_wave = np.cos(frequency * days_since_start)
                sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
                assert len(sine_cosine_df) == len(test_groupdf)
                # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
                test_groupdf['sine_wave'] = sine_wave
                test_groupdf['cosine_wave'] = cosine_wave
                newdata_test = pd.concat([newdata_test, test_groupdf])

            X_test_tuned = newdata_test.copy()

        else:
            new_features = new_features + ['sine_wave', 'cosine_wave']

            num_samples = len(X_train_tuned)
            frequency = 2 * np.pi / 365  # Adjust the frequency as needed
            days_since_start = np.arange(num_samples)
            sine_wave = np.sin(frequency * days_since_start)
            cosine_wave = np.cos(frequency * days_since_start)
            sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
            # Concatenate the sine and cosine waves with the scaled X DataFrame
            X_train_tuned = pd.concat([X_train_tuned, sine_cosine_df], axis=1)

            test_num_samples = len(X_test_tuned)
            start = num_samples
            days_since_start = np.arange(start, start + test_num_samples, 1)
            sine_wave = np.sin(frequency * days_since_start)
            cosine_wave = np.cos(frequency * days_since_start)
            sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
            # Concatenate the sine and cosine waves with the scaled X DataFrame
            X_test_tuned = pd.concat([X_test_tuned, sine_cosine_df], axis=1)

    # model
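    # The sine/cosine pair above is one Fourier term with an annual period:
    # sin(2*pi*t/365) and cos(2*pi*t/365) together let a linear model fit a
    # yearly cycle of arbitrary phase. A self-contained sketch (illustrative
    # only; not called anywhere in this page):
    def _fourier_pair_demo(num_samples=10, period=365):
        t = np.arange(num_samples)
        freq = 2 * np.pi / period
        return np.sin(freq * t), np.cos(freq * t)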
    if selected_options:
        new_features = new_features + selected_options
    if is_panel:
        inp_vars_str = " + ".join(new_features)
        new_features = list(set(new_features))
        # X_train_tuned.to_csv("Test/X_train_tuned.csv",index=False)
        # st.write(X_train_tuned[['total_approved_accounts_revenue'] + new_features].dtypes)
        # st.write(X_train_tuned[['total_approved_accounts_revenue', panel_col] + new_features].isna().sum())
        md_str = target_col + " ~ " + inp_vars_str
        md_tuned = smf.mixedlm(md_str,
                               data=X_train_tuned[[target_col] + new_features],
                               groups=X_train_tuned[panel_col])
        model_tuned = md_tuned.fit()

        # plot actual vs predicted for the original model and the tuned model
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train,
                                                                                 model.fittedvalues, model,
                                                                                 target_column=sel_target_col,
                                                                                 is_panel=True)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(X_train_tuned[date_col],
                                                                                             X_train_tuned[target_col],
                                                                                             model_tuned.fittedvalues,
                                                                                             model_tuned,
                                                                                             target_column=sel_target_col,
                                                                                             is_panel=True)

    else:
        new_features = list(set(new_features))
        # st.write(new_features)
        model_tuned = sm.OLS(y_train, X_train_tuned[new_features]).fit()
        # st.write(X_train_tuned.columns)
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date[:130], y_train,
                                                                                 model.predict(X_train[features_set]), model,
                                                                                 target_column=sel_target_col)
        metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(date[:130], y_train,
                                                                                             model_tuned.predict(X_train_tuned),
                                                                                             model_tuned,
                                                                                             target_column=sel_target_col)

    # st.write(metrics_table_tuned)
    mape = np.round(metrics_table.iloc[0, 1], 2)
    r2 = np.round(metrics_table.iloc[1, 1], 2)
    adjr2 = np.round(metrics_table.iloc[2, 1], 2)

    mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
    r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
    adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)

    parameters_ = st.columns(3)
    with parameters_[0]:
        st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
    with parameters_[1]:
        st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
    with parameters_[2]:
        st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')
    st.write(model_tuned.summary())

    X_train_tuned[date_col] = X_train[date_col]
    X_test_tuned[date_col] = X_test[date_col]
    X_train_tuned[target_col] = y_train
    X_test_tuned[target_col] = y_test

    st.header('2.2 Actual vs. Predicted Plot')
    # if is_panel:
    #     metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train),
    #                                                                              model, target_column='Revenue', is_panel=True)
    # else:
    #     metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train), model, target_column='Revenue')
    if is_panel:
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train_tuned[date_col],
                                                                                 X_train_tuned[target_col],
                                                                                 model_tuned.fittedvalues, model_tuned,
                                                                                 target_column=sel_target_col,
                                                                                 is_panel=True)
    else:
        metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train_tuned[date_col],
                                                                                 X_train_tuned[target_col],
                                                                                 model_tuned.predict(X_train_tuned[new_features]),
                                                                                 model_tuned,
                                                                                 target_column=sel_target_col,
                                                                                 is_panel=False)
    # plot_actual_vs_predicted(X_train[date_col], y_train,
    #                          model.fittedvalues, model,
    #                          target_column='Revenue',
    #                          is_panel=is_panel)

    st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)

    st.markdown('## 2.3 Residual Analysis')
    if is_panel:
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, model_tuned.fittedvalues, X_train_tuned)
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, model_tuned.fittedvalues)
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, model_tuned.fittedvalues)
            st.pyplot(fig)
    else:
        columns = st.columns(2)
        with columns[0]:
            fig = plot_residual_predicted(y_train, model_tuned.predict(X_train_tuned[new_features]), X_train)
            st.plotly_chart(fig)

        with columns[1]:
            st.empty()
            fig = qqplot(y_train, model_tuned.predict(X_train_tuned[new_features]))
            st.plotly_chart(fig)

        with columns[0]:
            fig = residual_distribution(y_train, model_tuned.predict(X_train_tuned[new_features]))
            st.pyplot(fig)

    st.session_state['is_tuned_model'][target_col] = True
    # Sprint4 - save the tuned model in a dict
    st.session_state['Model_Tuned'][sel_model + "__" + target_col] = {
        "Model_object": model_tuned,
        'feature_set': new_features,
        'X_train_tuned': X_train_tuned,
        'X_test_tuned': X_test_tuned
    }
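# The tuned panel model above is a random-intercept mixed LM: fixed effects for
# the features in md_str plus one intercept per panel via groups=. A minimal
# statsmodels sketch on made-up data (illustrative only; not called anywhere in
# this page):
def _mixedlm_demo():
    rng = np.random.default_rng(0)
    demo = pd.DataFrame({
        'y': rng.normal(size=40),
        'x': rng.normal(size=40),
        'panel': ['a'] * 20 + ['b'] * 20,
    })
    return smf.mixedlm('y ~ x', data=demo, groups=demo['panel']).fit()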
# Pending
# if st.session_state['build_tuned_model']==True:
if st.session_state['Model_Tuned'] is not None:
    if st.checkbox('Use this model to build response curves', key='save_model'):
        # save_model = st.button('Use this model to build response curves', key='saved_tuned_model')
        # if save_model:
        st.session_state["is_tuned_model"][target_col] = True
        with open("tuned_model.pkl", "wb") as f:
            # pickle.dump(st.session_state['tuned_model'], f)
            pickle.dump(st.session_state['Model_Tuned'], f)  # Sprint4

        # X_test_tuned.to_csv("Test/X_test_tuned_final.csv", index=False)
        # X_train_tuned.to_csv("Test/X_train_tuned.csv", index=False)
        st.success(sel_model + "__" + target_col + ' Tuned saved!')


# if is_panel:
#     # st.session_state["tuned_model_features"] = new_features
#     with open("tuned_model.pkl", "wb") as f:
#         # pickle.dump(st.session_state['tuned_model'], f)
#         pickle.dump(st.session_state['Model_Tuned'], f)  # Sprint4
#     st.success(sel_model + "__" + target_col + ' Tuned saved!')

# raw_data=df[features_set]
# columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
# raw_data.columns=columns_raw
# columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
# raw_data=raw_data[columns_media]

# raw_data['Date']=list(df.index)

# spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
# spends_df=df[spends_var]
# spends_df['Week']=list(df.index)


# j=0
# X1=X.copy()
# col=X1.columns
# for i in model.params.values:
#     X1[col[j]]=X1.iloc[:,j]*i
#     j+=1
# contribution_df=X1
# contribution_df['Date']=list(df.index)
# excel_file='Overview_data.xlsx'

# with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
#     raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
#     spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
#     contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')
pages/6_Model_Result_Overview.py
ADDED
@@ -0,0 +1,348 @@
'''
MMO Build Sprint 3
additions : contributions calculated using tuned Mixed LM model
pending : contributions calculations using - 1. not tuned Mixed LM model, 2. tuned OLS model, 3. not tuned OLS model

MMO Build Sprint 4
additions : response metrics selection
pending : contributions calculations using - 1. not tuned Mixed LM model, 2. tuned OLS model, 3. not tuned OLS model
'''

import streamlit as st
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import pickle


from utilities_with_panel import (set_header,
                                  overview_test_data_prep_panel,
                                  overview_test_data_prep_nonpanel,
                                  initialize_data,
                                  load_local_css,
                                  create_channel_summary,
                                  create_contribution_pie,
                                  create_contribuion_stacked_plot,
                                  create_channel_spends_sales_plot,
                                  format_numbers,
                                  channel_name_formating)

import plotly.graph_objects as go
import streamlit_authenticator as stauth
import yaml
from yaml import SafeLoader
import time

st.set_page_config(layout='wide')
load_local_css('styles.css')
set_header()


def get_random_effects(media_data, panel_col, mdf):
    random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])

    for i, market in enumerate(media_data[panel_col].unique()):
        print(i, end='\r')
        intercept = mdf.random_effects[market].values[0]
        random_eff_df.loc[i, 'random_effect'] = intercept
        random_eff_df.loc[i, panel_col] = market

    return random_eff_df


def process_train_and_test(train, test, features, panel_col, target_col):
    X1 = train[features]

    ss = MinMaxScaler()
    X1 = pd.DataFrame(ss.fit_transform(X1), columns=X1.columns)

    X1[panel_col] = train[panel_col]
    X1[target_col] = train[target_col]

    if test is not None:
        X2 = test[features]
        X2 = pd.DataFrame(ss.transform(X2), columns=X2.columns)
        X2[panel_col] = test[panel_col]
        X2[target_col] = test[target_col]
        return X1, X2
    return X1

def mdf_predict(X_df, mdf, random_eff_df):
    X = X_df.copy()
    X = pd.merge(X, random_eff_df[[panel_col, 'random_effect']], on=panel_col, how='left')
    X['pred_fixed_effect'] = mdf.predict(X)

    X['pred'] = X['pred_fixed_effect'] + X['random_effect']
    X.to_csv('Test/merged_df_contri.csv', index=False)
    X.drop(columns=['pred_fixed_effect', 'random_effect'], inplace=True)

    return X
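# mdf_predict above decomposes a MixedLM prediction into its two parts:
# statsmodels' MixedLMResults.predict() returns only the fixed-effect component
# (X @ beta), so each row's panel-specific random intercept is merged in and
# added back by hand: y_hat = X @ beta_fixed + u_panel. A hedged sketch of the
# same idea (hypothetical names, illustrative only):
# fixed = mdf.predict(X)                                      # fixed effects only
# u = random_eff_df.set_index(panel_col)['random_effect']
# y_hat = fixed + X[panel_col].map(u).values                  # add panel intercepts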
target = 'Revenue'

# is_panel=False
# is_panel = st.session_state['is_panel']
panel_col = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1']][0]  # set the panel column
date_col = 'date'

# st.write(media_data)

is_panel = True if len(panel_col) > 0 else False

# panel_col='markets'
date_col = 'date'

# Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics'] != []:
    sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
    target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
else:
    sel_target_col = 'Total Approved Accounts - Revenue'
    target_col = 'total_approved_accounts_revenue'

# Sprint4 - Look through all saved tuned models, only show saved models of the sel resp metric (target_col)
# saved_models = st.session_state['saved_model_names']
# Sprint4 - get the model obj of the selected model
# st.write(sel_model_dict)

# Sprint3 - Contribution
if is_panel:
    # read tuned mixedLM model
    # if st.session_state["tuned_model"] is not None :

    if st.session_state["is_tuned_model"][target_col] == True:  # Sprint4
        with open("tuned_model.pkl", 'rb') as file:
            model_dict = pickle.load(file)
        saved_models = list(model_dict.keys())
        required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
        sel_model = st.selectbox("Select the model to review", required_saved_models)
        sel_model_dict = model_dict[sel_model + "__" + target_col]

        # model=st.session_state["tuned_model"]
        # X_train=st.session_state["X_train_tuned"]
        # X_test=st.session_state["X_test_tuned"]
        # best_feature_set=st.session_state["tuned_model_features"]
        model = sel_model_dict["Model_object"]
        X_train = sel_model_dict["X_train_tuned"]
        X_test = sel_model_dict["X_test_tuned"]
        best_feature_set = sel_model_dict["feature_set"]

        # st.write("features", best_feature_set)
        # st.write(X_test.columns)

    else:  # if non tuned model to be used # Pending
        with open("best_models.pkl", 'rb') as file:
            model_dict = pickle.load(file)
        saved_models = list(model_dict.keys())
        required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
        sel_model = st.selectbox("Select the model to review", required_saved_models)
        sel_model_dict = model_dict[sel_model + "__" + target_col]
        model = st.session_state["base_model"]
        X_train = st.session_state['X_train']
        X_test = st.session_state['X_test']
        # y_train = st.session_state['y_train']
        # y_test = st.session_state['y_test']
        best_feature_set = st.session_state['base_model_feature_set']
        # st.write(best_feature_set)
        # st.write(X_test.columns)

    # Calculate contributions

    with open("data_import.pkl", "rb") as f:
        data = pickle.load(f)

    # Accessing the loaded objects
    st.session_state['orig_media_data'] = data["final_df"]

    st.session_state['orig_media_data'].columns = [col.lower().replace('.', '_').replace('@', '_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['orig_media_data'].columns]

    media_data = st.session_state["media_data"]


    # st.session_state['orig_media_data']=st.session_state["media_data"]

    # st.write(media_data)

    contri_df = pd.DataFrame()

    y = []
    y_pred = []

    random_eff_df = get_random_effects(media_data, panel_col, model)
    random_eff_df['fixed_effect'] = model.fe_params['Intercept']
    random_eff_df['panel_effect'] = random_eff_df['random_effect'] + random_eff_df['fixed_effect']
    # random_eff_df.to_csv("Test/random_eff_df_contri.csv", index=False)

    coef_df = pd.DataFrame(model.fe_params)
    coef_df.columns = ['coef']

    # coef_df.reset_index().to_csv("Test/coef_df_contri1.csv",index=False)
    # print(model.fe_params)

    x_train_contribution = X_train.copy()
    x_test_contribution = X_test.copy()

    # preprocessing not needed since X_train is already preprocessed
    # X1, X2 = process_train_and_test(x_train_contribution, x_test_contribution, best_feature_set, panel_col, target_col)
    # x_train_contribution[best_feature_set] = X1[best_feature_set]
    # x_test_contribution[best_feature_set] = X2[best_feature_set]

    x_train_contribution = mdf_predict(x_train_contribution, model, random_eff_df)
    x_test_contribution = mdf_predict(x_test_contribution, model, random_eff_df)

    x_train_contribution = pd.merge(x_train_contribution, random_eff_df[[panel_col, 'panel_effect']], on=panel_col,
                                    how='left')
    x_test_contribution = pd.merge(x_test_contribution, random_eff_df[[panel_col, 'panel_effect']], on=panel_col,
                                   how='left')

    inp_coef = coef_df['coef'][1:].tolist()  # 0th index is intercept

    for i in range(len(inp_coef)):
        x_train_contribution[str(best_feature_set[i]) + "_contr"] = inp_coef[i] * x_train_contribution[best_feature_set[i]]
        x_test_contribution[str(best_feature_set[i]) + "_contr"] = inp_coef[i] * x_test_contribution[best_feature_set[i]]

    x_train_contribution['sum_contributions'] = x_train_contribution.filter(regex="contr").sum(axis=1)
    x_train_contribution['sum_contributions'] = x_train_contribution['sum_contributions'] + x_train_contribution['panel_effect']

    x_test_contribution['sum_contributions'] = x_test_contribution.filter(regex="contr").sum(axis=1)
    x_test_contribution['sum_contributions'] = x_test_contribution['sum_contributions'] + x_test_contribution['panel_effect']
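    # At this point each "<feature>_contr" column holds coef * value, and
    # sum_contributions = panel_effect + the sum of those columns, which should
    # match the 'pred' column from mdf_predict (fixed effects + random
    # intercept) when the feature order lines up with model.fe_params. A quick
    # hedged consistency check one could run here (illustrative only):
    # assert np.allclose(x_train_contribution['sum_contributions'].astype(float),
    #                    x_train_contribution['pred'].astype(float))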
# # test
|
212 |
+
x_train_contribution.to_csv("Test/x_train_contribution.csv",index=False)
|
213 |
+
x_test_contribution.to_csv("Test/x_test_contribution.csv",index=False)
|
214 |
+
#
|
215 |
+
# st.session_state['orig_media_data'].to_csv("Test/transformed_data.csv",index=False)
|
216 |
+
# st.session_state['X_test_spends'].to_csv("Test/test_spends.csv",index=False)
|
217 |
+
# # st.write(st.session_state['orig_media_data'].columns)
|
218 |
+
|
219 |
+
st.write(date_col,panel_col)
|
220 |
+
# st.write(x_test_contribution)
|
221 |
+
|
222 |
+
overview_test_data_prep_panel(x_test_contribution, st.session_state['orig_media_data'], st.session_state['X_test_spends'],
|
223 |
+
date_col, panel_col, target_col)
|
224 |
+
|
225 |
+
else : # NON PANEL
|
226 |
+
if st.session_state["is_tuned_model"][target_col]==True: #Sprint4
|
227 |
+
with open("tuned_model.pkl", 'rb') as file:
|
228 |
+
model_dict = pickle.load(file)
|
229 |
+
saved_models = list(model_dict.keys())
|
230 |
+
required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
|
231 |
+
sel_model = st.selectbox("Select the model to review", required_saved_models)
|
232 |
+
sel_model_dict = model_dict[sel_model + "__" + target_col]
|
233 |
+
|
234 |
+
model=sel_model_dict["Model_object"]
|
235 |
+
X_train=sel_model_dict["X_train_tuned"]
|
236 |
+
X_test=sel_model_dict["X_test_tuned"]
|
237 |
+
best_feature_set=sel_model_dict["feature_set"]
|
238 |
+
|
239 |
+
else : #Sprint4
|
240 |
+
with open("best_models.pkl", 'rb') as file:
|
241 |
+
model_dict = pickle.load(file)
|
242 |
+
saved_models = list(model_dict.keys())
|
243 |
+
required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
|
244 |
+
sel_model = st.selectbox("Select the model to review", required_saved_models)
|
245 |
+
sel_model_dict = model_dict[sel_model + "__" + target_col]
|
246 |
+
|
247 |
+
model=sel_model_dict["Model_object"]
|
248 |
+
X_train=sel_model_dict["X_train"]
|
249 |
+
X_test=sel_model_dict["X_test"]
|
250 |
+
best_feature_set=sel_model_dict["feature_set"]
|
251 |
+
|
252 |
+
x_train_contribution = X_train.copy()
|
253 |
+
x_test_contribution = X_test.copy()
|
254 |
+
|
255 |
+
x_train_contribution['pred'] = model.predict(x_train_contribution[best_feature_set])
|
256 |
+
x_test_contribution['pred'] = model.predict(x_test_contribution[best_feature_set])
|
257 |
+
|
258 |
+
for num,i in enumerate(model.params.values):
|
259 |
+
col=best_feature_set[num]
|
260 |
+
x_train_contribution[col + "_contr"] = X_train[col] * i
|
261 |
+
x_test_contribution[col + "_contr"] = X_test[col] * i
|
262 |
+
|
263 |
+
x_test_contribution.to_csv("Test/x_test_contribution_non_panel.csv",index=False)
|
264 |
+
overview_test_data_prep_nonpanel(x_test_contribution, st.session_state['orig_media_data'].copy(), st.session_state['X_test_spends'].copy(), date_col, target_col)
|
265 |
+
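Annotation (not part of the uploaded file): both branches above implement the same additive decomposition, contribution_i = coef_i * x_i, with the prediction recovered as the intercept (or panel_effect in the mixed-effects branch) plus the row sum of contributions. A minimal, self-contained sketch of that invariant using a plain statsmodels OLS on synthetic data; every name below is illustrative:

import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.random((50, 2)), columns=["tv_spend", "search_spend"])
y = 3.0 + 2.0 * X["tv_spend"] + 0.5 * X["search_spend"] + rng.normal(0, 0.01, 50)

model = sm.OLS(y, sm.add_constant(X)).fit()

contrib = X.copy()
for feat, coef in model.params.drop("const").items():  # skip the intercept
    contrib[feat + "_contr"] = coef * X[feat]

# intercept + sum of per-feature contributions == model prediction
recon = model.params["const"] + contrib.filter(regex="_contr$").sum(axis=1)
assert np.allclose(recon, model.predict(sm.add_constant(X)))

Skipping the intercept mirrors inp_coef = coef_df['coef'][1:] in the panel branch; in the non-panel branch, model.params is enumerated positionally against best_feature_set, which only lines up if the saved model was fit without an added constant.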
+# for k, v in st.session_state.items():
+#     if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
+#         st.session_state[k] = v
+
+# authenticator = st.session_state.get('authenticator')
+
+# if authenticator is None:
+#     authenticator = load_authenticator()
+
+# name, authentication_status, username = authenticator.login('Login', 'main')
+# auth_status = st.session_state['authentication_status']
+
+# if auth_status:
+#     authenticator.logout('Logout', 'main')
+
+#     is_state_initiaized = st.session_state.get('initialized',False)
+#     if not is_state_initiaized:
+
+initialize_data(target_col)
+scenario = st.session_state['scenario']
+raw_df = st.session_state['raw_df']
+st.header('Overview of previous spends')
+
+# st.write(scenario.actual_total_spends)
+# st.write(scenario.actual_total_sales)
+columns = st.columns((1,1,3))
+
+with columns[0]:
+    st.metric(label='Spends', value=format_numbers(float(scenario.actual_total_spends)))
+    ###print(f"##################### {scenario.actual_total_sales} ##################")
+with columns[1]:
+    st.metric(label=target, value=format_numbers(float(scenario.actual_total_sales),include_indicator=False))
+
+
+actual_summary_df = create_channel_summary(scenario)
+actual_summary_df['Channel'] = actual_summary_df['Channel'].apply(channel_name_formating)
+
+columns = st.columns((2,1))
+with columns[0]:
+    with st.expander('Channel wise overview'):
+        st.markdown(actual_summary_df.style.set_table_styles(
+            [{
+                'selector': 'th',
+                'props': [('background-color', '#11B6BD')]
+            },
+            {
+                'selector' : 'tr:nth-child(even)',
+                'props' : [('background-color', '#11B6BD')]
+            }]).to_html(), unsafe_allow_html=True)
+
+st.markdown("<hr>",unsafe_allow_html=True)
+##############################
+
+st.plotly_chart(create_contribution_pie(scenario),use_container_width=True)
+st.markdown("<hr>",unsafe_allow_html=True)
+
+
+################################3
+st.plotly_chart(create_contribuion_stacked_plot(scenario),use_container_width=True)
+st.markdown("<hr>",unsafe_allow_html=True)
+#######################################
+
+selected_channel_name = st.selectbox('Channel', st.session_state['channels_list'] + ['non media'], format_func=channel_name_formating)
+selected_channel = scenario.channels.get(selected_channel_name,None)
+
+st.plotly_chart(create_channel_spends_sales_plot(selected_channel), use_container_width=True)
+
+st.markdown("<hr>",unsafe_allow_html=True)
+
+# elif auth_status == False:
+#     st.error('Username/Password is incorrect')
+
+# if auth_status != True:
+#     try:
+#         username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
+#         if username_forgot_pw:
+#             st.success('New password sent securely')
+#             # Random password to be transferred to user securely
+#         elif username_forgot_pw == False:
+#             st.error('Username not found')
+#     except Exception as e:
+#         st.error(e)
pages/7_Build_Response_Curves.py
ADDED
@@ -0,0 +1,185 @@
+import streamlit as st
+import plotly.express as px
+import numpy as np
+import plotly.graph_objects as go
+from utilities_with_panel import channel_name_formating, load_authenticator, initialize_data
+from sklearn.metrics import r2_score
+from collections import OrderedDict
+from classes import class_from_dict,class_to_dict
+import pickle
+import json
+from utilities import (
+    load_local_css,
+    set_header,
+    channel_name_formating,
+)
+
+for k, v in st.session_state.items():
+    if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
+        st.session_state[k] = v
+
+def s_curve(x,K,b,a,x0):
+    return K / (1 + b*np.exp(-a*(x-x0)))
+
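Annotation (not part of the uploaded file): s_curve above is a generalised logistic, S(x) = K / (1 + b * exp(-a * (x - x0))), where K sets the saturation ceiling, a the growth rate, x0 the inflection point and b a horizontal offset. A quick sanity check with illustrative values:

import numpy as np
K, b, a, x0 = 1.0, 1.0, 1.0, 0.0
K / (1 + b * np.exp(-a * (np.array([0.0, 1.0, 10.0]) - x0)))
# -> array([0.5, 0.73105858, 0.9999546])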
+def save_scenario(scenario_name):
+    """
+    Save the current scenario with the mentioned name in the session state
+
+    Parameters
+    ----------
+    scenario_name
+        Name of the scenario to be saved
+    """
+    if 'saved_scenarios' not in st.session_state:
+        st.session_state = OrderedDict()
+
+    #st.session_state['saved_scenarios'][scenario_name] = st.session_state['scenario'].save()
+    st.session_state['saved_scenarios'][scenario_name] = class_to_dict(st.session_state['scenario'])
+    st.session_state['scenario_input'] = ""
+    print(type(st.session_state['saved_scenarios']))
+    with open('../saved_scenarios.pkl', 'wb') as f:
+        pickle.dump(st.session_state['saved_scenarios'],f)
+
+
+def reset_curve_parameters():
+    del st.session_state['K']
+    del st.session_state['b']
+    del st.session_state['a']
+    del st.session_state['x0']
+
+def update_response_curve():
+    # st.session_state['rcs'][selected_channel_name]['K'] = st.session_state['K']
+    # st.session_state['rcs'][selected_channel_name]['b'] = st.session_state['b']
+    # st.session_state['rcs'][selected_channel_name]['a'] = st.session_state['a']
+    # st.session_state['rcs'][selected_channel_name]['x0'] = st.session_state['x0']
+    # rcs = st.session_state['rcs']
+    _channel_class = st.session_state['scenario'].channels[selected_channel_name]
+    _channel_class.update_response_curves({
+        'K' : st.session_state['K'],
+        'b' : st.session_state['b'],
+        'a' : st.session_state['a'],
+        'x0' : st.session_state['x0']})
+
+
+# authenticator = st.session_state.get('authenticator')
+# if authenticator is None:
+#     authenticator = load_authenticator()
+
+# name, authentication_status, username = authenticator.login('Login', 'main')
+# auth_status = st.session_state.get('authentication_status')
+
+# if auth_status == True:
+#     is_state_initiaized = st.session_state.get('initialized',False)
+#     if not is_state_initiaized:
+#         print("Scenario page state reloaded")
+
+# Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
+st.set_page_config(layout='wide')
+load_local_css('styles.css')
+set_header()
+
+if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics']!=[]:
+    sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
+    target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
+else :
+    sel_target_col = 'Total Approved Accounts - Revenue'
+    target_col = 'total_approved_accounts_revenue'
+
+initialize_data(target_col)
+
+st.subheader("Build response curves")
+
+channels_list = st.session_state['channels_list']
+selected_channel_name = st.selectbox('Channel', st.session_state['channels_list'] + ['Others'], format_func=channel_name_formating,on_change=reset_curve_parameters)
+
+rcs = {}
+for channel_name in channels_list:
+    rcs[channel_name] = st.session_state['scenario'].channels[channel_name].response_curve_params
+# rcs = st.session_state['rcs']
+
+
+if 'K' not in st.session_state:
+    st.session_state['K'] = rcs[selected_channel_name]['K']
+if 'b' not in st.session_state:
+    st.session_state['b'] = rcs[selected_channel_name]['b']
+if 'a' not in st.session_state:
+    st.session_state['a'] = rcs[selected_channel_name]['a']
+if 'x0' not in st.session_state:
+    st.session_state['x0'] = rcs[selected_channel_name]['x0']
+
+x = st.session_state['actual_input_df'][selected_channel_name].values
+y = st.session_state['actual_contribution_df'][selected_channel_name].values
+
+power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
+
+# fig = px.scatter(x, s_curve(x/10**power,
+#                             st.session_state['K'],
+#                             st.session_state['b'],
+#                             st.session_state['a'],
+#                             st.session_state['x0']))
+
+fig = px.scatter(x=x, y=y)
+fig.add_trace(go.Scatter(x=sorted(x), y=s_curve(sorted(x)/10**power,st.session_state['K'],
+                                                st.session_state['b'],
+                                                st.session_state['a'],
+                                                st.session_state['x0']),
+                         line=dict(color='red')))
+
+fig.update_layout(title_text="Response Curve",showlegend=False)
+fig.update_annotations(font_size=10)
+fig.update_xaxes(title='Spends')
+fig.update_yaxes(title=sel_target_col)
+
+st.plotly_chart(fig,use_container_width=True)
+
+r2 = r2_score(y, s_curve(x / 10**power,
+                         st.session_state['K'],
+                         st.session_state['b'],
+                         st.session_state['a'],
+                         st.session_state['x0']))
+
+st.metric('R2',round(r2,2))
+columns = st.columns(4)
+
+with columns[0]:
+    st.number_input('K',key='K',format="%0.5f")
+with columns[1]:
+    st.number_input('b',key='b',format="%0.5f")
+with columns[2]:
+    st.number_input('a',key='a',step=0.0001,format="%0.5f")
+with columns[3]:
+    st.number_input('x0',key='x0',format="%0.5f")
+
+
+st.button('Update parameters',on_click=update_response_curve)
+st.button('Reset parameters',on_click=reset_curve_parameters)
+scenario_name = st.text_input('Scenario name', key='scenario_input',placeholder='Scenario name',label_visibility='collapsed')
+st.button('Save', on_click=lambda : save_scenario(scenario_name),disabled=len(st.session_state['scenario_input']) == 0)
+
+file_name = st.text_input('rcs download file name', key='file_name_input',placeholder='file name',label_visibility='collapsed')
+st.download_button(
+    label="Download response curves",
+    data=json.dumps(rcs),
+    file_name=f"{file_name}.json",
+    mime="application/json",
+    disabled= len(file_name) == 0,
+)
+
+
+def s_curve_derivative(x, K, b, a, x0):
+    # Derivative of the S-curve function
+    return a * b * K * np.exp(-a * (x - x0)) / ((1 + b * np.exp(-a * (x - x0))) ** 2)
+
+# Parameters of the S-curve
+K = st.session_state['K']
+b = st.session_state['b']
+a = st.session_state['a']
+x0 = st.session_state['x0']
+
+# Optimized spend value obtained from the tool
+optimized_spend = st.number_input('value of x')  # Replace this with your optimized spend value
+
+# Calculate the slope at the optimized spend value
+slope_at_optimized_spend = s_curve_derivative(optimized_spend, K, b, a, x0)
+
+st.write("Slope ", slope_at_optimized_spend)
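Annotation (not part of the uploaded file): the closed-form slope used at the bottom of this new page can be cross-checked against a central finite difference; a standalone sketch with arbitrary parameter values follows. As an observation, save_scenario in this file assigns st.session_state = OrderedDict() when 'saved_scenarios' is missing, which replaces the whole session state rather than creating the key; the likely intent is st.session_state['saved_scenarios'] = OrderedDict().

import numpy as np

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

def s_curve_derivative(x, K, b, a, x0):
    return a * b * K * np.exp(-a * (x - x0)) / ((1 + b * np.exp(-a * (x - x0))) ** 2)

K, b, a, x0, x = 2.0, 1.5, 0.8, 3.0, 4.0
h = 1e-6
numeric = (s_curve(x + h, K, b, a, x0) - s_curve(x - h, K, b, a, x0)) / (2 * h)
assert abs(numeric - s_curve_derivative(x, K, b, a, x0)) < 1e-6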
pages/8_Scenario_Planner.py
CHANGED
@@ -23,34 +23,28 @@ import re
 import pandas as pd
 import plotly.express as px
 
-
+
 st.set_page_config(layout="wide")
 load_local_css("styles.css")
 set_header()
 
 for k, v in st.session_state.items():
-    if k not in ["logout", "login", "config"] and not k.startswith(
-        "FormSubmitter"
-    ):
+    if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
         st.session_state[k] = v
 # ======================================================== #
 # ======================= Functions ====================== #
 # ======================================================== #
 
 
-def optimize(key):
+def optimize(key, status_placeholder):
     """
     Optimize the spends for the sales
     """
 
     channel_list = [
-        key
-        for key, value in st.session_state["optimization_channels"].items()
-        if value
+        key for key, value in st.session_state["optimization_channels"].items() if value
     ]
-
-    # print(channel_list)
-    # print('@@@@@@@@')
+
     if len(channel_list) > 0:
        scenario = st.session_state["scenario"]
        if key.lower() == "media spends":
@@ -59,7 +53,8 @@ def optimize(key):
             result = st.session_state["scenario"].optimize(
                 st.session_state["total_spends_change"], channel_list
             )
-        elif key.lower() == "revenue":
+        # elif key.lower() == "revenue":
+        else:
             with status_placeholder:
                 with st.spinner("Optimizing"):
@@ -69,14 +64,11 @@
     for channel_name, modified_spends in result:
 
         st.session_state[channel_name] = numerize(
-            modified_spends
-            * scenario.channels[channel_name].conversion_rate,
+            modified_spends * scenario.channels[channel_name].conversion_rate,
             1,
         )
         prev_spends = (
-            st.session_state["scenario"]
-            .channels[channel_name]
-            .actual_total_spends
+            st.session_state["scenario"].channels[channel_name].actual_total_spends
         )
         st.session_state[f"{channel_name}_change"] = round(
             100 * (modified_spends - prev_spends) / prev_spends, 2
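Annotation (not part of the diff): every absolute input on this page is normalised to the same percent field via pct = ((modified / actual) - 1) * 100 and mapped back with abs = actual * (1 + pct / 100). A tiny round-trip check with illustrative numbers:

actual = 120_000.0
modified = 150_000.0
pct = ((modified / actual) - 1) * 100            # 25.0
assert abs(actual * (1 + pct / 100) - modified) < 1e-9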
@@ -105,15 +97,46 @@ def save_scenario(scenario_name):
         pickle.dump(st.session_state["saved_scenarios"], f)
 
 
+if "allow_spends_update" not in st.session_state:
+    st.session_state["allow_spends_update"] = True
+
+if "allow_sales_update" not in st.session_state:
+    st.session_state["allow_sales_update"] = True
+
+
+def update_sales_abs_slider():
+    actual_sales = _scenario.actual_total_sales
+    if validate_input(st.session_state["total_sales_change_abs_slider"]):
+        modified_sales = extract_number_for_string(
+            st.session_state["total_sales_change_abs_slider"]
+        )
+        st.session_state["total_sales_change"] = round(
+            ((modified_sales / actual_sales) - 1) * 100
+        )
+        st.session_state["total_sales_change_abs"] = numerize(modified_sales, 1)
+
+
 def update_sales_abs():
+    if (
+        st.session_state["total_sales_change_abs"]
+        in st.session_state["total_sales_change_abs_slider_options"]
+    ):
+        st.session_state["allow_sales_update"] = True
+    else:
+        st.session_state["allow_sales_update"] = False
+
     actual_sales = _scenario.actual_total_sales
-    if validate_input(st.session_state["total_sales_change_abs"]):
+    if (
+        validate_input(st.session_state["total_sales_change_abs"])
+        and st.session_state["allow_sales_update"]
+    ):
         modified_sales = extract_number_for_string(
             st.session_state["total_sales_change_abs"]
         )
         st.session_state["total_sales_change"] = round(
             ((modified_sales / actual_sales) - 1) * 100
         )
+        st.session_state["total_sales_change_abs_slider"] = numerize(modified_sales, 1)
 
 
 def update_sales():
@@ -122,32 +145,95 @@ def update_sales():
         * _scenario.actual_total_sales,
         1,
     )
+    st.session_state["total_sales_change_abs_slider"] = numerize(
+        (1 + st.session_state["total_sales_change"] / 100)
+        * _scenario.actual_total_sales,
+        1,
+    )
+
+
+def update_all_spends_abs_slider():
+    actual_spends = _scenario.actual_total_spends
+    if validate_input(st.session_state["total_spends_change_abs_slider"]):
+        modified_spends = extract_number_for_string(
+            st.session_state["total_spends_change_abs_slider"]
+        )
+        st.session_state["total_spends_change"] = round(
+            ((modified_spends / actual_spends) - 1) * 100
+        )
+        st.session_state["total_spends_change_abs"] = numerize(modified_spends, 1)
+
+    update_all_spends()
+
+
+# def update_all_spends_abs_slider():
+#     actual_spends = _scenario.actual_total_spends
+#     if validate_input(st.session_state["total_spends_change_abs_slider"]):
+#         print("#" * 100)
+#         print(st.session_state["total_spends_change_abs_slider"])
+#         print("#" * 100)
+
+#         modified_spends = extract_number_for_string(
+#             st.session_state["total_spends_change_abs_slider"]
+#         )
+#         st.session_state["total_spends_change"] = (
+#             (modified_spends / actual_spends) - 1
+#         ) * 100
+#         st.session_state["total_spends_change_abs"] = st.session_state[
+#             "total_spends_change_abs_slider"
+#         ]
+
+#         update_all_spends()
 
 
 def update_all_spends_abs():
+    if (
+        st.session_state["total_spends_change_abs"]
+        in st.session_state["total_spends_change_abs_slider_options"]
+    ):
+        st.session_state["allow_spends_update"] = True
+    else:
+        st.session_state["allow_spends_update"] = False
+
     actual_spends = _scenario.actual_total_spends
-    if validate_input(st.session_state["total_spends_change_abs"]):
+    if (
+        validate_input(st.session_state["total_spends_change_abs"])
+        and st.session_state["allow_spends_update"]
+    ):
         modified_spends = extract_number_for_string(
             st.session_state["total_spends_change_abs"]
         )
-        print(modified_spends)
-        print(actual_spends)
-
         st.session_state["total_spends_change"] = (
             (modified_spends / actual_spends) - 1
         ) * 100
+        st.session_state["total_spends_change_abs_slider"] = st.session_state[
+            "total_spends_change_abs"
+        ]
 
     update_all_spends()
 
 
+def update_spends():
+    st.session_state["total_spends_change_abs"] = numerize(
+        (1 + st.session_state["total_spends_change"] / 100)
+        * _scenario.actual_total_spends,
+        1,
+    )
+    st.session_state["total_spends_change_abs_slider"] = numerize(
+        (1 + st.session_state["total_spends_change"] / 100)
+        * _scenario.actual_total_spends,
+        1,
+    )
+
+    update_all_spends()
+
+
 def update_all_spends():
     """
     Updates spends for all the channels with the given overall spends change
     """
     percent_change = st.session_state["total_spends_change"]
-
-        (1 + percent_change / 100) * _scenario.actual_total_spends, 1
-    )
+
     for channel_name in st.session_state["channels_list"]:
         channel = st.session_state["scenario"].channels[channel_name]
         current_spends = channel.actual_total_spends
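Annotation (not part of the diff): the allow_sales_update / allow_spends_update guards added above exist because st.select_slider only accepts values drawn from its options list, and those options are numerized strings on a fixed grid, so a hand-typed absolute value must first pass an exact membership check. A sketch of that quantisation, assuming the numerize package's usual numerize(value, decimals) call as used throughout this file:

from numerize.numerize import numerize  # assumed import path, matching the app's usage

options = [numerize(v, 1) for v in range(100_000, 150_001, 10_000)]
# -> ['100K', '110K', '120K', '130K', '140K', '150K']
print("125K" in options)  # False: a typed value off the grid must be rejected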
@@ -199,16 +285,10 @@ def update_data(channel_name):
     """
 
     if validate_input(st.session_state[channel_name]):
-        modified_spends = extract_number_for_string(
-            st.session_state[channel_name]
-        )
+        modified_spends = extract_number_for_string(st.session_state[channel_name])
         prev_spends = (
-            st.session_state["scenario"]
-            .channels[channel_name]
-            .actual_total_spends
-            * st.session_state["scenario"]
-            .channels[channel_name]
-            .conversion_rate
+            st.session_state["scenario"].channels[channel_name].actual_total_spends
+            * st.session_state["scenario"].channels[channel_name].conversion_rate
         )
         st.session_state[f"{channel_name}_change"] = round(
             100 * (modified_spends - prev_spends) / prev_spends, 2
@@ -216,9 +296,7 @@ def update_data(channel_name):
         st.session_state["scenario"].update(
             channel_name,
             modified_spends
-            / st.session_state["scenario"]
-            .channels[channel_name]
-            .conversion_rate,
+            / st.session_state["scenario"].channels[channel_name].conversion_rate,
         )
         # st.session_state['scenario'].update(channel_name, modified_spends)
     # else:
@@ -249,31 +327,55 @@ def select_all_channels_for_optimization():
         st.session_state[f"{channel_name}_selected"] = st.session_state[
             "optimze_all_channels"
         ]
-        st.session_state["optimization_channels"][channel_name] = st.session_state["optimze_all_channels"]
-
-
+        st.session_state["optimization_channels"][channel_name] = st.session_state[
+            "optimze_all_channels"
+        ]
 
 
 def update_penalty():
     """
     Updates the penalty flag for sales calculation
     """
-    st.session_state["scenario"].update_penalty(
-        st.session_state["apply_penalty"]
-    )
+    st.session_state["scenario"].update_penalty(st.session_state["apply_penalty"])
 
 
-def reset_scenario():
+def reset_scenario(panel_selected, file_selected, updated_rcs):
     # #print(st.session_state['default_scenario_dict'])
     # st.session_state['scenario'] = class_from_dict(st.session_state['default_scenario_dict'])
     # for channel in st.session_state['scenario'].channels.values():
     #     st.session_state[channel.name] = float(channel.actual_total_spends * channel.conversion_rate)
-    initialize_data()
+    # initialize_data()
+
+    if panel_selected == "Aggregated":
+        initialize_data(
+            panel=panel_selected,
+            target_file=file_selected,
+            updated_rcs=updated_rcs,
+            metrics=metrics_selected,
+        )
+        panel = None
+    else:
+        initialize_data(
+            panel=panel_selected,
+            target_file=file_selected,
+            updated_rcs=updated_rcs,
+            metrics=metrics_selected,
+        )
+
     for channel_name in st.session_state["channels_list"]:
         st.session_state[f"{channel_name}_selected"] = False
         st.session_state[f"{channel_name}_change"] = 0
     st.session_state["optimze_all_channels"] = False
 
+    st.session_state["total_sales_change"] = 0
+
+    update_spends()
+    update_sales()
+
+    reset_inputs()
+
+    # st.rerun()
+
 
 def format_number(num):
     if num >= 1_000_000:
@@ -305,9 +407,7 @@ def summary_plot(data, x, y, title, text_column):
         hovertemplate="%{x:.2s}",
     )
 
-    fig.update_layout(
-        xaxis_title=x, yaxis_title="Channel Name", showlegend=False
-    )
+    fig.update_layout(xaxis_title=x, yaxis_title="Channel Name", showlegend=False)
     return fig
 
 
@@ -342,27 +442,21 @@ def calculate_rgba(
         relative_position = (current_channel_spends - start_value) / (
             left_value - start_value
         )
-        alpha = 0.8 - (
-            0.6 * relative_position
-        )  # Alpha decreases from start to end
+        alpha = 0.8 - (0.6 * relative_position)  # Alpha decreases from start to end
 
     elif left_value < current_channel_spends <= right_value:
         color = "green"
         relative_position = (current_channel_spends - left_value) / (
             right_value - left_value
         )
-        alpha = 0.8 - (
-            0.6 * relative_position
-        )  # Alpha decreases from start to end
+        alpha = 0.8 - (0.6 * relative_position)  # Alpha decreases from start to end
 
     elif right_value < current_channel_spends <= end_value:
         color = "red"
         relative_position = (current_channel_spends - right_value) / (
             end_value - right_value
         )
-        alpha = 0.2 + (
-            0.6 * relative_position
-        )  # Alpha increases from start to end
+        alpha = 0.2 + (0.6 * relative_position)  # Alpha increases from start to end
 
     else:
         # Default case, if the spends are outside the defined ranges
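Annotation (not part of the diff): calculate_rgba maps a channel's relative position within a bin (0 at one edge, 1 at the other) linearly onto an opacity between 0.8 and 0.2. The same mapping in isolation; fade_alpha is a hypothetical helper, not the app's function:

def fade_alpha(position, lo=0.2, hi=0.8, increasing=False):
    # position in [0, 1] within the bin; clamp for safety
    position = min(max(position, 0.0), 1.0)
    return lo + (hi - lo) * position if increasing else hi - (hi - lo) * position

print(fade_alpha(0.0), fade_alpha(1.0))   # 0.8 0.2
print(fade_alpha(0.5, increasing=True))   # 0.5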
@@ -432,9 +526,7 @@ def plot_response_curves():
 
         for index in range(len(x_plot)):
             marginal_roi.append(
-                a
-                * y[index]
-                * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
+                a * y[index] * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
             )
 
         x = (
@@ -466,9 +558,7 @@ def plot_response_curves():
             st.session_state["scenario"].channels[col].modified_total_spends
             * st.session_state["scenario"].channels[col].conversion_rate
         )
-        y_optimal = (
-            st.session_state["scenario"].channels[col].modified_total_sales
-        )
+        y_optimal = st.session_state["scenario"].channels[col].modified_total_sales
 
         # if col == "Paid_social_others":
         #     debug_temp(x_optimal * x_actual / x_actual.sum(), power, K, b, a, x0)
@@ -576,7 +666,7 @@ def plot_response_curves():
     fig.update_layout(
         # height=1000,
         # width=1000,
-        title_text="Response Curves (X: Spends Vs Y:
+        title_text=f"Response Curves (X: Spends Vs Y: {target})",
         showlegend=False,
         shapes=shapes,
     )
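Annotation (not part of the diff): the expression a * y * (1 - y / K) above is the logistic derivative rewritten in terms of the curve value itself: for y = K / (1 + b * exp(-a * (x - x0))), dy/dx = a * y * (1 - y / K), which is why no exponentials are needed when computing marginal ROI along an already-evaluated curve. A numerical confirmation with arbitrary parameters:

import numpy as np

K, b, a, x0 = 5.0, 2.0, 0.7, 1.0
x = np.linspace(0.1, 10, 7)
y = K / (1 + b * np.exp(-a * (x - x0)))

closed_form = a * y * (1 - y / K)
exp_form = a * b * K * np.exp(-a * (x - x0)) / (1 + b * np.exp(-a * (x - x0))) ** 2
assert np.allclose(closed_form, exp_form)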
@@ -718,12 +808,144 @@ authenticator = stauth.Authenticate(
 st.session_state["authenticator"] = authenticator
 name, authentication_status, username = authenticator.login("Login", "main")
 auth_status = st.session_state.get("authentication_status")
+
+import os
+import glob
+
+
+def get_excel_names(directory):
+    # Create a list to hold the final parts of the filenames
+    last_portions = []
+
+    # Patterns to match Excel files (.xlsx and .xls) that contain @#
+    patterns = [
+        os.path.join(directory, "*@#*.xlsx"),
+        os.path.join(directory, "*@#*.xls"),
+    ]
+
+    # Process each pattern
+    for pattern in patterns:
+        files = glob.glob(pattern)
+
+        # Extracting the last portion after @# for each file
+        for file in files:
+            base_name = os.path.basename(file)
+            last_portion = base_name.split("@#")[-1]
+            last_portion = last_portion.replace(".xlsx", "").replace(
+                ".xls", ""
+            )  # Removing extensions
+            last_portions.append(last_portion)
+
+    return last_portions
+
+
+def name_formating(channel_name):
+    # Replace underscores with spaces
+    name_mod = channel_name.replace("_", " ")
+
+    # Capitalize the first letter of each word
+    name_mod = name_mod.title()
+
+    return name_mod
+
+
+@st.cache_resource(show_spinner=False)
+def panel_fetch(file_selected):
+    raw_data_mmm_df = pd.read_excel(file_selected, sheet_name="RAW DATA MMM")
+
+    if "Panel" in raw_data_mmm_df.columns:
+        panel = list(set(raw_data_mmm_df["Panel"]))
+    else:
+        raw_data_mmm_df = None
+        panel = None
+
+    return panel
+
+
+def reset_inputs():
+    if "total_spends_change_abs" in st.session_state:
+        del st.session_state.total_spends_change_abs
+    if "total_spends_change" in st.session_state:
+        del st.session_state.total_spends_change
+    if "total_spends_change_abs_slider" in st.session_state:
+        del st.session_state.total_spends_change_abs_slider
+
+    if "total_sales_change_abs" in st.session_state:
+        del st.session_state.total_sales_change_abs
+    if "total_sales_change" in st.session_state:
+        del st.session_state.total_sales_change
+    if "total_sales_change_abs_slider" in st.session_state:
+        del st.session_state.total_sales_change_abs_slider
+
+    st.session_state["initialized"] = False
+
+
 if auth_status == True:
     authenticator.logout("Logout", "main")
+
+    st.header("Simulation")
+    col1, col2 = st.columns([1, 1])
+
+    # Response Metrics
+    directory = "metrics_level_data"
+    metrics_list = get_excel_names(directory)
+    metrics_selected = col1.selectbox(
+        "Response Metrics",
+        metrics_list,
+        format_func=name_formating,
+        index=0,
+        on_change=reset_inputs,
+    )
+
+    # Target
+    target = name_formating(metrics_selected)
+
+    file_selected = (
+        f".\metrics_level_data\Overview_data_test_panel@#{metrics_selected}.xlsx"
+    )
+
+    # Panel List
+    panel_list = panel_fetch(file_selected)
+
+    # Panel Selected
+    panel_selected = col2.selectbox(
+        "Panel",
+        ["Aggregated"] + panel_list,
+        index=0,
+        on_change=reset_inputs,
+    )
+
+    if "update_rcs" in st.session_state:
+        updated_rcs = st.session_state["update_rcs"]
+    else:
+        updated_rcs = None
+
+    if "first_time" not in st.session_state:
+        st.session_state["first_time"] = True
+
+    # Check if state is initiaized
     is_state_initiaized = st.session_state.get("initialized", False)
-    if not is_state_initiaized:
-        initialize_data()
+    if not is_state_initiaized or st.session_state["first_time"]:
+        # initialize_data()
+        if panel_selected == "Aggregated":
+            initialize_data(
+                panel=panel_selected,
+                target_file=file_selected,
+                updated_rcs=updated_rcs,
+                metrics=metrics_selected,
+            )
+            panel = None
+        else:
+            initialize_data(
+                panel=panel_selected,
+                target_file=file_selected,
+                updated_rcs=updated_rcs,
+                metrics=metrics_selected,
+            )
+        st.session_state["initialized"] = True
+        st.session_state["first_time"] = False
 
+    # Channels List
     channels_list = st.session_state["channels_list"]
 
     # ======================================================== #
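Annotation (not part of the diff): the @# token in the workbook names acts as a separator between a fixed prefix and the response-metric name, so get_excel_names can rebuild the metric list from the directory alone. The parse in isolation, on a hypothetical file name:

import os

base_name = os.path.basename("metrics_level_data/Overview_data_test_panel@#revenue.xlsx")
metric = base_name.split("@#")[-1].replace(".xlsx", "").replace(".xls", "")
print(metric)  # revenue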
@@ -731,12 +953,16 @@ if auth_status == True:
     # ======================================================== #
 
     # print(list(st.session_state.keys()))
-
-    st.header("Simulation")
     main_header = st.columns((2, 2))
     sub_header = st.columns((1, 1, 1, 1))
     _scenario = st.session_state["scenario"]
 
+    if "total_spends_change" not in st.session_state:
+        st.session_state.total_spends_change = 0
+
+    if "total_sales_change" not in st.session_state:
+        st.session_state.total_sales_change = 0
+
     if "total_spends_change_abs" not in st.session_state:
         st.session_state["total_spends_change_abs"] = numerize(
             _scenario.actual_total_spends, 1
@@ -747,6 +973,16 @@ if auth_status == True:
             _scenario.actual_total_sales, 1
         )
 
+    if "total_spends_change_abs_slider" not in st.session_state:
+        st.session_state.total_spends_change_abs_slider = numerize(
+            _scenario.actual_total_spends, 1
+        )
+
+    if "total_sales_change_abs_slider" not in st.session_state:
+        st.session_state.total_sales_change_abs_slider = numerize(
+            _scenario.actual_total_sales, 1
+        )
+
     with main_header[0]:
         st.subheader("Actual")
 
@@ -754,9 +990,7 @@ if auth_status == True:
         st.subheader("Simulated")
 
     with sub_header[0]:
-        st.metric(
-            label="Spends", value=format_numbers(_scenario.actual_total_spends)
-        )
+        st.metric(label="Spends", value=format_numbers(_scenario.actual_total_spends))
 
     with sub_header[1]:
         st.metric(
@@ -782,33 +1016,49 @@ if auth_status == True:
             delta=numerize(_scenario.delta_sales, 1),
         )
 
-    with st.expander("Channel Spends Simulator"):
+    with st.expander("Channel Spends Simulator", expanded=True):
         _columns1 = st.columns((2, 2, 1, 1))
         with _columns1[0]:
-
             optimization_selection = st.selectbox(
-                "Optimize", options=["Media Spends",
+                "Optimize", options=["Media Spends", target], key="optimization_key"
             )
+
         with _columns1[1]:
             st.markdown("#")
+            # if st.checkbox(
+            #     label="Optimize all Channels",
+            #     key="optimze_all_channels",
+            #     value=False,
+            #     # on_change=select_all_channels_for_optimization,
+            # ):
+            #     select_all_channels_for_optimization()
+
             st.checkbox(
                 label="Optimize all Channels",
-                key=
+                key="optimze_all_channels",
                 value=False,
                 on_change=select_all_channels_for_optimization,
             )
 
         with _columns1[2]:
             st.markdown("#")
-            st.button(
-                "Optimize",
-                on_click=optimize,
-                args=(st.session_state["optimization_key"]),
-                use_container_width=True,
-            )
+            # st.button(
+            #     "Optimize",
+            #     on_click=optimize,
+            #     args=(st.session_state["optimization_key"]),
+            #     use_container_width=True,
+            # )
+
+            optimize_placeholder = st.empty()
 
         with _columns1[3]:
             st.markdown("#")
-            st.button(
+            st.button(
+                "Reset",
+                on_click=reset_scenario,
+                args=(panel_selected, file_selected, updated_rcs),
+                use_container_width=True,
+            )
 
         _columns2 = st.columns((2, 2, 2))
         if st.session_state["optimization_key"] == "Media Spends":
@@ -819,37 +1069,90 @@ if auth_status == True:
                 # label_visibility="collapsed",
                 on_change=update_all_spends_abs,
             )
-            with _columns2[1]:
 
+            with _columns2[1]:
                 st.number_input(
-                    "Percent",
-                    key=
+                    "Percent Change",
+                    key="total_spends_change",
+                    min_value=-50,
+                    max_value=50,
                     step=1,
-                    on_change=
+                    on_change=update_spends,
+                )
+
+            with _columns2[2]:
+                min_value = round(_scenario.actual_total_spends * 0.5)
+                max_value = round(_scenario.actual_total_spends * 1.5)
+                st.session_state["total_spends_change_abs_slider_options"] = [
+                    numerize(value, 1)
+                    for value in range(min_value, max_value + 1, int(1e4))
+                ]
+
+                st.select_slider(
+                    "Absolute Slider",
+                    options=st.session_state["total_spends_change_abs_slider_options"],
+                    key="total_spends_change_abs_slider",
+                    on_change=update_all_spends_abs_slider,
                 )
-        elif st.session_state["optimization_key"] == "Revenue":
-            with _columns2[0]:
 
+        elif st.session_state["optimization_key"] == target:
+            with _columns2[0]:
                 sales_input = st.text_input(
                     "Absolute",
                     key="total_sales_change_abs",
                     on_change=update_sales_abs,
                 )
+
            with _columns2[1]:
                st.number_input(
-                    "Percent
-                    key=
+                    "Percent Change",
+                    key="total_sales_change",
+                    min_value=-50,
+                    max_value=50,
                     step=1,
                     on_change=update_sales,
                 )
+            with _columns2[2]:
+                min_value = round(_scenario.actual_total_sales * 0.5)
+                max_value = round(_scenario.actual_total_sales * 1.5)
+                st.session_state["total_sales_change_abs_slider_options"] = [
+                    numerize(value, 1)
+                    for value in range(min_value, max_value + 1, int(1e5))
+                ]
+
+                st.select_slider(
+                    "Absolute Slider",
+                    options=st.session_state["total_sales_change_abs_slider_options"],
+                    key="total_sales_change_abs_slider",
+                    on_change=update_sales_abs_slider,
+                )
 
-
-        st.
+        if (
+            not st.session_state["allow_sales_update"]
+            and optimization_selection == target
+        ):
+            st.warning("Invalid Input")
+
+        if (
+            not st.session_state["allow_spends_update"]
+            and optimization_selection == "Media Spends"
+        ):
+            st.warning("Invalid Input")
+
+        status_placeholder = st.empty()
+
+        # if optimize_placeholder.button("Optimize", use_container_width=True):
+        #     optimize(st.session_state["optimization_key"], status_placeholder)
+        #     st.rerun()
+
+        optimize_placeholder.button(
+            "Optimize",
+            on_click=optimize,
+            args=(st.session_state["optimization_key"], status_placeholder),
+            use_container_width=True,
        )
+
+        st.markdown("""<hr class="spends-heading-seperator">""", unsafe_allow_html=True)
         _columns = st.columns((2.5, 2, 1.5, 1.5, 1))
         with _columns[0]:
             generate_spending_header("Channel")
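Annotation (not part of the diff): the percent and absolute widgets above stay in sync through Streamlit's callback pattern: each widget binds a session_state key, and its on_change callback writes the other widgets' keys before the rerun. A minimal standalone sketch of the same pattern; the names here are illustrative, not the app's:

import streamlit as st

def sync_from_pct():
    st.session_state.absolute = 100_000 * (1 + st.session_state.pct / 100)

def sync_from_abs():
    st.session_state.pct = (st.session_state.absolute / 100_000 - 1) * 100

st.number_input("Percent Change", key="pct", on_change=sync_from_pct)
st.number_input("Absolute", key="absolute", on_change=sync_from_abs)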
@@ -862,9 +1165,7 @@ if auth_status == True:
         with _columns[4]:
             generate_spending_header("Optimize")
 
-        st.markdown(
-            """<hr class="spends-heading-seperator">""", unsafe_allow_html=True
-        )
+        st.markdown("""<hr class="spends-heading-seperator">""", unsafe_allow_html=True)
 
         if "acutual_predicted" not in st.session_state:
             st.session_state["acutual_predicted"] = {
@@ -874,9 +1175,7 @@ if auth_status == True:
                 "Delta": [],
             }
         for i, channel_name in enumerate(channels_list):
-            _channel_class = st.session_state["scenario"].channels[
-                channel_name
-            ]
+            _channel_class = st.session_state["scenario"].channels[channel_name]
             _columns = st.columns((2.5, 1.5, 1.5, 1.5, 1))
             with _columns[0]:
                 st.write(channel_name_formating(channel_name))
@@ -885,12 +1184,8 @@ if auth_status == True:
             with _columns[1]:
                 channel_bounds = _channel_class.bounds
                 channel_spends = float(_channel_class.actual_total_spends)
-                min_value = float(
-                    (1 + channel_bounds[0] / 100) * channel_spends
-                )
-                max_value = float(
-                    (1 + channel_bounds[1] / 100) * channel_spends
-                )
+                min_value = float((1 + channel_bounds[0] / 100) * channel_spends)
+                max_value = float((1 + channel_bounds[1] / 100) * channel_spends)
                 ##print(st.session_state[channel_name])
                 spend_input = st.text_input(
                     channel_name,
@@ -901,9 +1196,11 @@ if auth_status == True:
                 if not validate_input(spend_input):
                     st.error("Invalid input")
 
+                channel_name_current = f"{channel_name}_change"
+
                 st.number_input(
-                    "Percent
-                    key=
+                    "Percent Change",
+                    key=channel_name_current,
                     step=1,
                     on_change=partial(update_data_by_percent, channel_name),
                 )
@@ -915,12 +1212,10 @@ if auth_status == True:
                     * _channel_class.conversion_rate
                 )
                 actual_channel_spends = float(
-                    _channel_class.actual_total_spends
-                    * _channel_class.conversion_rate
+                    _channel_class.actual_total_spends * _channel_class.conversion_rate
                 )
                 spends_delta = float(
-                    _channel_class.delta_spends
-                    * _channel_class.conversion_rate
+                    _channel_class.delta_spends * _channel_class.conversion_rate
                 )
                 st.session_state["acutual_predicted"]["Channel_name"].append(
                     channel_name
@@ -928,12 +1223,10 @@ if auth_status == True:
                 st.session_state["acutual_predicted"]["Actual_spend"].append(
                     actual_channel_spends
                 )
-                st.session_state["acutual_predicted"][
-                    "Optimized_spend"
-                ].append(current_channel_spends)
-                st.session_state["acutual_predicted"]["Delta"].append(
-                    spends_delta
-                )
+                st.session_state["acutual_predicted"]["Optimized_spend"].append(
+                    current_channel_spends
+                )
+                st.session_state["acutual_predicted"]["Delta"].append(spends_delta)
                 ## REMOVE
                 st.metric(
                     "Spends",
@@ -944,29 +1237,32 @@ if auth_status == True:
 
             with _columns[3]:
                 # sales
-                current_channel_sales = float(
-                    _channel_class.modified_total_sales
-                )
+                current_channel_sales = float(_channel_class.modified_total_sales)
                 actual_channel_sales = float(_channel_class.actual_total_sales)
                 sales_delta = float(_channel_class.delta_sales)
                 st.metric(
                     target,
-                    format_numbers(
-                        current_channel_sales, include_indicator=False
-                    ),
+                    format_numbers(current_channel_sales, include_indicator=False),
                     delta=numerize(sales_delta, 1),
                     label_visibility="collapsed",
                 )
 
             with _columns[4]:
 
+                # if st.checkbox(
+                #     label="select for optimization",
+                #     key=f"{channel_name}_selected",
+                #     value=False,
+                #     # on_change=partial(select_channel_for_optimization, channel_name),
+                #     label_visibility="collapsed",
+                # ):
+                #     select_channel_for_optimization(channel_name)
+
                 st.checkbox(
                     label="select for optimization",
                     key=f"{channel_name}_selected",
                     value=False,
-                    on_change=partial(
-                        select_channel_for_optimization, channel_name
-                    ),
+                    on_change=partial(select_channel_for_optimization, channel_name),
                     label_visibility="collapsed",
                 )
 
@@ -978,20 +1274,29 @@ if auth_status == True:
             # Bins
             col = channels_list[i]
             x_actual = st.session_state["scenario"].channels[col].actual_spends
-            x_modified = (
-                st.session_state["scenario"].channels[col].modified_spends
-            )
+            x_modified = st.session_state["scenario"].channels[col].modified_spends
 
             x_total = x_modified.sum()
             power = np.ceil(np.log(x_actual.max()) / np.log(10)) - 3
 
-            K = st.session_state["rcs"][col]["K"]
-            b = st.session_state["rcs"][col]["b"]
-            a = st.session_state["rcs"][col]["a"]
-            x0 = st.session_state["rcs"][col]["x0"]
+            updated_rcs_key = f"{metrics_selected}#@{panel_selected}#@{channel_name}"
+
+            if updated_rcs and updated_rcs_key in list(updated_rcs.keys()):
+                K = updated_rcs[updated_rcs_key]["K"]
+                b = updated_rcs[updated_rcs_key]["b"]
+                a = updated_rcs[updated_rcs_key]["a"]
+                x0 = updated_rcs[updated_rcs_key]["x0"]
+            else:
+                K = st.session_state["rcs"][col]["K"]
+                b = st.session_state["rcs"][col]["b"]
+                a = st.session_state["rcs"][col]["a"]
+                x0 = st.session_state["rcs"][col]["x0"]
 
             x_plot = np.linspace(0, 5 * x_actual.sum(), 200)
 
+            # Append current_channel_spends to the end of x_plot
+            x_plot = np.append(x_plot, current_channel_spends)
+
             x, y, marginal_roi = [], [], []
             for x_p in x_plot:
                 x.append(x_p * x_actual / x_actual.sum())
@@ -1001,9 +1306,7 @@ if auth_status == True:
 
             for index in range(len(x_plot)):
                 marginal_roi.append(
-                    a
-                    * y[index]
-                    * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
+                    a * y[index] * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
                 )
 
             x = (
@@ -1018,12 +1321,18 @@ if auth_status == True:
 
             roi = y / np.maximum(x, np.finfo(float).eps)
 
+            roi_current, marginal_roi_current = roi[-1], marginal_roi[-1]
 
             rgba = calculate_rgba(
@@ -1034,16 +1343,6 @@ if auth_status == True:
                 current_channel_spends,
             )
 
-            # Protecting division by zero by adding a small epsilon to denominators
-            roi_current = current_channel_sales / np.maximum(
-                current_channel_spends, np.finfo(float).eps
-            )
-            marginal_roi_current = (
-                st.session_state["scenario"]
-                .channels[col]
-                .get_marginal_roi("modified")
-            )
-
             with bin_placeholder:
                 st.markdown(
                     f"""
@@ -1061,7 +1360,7 @@ if auth_status == True:
                     unsafe_allow_html=True,
                 )
 
-        with st.expander("See Response Curves"):
             fig = plot_response_curves()
             st.plotly_chart(fig, use_container_width=True)
 
@@ -1081,19 +1380,11 @@ if auth_status == True:
         )
 
         summary_df = pd.DataFrame(st.session_state["acutual_predicted"])
-        summary_df.drop_duplicates(
-            subset="Channel_name", keep="last", inplace=True
-        )
 
         summary_df_sorted = summary_df.sort_values(by="Delta", ascending=False)
         summary_df_sorted["Delta_percent"] = np.round(
-            (
-                (
-                    summary_df_sorted["Optimized_spend"]
-                    / summary_df_sorted["Actual_spend"]
-                )
-                - 1
-            )
             * 100,
             2,
         )
@@ -1121,9 +1412,9 @@ if auth_status != True:
             authenticator.forgot_password("Forgot password")
         )
         if username_forgot_pw:
-            st.session_state["config"]["credentials"]["usernames"][
-
-            ]
             send_email(email_forgot_password, random_password)
             st.success("New password sent securely")
             # Random password to be transferred to user securely
roi_current, marginal_roi_current = roi[-1], marginal_roi[-1]
|
1325 |
+
x, y, roi, marginal_roi = (
|
1326 |
+
x[:-1],
|
1327 |
+
y[:-1],
|
1328 |
+
roi[:-1],
|
1329 |
+
marginal_roi[:-1],
|
1330 |
+
) # Drop data for current spends
|
1331 |
+
|
1332 |
+
start_value, end_value, left_value, right_value = find_segment_value(
|
1333 |
+
x,
|
1334 |
+
roi,
|
1335 |
+
marginal_roi,
|
1336 |
)
|
1337 |
|
1338 |
rgba = calculate_rgba(
|
|
|
1343 |
current_channel_spends,
|
1344 |
)
|
1345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
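The marginal_roi series above is the closed-form derivative of the fitted s-curve: for y = K / (1 + b * exp(-a * (x - x0))) one has dy/dx = a * y * (1 - y / K), so the page can score marginal returns at every plotted spend level without numerical differentiation (the np.maximum(K, eps) term only guards against K = 0). A minimal sketch checking that identity, with made-up parameter values:

# Sketch (illustrative parameters): a*y*(1 - y/K) equals d/dx of the s-curve.
import numpy as np

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

K, b, a, x0 = 100.0, 5.0, 0.03, 50.0
x = np.linspace(0.0, 200.0, 401)
y = s_curve(x, K, b, a, x0)

analytic = a * y * (1 - y / K)   # the marginal_roi form used above
numeric = np.gradient(y, x)      # finite-difference cross-check

assert np.allclose(analytic, numeric, atol=1e-2)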
        with bin_placeholder:
            st.markdown(
                f"""
...
                unsafe_allow_html=True,
            )
 
+    with st.expander("See Response Curves", expanded=True):
        fig = plot_response_curves()
        st.plotly_chart(fig, use_container_width=True)
 
...
    )
 
    summary_df = pd.DataFrame(st.session_state["acutual_predicted"])
+    summary_df.drop_duplicates(subset="Channel_name", keep="last", inplace=True)
 
    summary_df_sorted = summary_df.sort_values(by="Delta", ascending=False)
    summary_df_sorted["Delta_percent"] = np.round(
+        ((summary_df_sorted["Optimized_spend"] / summary_df_sorted["Actual_spend"]) - 1)
        * 100,
        2,
    )
...
        authenticator.forgot_password("Forgot password")
    )
    if username_forgot_pw:
+        st.session_state["config"]["credentials"]["usernames"][username_forgot_pw][
+            "password"
+        ] = stauth.Hasher([random_password]).generate()[0]
        send_email(email_forgot_password, random_password)
        st.success("New password sent securely")
        # Random password to be transferred to user securely
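The forgot-password branch regenerates the stored credential hash with stauth.Hasher, the same helper streamlit-authenticator 0.2.x applies at login. A sketch of that flow outside Streamlit; the yaml write-back and the "some_user" key are illustrative assumptions (the page itself only updates the in-memory st.session_state["config"]):

# Sketch: hash a fresh password and persist it to config.yaml (assumed layout).
import yaml
import streamlit_authenticator as stauth

with open("config.yaml") as f:
    config = yaml.safe_load(f)

new_password = "s3cret!"  # stands in for the generated random_password
hashed = stauth.Hasher([new_password]).generate()[0]
config["credentials"]["usernames"]["some_user"]["password"] = hashed  # hypothetical user

with open("config.yaml", "w") as f:
    yaml.safe_dump(config, f)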
requirements.txt
CHANGED
@@ -1,102 +1,94 @@
-altair==
-et-xmlfile==1.1.0
-extra-streamlit-components==0.1.56
-fonttools==4.
-gitdb==4.0.
-GitPython==3.1.
-htmlmin==0.1.12
-idna==3.
-ImageHash==4.3.1
-importlib-metadata==
-importlib-resources==6.1.
-Jinja2==3.1.
-joblib==1.3.2
-jsonschema==4.
-jsonschema-specifications==2023.
-markdown-it-py==3.0.0
-MarkupSafe==2.1.
-matplotlib==3.7.0
-validators==0.22.0
-visions==0.7.5
-watchdog==4.0.0
-wheel==0.42.0
-wordcloud==1.9.3
-ydata-profiling==4.6.5
-zipp==3.17.0
+altair == 4.2.0
+attrs == 23.1.0
+bcrypt == 4.0.1
+blinker == 1.6.2
+cachetools == 5.3.1
+certifi == 2023.7.22
+charset-normalizer == 3.2.0
+click == 8.1.7
+colorama == 0.4.6
+contourpy == 1.1.1
+cycler == 0.11.0
+dacite == 1.8.1
+entrypoints == 0.4
+et-xmlfile == 1.1.0
+extra-streamlit-components == 0.1.56
+fonttools == 4.42.1
+gitdb == 4.0.10
+GitPython == 3.1.35
+htmlmin == 0.1.12
+idna == 3.4
+ImageHash == 4.3.1
+importlib-metadata == 6.8.0
+importlib-resources == 6.1.0
+Jinja2 == 3.1.2
+joblib == 1.3.2
+jsonschema == 4.19.0
+jsonschema-specifications == 2023.7.1
+kaleido == 0.2.1
+kiwisolver == 1.4.5
+markdown-it-py == 3.0.0
+MarkupSafe == 2.1.3
+matplotlib == 3.7.0
+mdurl == 0.1.2
+networkx == 3.1
+numerize == 0.12
+numpy == 1.23.5
+openpyxl>=3.1.0
+packaging == 23.1
+pandas == 1.5.2
+pandas-profiling == 3.6.6
+patsy == 0.5.3
+phik == 0.12.3
+Pillow == 10.0.0
+pip == 23.2.1
+plotly == 5.11.0
+protobuf == 3.20.3
+pyarrow == 13.0.0
+pydantic == 1.10.13
+pydeck == 0.8.1b0
+Pygments == 2.16.1
+PyJWT == 2.8.0
+Pympler == 1.0.1
+pyparsing == 3.1.1
+python-dateutil == 2.8.2
+python-decouple == 3.8
+pytz == 2023.3.post1
+PyWavelets == 1.4.1
+PyYAML == 6.0.1
+referencing == 0.30.2
+requests == 2.31.0
+rich == 13.5.2
+rpds-py == 0.10.2
+scikit-learn == 1.1.3
+scipy == 1.9.3
+seaborn == 0.12.2
+semver == 3.0.1
+setuptools == 68.1.2
+six == 1.16.0
+smmap == 5.0.0
+statsmodels == 0.14.0
+streamlit == 1.16.0
+streamlit-aggrid == 0.3.4.post3
+streamlit-authenticator == 0.2.1
+streamlit-pandas-profiling == 0.1.3
+sweetviz == 2.2.1
+tangled-up-in-unicode == 0.2.0
+tenacity == 8.2.3
+threadpoolctl == 3.2.0
+toml == 0.10.2
+toolz == 0.12.0
+tornado == 6.3.3
+tqdm == 4.66.1
+typeguard == 2.13.3
+typing_extensions == 4.7.1
+tzdata == 2023.3
+tzlocal == 5.0.1
+urllib3 == 2.0.4
+validators == 0.22.0
+visions == 0.7.5
+watchdog == 3.0.0
+wheel == 0.41.2
+wordcloud == 1.9.2
+ydata-profiling == 4.5.1
+zipp == 3.16.2
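The rewritten pin list freezes the Space to the streamlit 1.16 / pandas 1.5 generation, with openpyxl as the only range pin. A small sketch, assuming the file sits next to the script, that reports drift between these pins and the active environment:

# Sketch: compare installed package versions against requirements.txt pins.
from importlib.metadata import PackageNotFoundError, version

with open("requirements.txt") as f:
    for line in f:
        line = line.strip()
        if not line or "==" not in line:
            continue  # skips blanks and range pins such as "openpyxl>=3.1.0"
        name, _, pinned = (part.strip() for part in line.partition("=="))
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed (pinned {pinned})")
            continue
        if installed != pinned:
            print(f"{name}: installed {installed}, pinned {pinned}")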
summary_df.pkl
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:2f2aa1b3c4f759d4179abf2dbed90751ec0849b3750a1019827173d2152954ac
 size 1482
tuned_model.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9871c17d7d10846b84c31343a1b9fc3ad87c1a67fa8bf8b10b2199032a1581be
+size 4287842

upf_data_converted_old.csv
ADDED
The diff for this file is too large to render. See raw diff.

upf_data_converted_old.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:92716069afa2c16a8afb6494da6d5f93878558de0215b1b9334ffeb997fdc6b6
+size 1561111

upf_data_converted_randomized_resp_metrics.csv
ADDED
The diff for this file is too large to render. See raw diff.

upf_data_converted_randomized_resp_metrics.xlsx
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf24972737d4c10d274ce6e3165551442e662992623754dbef11155f4b177531
+size 1893805
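The .pkl and .xlsx entries above are Git LFS pointer files: the repository stores only the version/oid/size triple (per the .gitattributes rules), and the binary payload is fetched from LFS storage at checkout. A small sketch reading those fields, using a file name from the diff above:

# Sketch: parse the key/value fields of a Git LFS pointer file.
def read_lfs_pointer(path):
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields  # e.g. {'version': ..., 'oid': 'sha256:...', 'size': '4287842'}

pointer = read_lfs_pointer("tuned_model.pkl")
print(pointer["oid"], int(pointer["size"]))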
utilities.py
CHANGED
@@ -12,7 +12,6 @@ import io
 import plotly
 from pathlib import Path
 import pickle
-import streamlit_authenticator as stauth
 import yaml
 from yaml import SafeLoader
 from streamlit.components.v1 import html
@@ -24,27 +23,59 @@ import os
 import base64
 
 
-
-CURRENCY_INDICATOR = '$'
+color_palette = [
+    "#F3F3F0",
+    "#5E7D7E",
+    "#2FA1FF",
+    "#00EDED",
+    "#00EAE4",
+    "#304550",
+    "#EDEBEB",
+    "#7FBEFD",
+    "#003059",
+    "#A2F3F3",
+    "#E1D6E2",
+    "#B6B6B6",
+]
+
+
+CURRENCY_INDICATOR = "$"
+
+import streamlit_authenticator as stauth
 
 
 def load_authenticator():
-    with open(
+    with open("config.yaml") as file:
         config = yaml.load(file, Loader=SafeLoader)
-    st.session_state[
+    st.session_state["config"] = config
    authenticator = stauth.Authenticate(
-        config[
-        config[
-        config[
-        config[
-        config[
+        credentials=config["credentials"],
+        cookie_name=config["cookie"]["name"],
+        key=config["cookie"]["key"],
+        cookie_expiry_days=config["cookie"]["expiry_days"],
+        preauthorized=config["preauthorized"],
    )
-    st.session_state[
+    st.session_state["authenticator"] = authenticator
    return authenticator
 
+
+# Authentication
+def authentication():
+    with open("config.yaml") as file:
+        config = yaml.load(file, Loader=SafeLoader)
+
+    authenticator = stauth.Authenticate(
+        config["credentials"],
+        config["cookie"]["name"],
+        config["cookie"]["key"],
+        config["cookie"]["expiry_days"],
+        config["preauthorized"],
+    )
+
+    name, authentication_status, username = authenticator.login("Login", "main")
+    return authenticator, name, authentication_status, username
+
+
@@ -67,7 +98,10 @@ def nav_page(page_name, timeout_secs=3):
            attempt_nav_page("%s", new Date(), %d);
        });
    </script>
-    """ % (
+    """ % (
+        page_name,
+        timeout_secs,
+    )
    html(nav_script)
 
 
@@ -92,23 +126,18 @@ data_url = base64.b64encode(contents).decode("utf-8")
 
 file_.close()
 
-DATA_PATH =
+DATA_PATH = "./data"
 
-IMAGES_PATH =
+IMAGES_PATH = "./data/images_224_224"
 
 
 def load_local_css(file_name):
 
    with open(file_name) as f:
 
-    st.markdown(f
+        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
 
 
 # def set_header():
 
@@ -129,24 +158,24 @@ data_url1 = base64.b64encode(contents1).decode("utf-8")
 
 file_1.close()
 
-DATA_PATH1 = './data'
+DATA_PATH1 = "./data"
 
-IMAGES_PATH1 = './data/images_224_224'
+IMAGES_PATH1 = "./data/images_224_224"
 
 
 def set_header():
-    return st.markdown(
+    return st.markdown(
+        f"""<div class='main-header'>
    <!-- <h1></h1> -->
    <div >
    <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
    </div>
    <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
-    </div>""",
+        </div>""",
+        unsafe_allow_html=True,
+    )
+
 
 # def set_header():
 #     logo_path = "./path/to/your/local/LIME_logo.png"  # Replace with the actual file path
@@ -157,51 +186,87 @@ def set_header():
 #     </div>""", unsafe_allow_html=True)
 
 
-def s_curve(x,K,b,a,x0):
-    return K / (1 + b * np.exp(-a*(x-x0)))
+def s_curve(x, K, b, a, x0):
+    return K / (1 + b * np.exp(-a * (x - x0)))
+
+
+def panel_level(input_df, date_column="Date"):
+    # Ensure 'Date' is set as the index
+    if date_column not in input_df.index.names:
+        input_df = input_df.set_index(date_column)
+
+    # Select numeric columns only (excluding 'Date' since it's now the index)
+    numeric_columns_df = input_df.select_dtypes(include="number")
+
+    # Group by 'Date' (which is the index) and sum the numeric columns
+    aggregated_df = numeric_columns_df.groupby(input_df.index).sum()
+
+    # Reset index if you want 'Date' back as a column
+    aggregated_df = aggregated_df.reset_index()
+
+    return aggregated_df
+
+
+def initialize_data(
+    panel=None, target_file="Overview_data_test.xlsx", updated_rcs=None, metrics=None
+):
 # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
 #            "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
 #            "digital_spends":1}
-#print('State initialized')
+    # print('State initialized')
+
+    excel = pd.read_excel(target_file, sheet_name=None)
+
+    # Extract dataframes for raw data, spend input, and contribution MMM
+    raw_df = excel["RAW DATA MMM"]
+    spend_df = excel["SPEND INPUT"]
+    contri_df = excel["CONTRIBUTION MMM"]
+
+    # Check if the panel is not None
+    if panel is not None and panel != "Aggregated":
+        raw_df = raw_df[raw_df["Panel"] == panel].drop(columns=["Panel"])
+        spend_df = spend_df[spend_df["Panel"] == panel].drop(columns=["Panel"])
+        contri_df = contri_df[contri_df["Panel"] == panel].drop(columns=["Panel"])
+    elif panel == "Aggregated":
+        raw_df = panel_level(raw_df, date_column="Date")
+        spend_df = panel_level(spend_df, date_column="Week")
+        contri_df = panel_level(contri_df, date_column="Date")
+
+    # Revenue_df = excel['Revenue']
+
+    ## remove sesonalities, indices etc ...
+    exclude_columns = [
+        "Date",
+        "Region",
+        "Controls_Grammarly_Index_SeasonalAVG",
+        "Controls_Quillbot_Index",
+        "Daily_Positive_Outliers",
+        "External_RemoteClass_Index",
+        "Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802",
+        "Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206",
+        "Intervals ON 20201005-20201019",
+        "Promotion_PercentOff",
+        "Promotion_TimeBased",
+        "Seasonality_Indicator_Chirstmas",
+        "Seasonality_Indicator_NewYears_Days",
+        "Seasonality_Indicator_Thanksgiving",
+        "Trend 20200302 / 20200803",
+    ]
+    raw_df["Date"] = pd.to_datetime(raw_df["Date"])
+    contri_df["Date"] = pd.to_datetime(contri_df["Date"])
+    input_df = raw_df.sort_values(by="Date")
+    output_df = contri_df.sort_values(by="Date")
+    spend_df["Week"] = pd.to_datetime(
+        spend_df["Week"], format="%Y-%m-%d", errors="coerce"
+    )
+    spend_df.sort_values(by="Week", inplace=True)
 
 # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
 # spend_df = spend_df.sort_values(by='Week')
 
    channel_list = [col for col in input_df.columns if col not in exclude_columns]
+    channel_list = list(set(channel_list) - set(["fb_level_achieved_tier_1", "ga_app"]))
+
    response_curves = {}
    mapes = {}
    rmses = {}
@@ -215,14 +280,14 @@ def initialize_data():
    dates = input_df.Date.values
    actual_output_dic = {}
    actual_input_dic = {}
 
    for inp_col in channel_list:
-        #st.write(inp_col)
+        # st.write(inp_col)
        spends = input_df[inp_col].values
        x = spends.copy()
-        # upper limit for penalty
-        upper_limits[inp_col] = 2*x.max()
+        # upper limit for penalty
+        upper_limits[inp_col] = 2 * x.max()
 
        # contribution
        out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
        y = output_df[out_col].values.copy()
@@ -230,96 +295,141 @@ def initialize_data():
        actual_input_dic[inp_col] = x.copy()
        ##output cols aggregation
        output_cols.append(out_col)
 
        ## scale the input
-        power =
-        if power >= 0
+        power = np.ceil(np.log(x.max()) / np.log(10)) - 3
+        if power >= 0:
            x = x / 10**power
 
-        #print(
-        #print(
-        #print(
+        x = x.astype("float64")
+        y = y.astype("float64")
+        # print('#printing yyyyyyyyy')
+        # print(inp_col)
+        # print(x.max())
+        # print(y.max())
+        bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
+
+        # bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
+        params, _ = curve_fit(
+            s_curve,
+            x,
+            y,
+            p0=(2 * y.max(), 0.01, 1e-5, x.max()),
+            bounds=bounds,
+            maxfev=int(1e5),
+        )
        mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
-        rmse =
-        r2_ = r2_score(y, s_curve(x
+        rmse = np.sqrt(((y - s_curve(x, *params)) ** 2).mean())
+        r2_ = r2_score(y, s_curve(x, *params))
 
-        response_curves[inp_col] = {'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]}
+        response_curves[inp_col] = {
+            "K": params[0],
+            "b": params[1],
+            "a": params[2],
+            "x0": params[3],
+        }
+
+        updated_rcs_key = f"{metrics}#@{panel}#@{inp_col}"
+        if updated_rcs is not None and updated_rcs_key in list(updated_rcs.keys()):
+            response_curves[inp_col] = updated_rcs[updated_rcs_key]
+
        mapes[inp_col] = mape
        rmses[inp_col] = rmse
        r2[inp_col] = r2_
        powers[inp_col] = power
 
        ## conversion rates
-        spend_col = [
+        spend_col = [
+            _col
+            for _col in spend_df.columns
+            if _col.startswith(inp_col.rsplit("_", 1)[0])
+        ][0]
+
+        # print('#printing spendssss')
+        # print(spend_col)
+        conv = (
+            spend_df.set_index("Week")[spend_col]
+            / input_df.set_index("Date")[inp_col].clip(lower=1)
+        ).reset_index()
+        conv.rename(columns={"index": "Week"}, inplace=True)
+        conv["year"] = conv.Week.dt.year
+        conv_rates[inp_col] = list(conv.drop("Week", axis=1).mean().to_dict().values())[
+            0
+        ]
        ##print('Before',conv_rates[inp_col])
        # conv_rates[inp_col] = uopx_conv_rates[inp_col]
        ##print('After',(conv_rates[inp_col]))
 
+        channel = Channel(
+            name=inp_col,
+            dates=dates,
+            spends=spends,
+            # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
+            conversion_rate=conv_rates[inp_col],
+            response_curve_type="s-curve",
+            response_curve_params={
+                "K": params[0],
+                "b": params[1],
+                "a": params[2],
+                "x0": params[3],
+            },
+            bounds=np.array([-10, 10]),
+        )
        channels[inp_col] = channel
        if sales is None:
            sales = channel.actual_sales
        else:
            sales += channel.actual_sales
-    other_contributions =
+    other_contributions = (
+        output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only=True).values
+    )
+    correction = output_df.drop("Date", axis=1).sum(axis=1).values - (
+        sales + other_contributions
+    )
+    scenario = Scenario(
+        name="default",
+        channels=channels,
+        constant=other_contributions,
+        correction=correction,
+    )
    ## setting session variables
-    st.session_state[
-    st.session_state[
-    st.session_state[
-    st.session_state[
+    st.session_state["initialized"] = True
+    st.session_state["actual_df"] = input_df
+    st.session_state["raw_df"] = raw_df
+    st.session_state["contri_df"] = output_df
    default_scenario_dict = class_to_dict(scenario)
-    st.session_state[
-    st.session_state[
-    st.session_state[
-    st.session_state[
-    st.session_state[
+    st.session_state["default_scenario_dict"] = default_scenario_dict
+    st.session_state["scenario"] = scenario
+    st.session_state["channels_list"] = channel_list
+    st.session_state["optimization_channels"] = {
+        channel_name: False for channel_name in channel_list
+    }
+    st.session_state["rcs"] = response_curves
+
+    st.session_state["powers"] = powers
+    st.session_state["actual_contribution_df"] = pd.DataFrame(actual_output_dic)
+    st.session_state["actual_input_df"] = pd.DataFrame(actual_input_dic)
+
    for channel in channels.values():
-        st.session_state[channel.name] = numerize(
+        st.session_state[channel.name] = numerize(
+            channel.actual_total_spends * channel.conversion_rate, 1
+        )
+
+    st.session_state["xlsx_buffer"] = io.BytesIO()
+
+    if Path("../saved_scenarios.pkl").exists():
+        with open("../saved_scenarios.pkl", "rb") as f:
+            st.session_state["saved_scenarios"] = pickle.load(f)
    else:
-        st.session_state[
+        st.session_state["saved_scenarios"] = OrderedDict()
 
-    st.session_state[
-    st.session_state[
+    # st.session_state["total_spends_change"] = 0
+    st.session_state["optimization_channels"] = {
+        channel_name: False for channel_name in channel_list
+    }
+    st.session_state["disable_download_button"] = True
 
 
 # def initialize_data():
 #     # fetch data from excel
 #     output = pd.read_excel('data.xlsx',sheet_name=None)
@@ -335,17 +445,17 @@ def initialize_data():
 #             channel_list.append(col)
 #         else:
 #             pass
 
 #     ## NOTE : Considered only Desktop spends for all calculations
 #     acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
 #     ## NOTE : Considered one year of data
 #     acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
 #     actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
 
 #     ##load response curves
 #     with open('./grammarly_response_curves.json','r') as f:
 #         response_curves = json.load(f)
 
 #     ## create channel dict for scenario creation
 #     dates = actual_df.Date.values
 #     channels = {}
@@ -363,15 +473,15 @@ def initialize_data():
 #             response_curve_type=response_curve_type,
 #             response_curve_params=response_curve_params,
 #             bounds=np.array([-30,30]))
 
 #         channels[name] = channel
 #     else:
 #         constant = info_dict.get('value',0.) * len(dates)
 
 #     ## create scenario
 #     scenario = Scenario(name='default', channels=channels, constant=constant)
 #     default_scenario_dict = class_to_dict(scenario)
 
 #     ## setting session variables
 #     st.session_state['initialized'] = True
@@ -385,7 +495,7 @@ def initialize_data():
 #     for channel in channels.values():
 #         if channel.name not in st.session_state:
 #             st.session_state[channel.name] = float(channel.actual_total_spends)
 
 #     if 'xlsx_buffer' not in st.session_state:
 #         st.session_state['xlsx_buffer'] = io.BytesIO()
 
@@ -394,51 +504,121 @@ def initialize_data():
 #     if Path('../saved_scenarios.pkl').exists():
 #         with open('../saved_scenarios.pkl','rb') as f:
 #             st.session_state['saved_scenarios'] = pickle.load(f)
 
 #     else:
 #         st.session_state['saved_scenarios'] = OrderedDict()
 
 #     if 'total_spends_change' not in st.session_state:
 #         st.session_state['total_spends_change'] = 0
 
 #     if 'optimization_channels' not in st.session_state:
 #         st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
 
 #     if 'disable_download_button' not in st.session_state:
 #         st.session_state['disable_download_button'] = True
 
 
 def create_channel_summary(scenario):
 
    # Provided data
    data = {
-
-
-
+        "Channel": [
+            "Paid Search",
+            "Ga will cid baixo risco",
+            "Digital tactic others",
+            "Fb la tier 1",
+            "Fb la tier 2",
+            "Paid social others",
+            "Programmatic",
+            "Kwai",
+            "Indicacao",
+            "Infleux",
+            "Influencer",
+        ],
+        "Spends": [
+            "$ 11.3K",
+            "$ 155.2K",
+            "$ 50.7K",
+            "$ 125.4K",
+            "$ 125.2K",
+            "$ 105K",
+            "$ 3.3M",
+            "$ 47.5K",
+            "$ 55.9K",
+            "$ 632.3K",
+            "$ 48.3K",
+        ],
+        "Revenue": [
+            "558.0K",
+            "3.5M",
+            "5.2M",
+            "3.1M",
+            "3.1M",
+            "2.1M",
+            "20.8M",
+            "1.6M",
+            "728.4K",
+            "22.9M",
+            "4.8M",
+        ],
    }
 
    # Create DataFrame
    df = pd.DataFrame(data)
 
    # Convert currency strings to numeric values
-    df[
+    df["Spends"] = (
+        df["Spends"]
+        .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
+        .map(pd.eval)
+        .astype(int)
+    )
+    df["Revenue"] = (
+        df["Revenue"]
+        .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
+        .map(pd.eval)
+        .astype(int)
+    )
 
    # Calculate ROI
-    df[
+    df["ROI"] = (df["Revenue"] - df["Spends"]) / df["Spends"]
 
    # Format columns
    format_currency = lambda x: f"${x:,.1f}"
    format_roi = lambda x: f"{x:.1f}"
 
-    df[
+    df["Spends"] = [
+        "$ 11.3K",
+        "$ 155.2K",
+        "$ 50.7K",
+        "$ 125.4K",
+        "$ 125.2K",
+        "$ 105K",
+        "$ 3.3M",
+        "$ 47.5K",
+        "$ 55.9K",
+        "$ 632.3K",
+        "$ 48.3K",
+    ]
+    df["Revenue"] = [
+        "$ 536.3K",
+        "$ 3.4M",
+        "$ 5M",
+        "$ 3M",
+        "$ 3M",
+        "$ 2M",
+        "$ 20M",
+        "$ 1.5M",
+        "$ 7.1M",
+        "$ 22M",
+        "$ 4.6M",
+    ]
+    df["ROI"] = df["ROI"].apply(format_roi)
+
    return df
 
 
+# @st.cache(allow_output_mutation=True)
 # def create_contribution_pie(scenario):
 #     #c1f7dc
 #     colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
@@ -470,23 +650,23 @@ def create_channel_summary(scenario):
 #     weekly_spends_data = []
 #     weekly_sales_data = []
 #     for channel_name in st.session_state['channels_list']:
 #         weekly_spends_data.append((go.Bar(x=x,
 #             y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
 #             name=channel_name_formating(channel_name),
 #             hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
 #             legendgroup=channel_name)))
 #         weekly_sales_data.append((go.Bar(x=x,
 #             y=scenario.channels[channel_name].actual_sales,
 #             name=channel_name_formating(channel_name),
 #             hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
 #             legendgroup=channel_name, showlegend=False)))
 #     for _d in weekly_spends_data:
 #         weekly_contribution_fig.add_trace(_d, row=1, col=1)
 #     for _d in weekly_sales_data:
 #         weekly_contribution_fig.add_trace(_d, row=1, col=2)
 #     weekly_contribution_fig.add_trace(go.Bar(x=x,
 #         y=scenario.constant + scenario.correction,
 #         name='Non Media',
 #         hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
 #     weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
 #     weekly_contribution_fig.update_xaxes(showgrid=False)
@@ -524,14 +704,50 @@ def create_channel_summary(scenario):
 
 
 def create_contribution_pie():
-    color_palette = [
+    color_palette = [
+        "#F3F3F0",
+        "#5E7D7E",
+        "#2FA1FF",
+        "#00EDED",
+        "#00EAE4",
+        "#304550",
+        "#EDEBEB",
+        "#7FBEFD",
+        "#003059",
+        "#A2F3F3",
+        "#E1D6E2",
+        "#B6B6B6",
+    ]
+    total_contribution_fig = make_subplots(
+        rows=1,
+        cols=2,
+        subplot_titles=["Spends", "Revenue"],
+        specs=[[{"type": "pie"}, {"type": "pie"}]],
+    )
 
-    channels_list = [
+    channels_list = [
+        "Paid Search",
+        "Ga will cid baixo risco",
+        "Digital tactic others",
+        "Fb la tier 1",
+        "Fb la tier 2",
+        "Paid social others",
+        "Programmatic",
+        "Kwai",
+        "Indicacao",
+        "Infleux",
+        "Influencer",
+        "Non Media",
+    ]
 
    # Assign colors from the limited palette to channels
-    colors_map = {
+    colors_map = {
+        col: color_palette[i % len(color_palette)]
+        for i, col in enumerate(channels_list)
+    }
+    colors_map["Non Media"] = color_palette[
+        5
+    ]  # Assign fixed green color for 'Non Media'
 
    # Hardcoded values for Spends and Revenue
    spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
@@ -542,10 +758,13 @@ def create_contribution_pie():
        go.Pie(
            labels=[channel_name for channel_name in channels_list],
            values=spends_values,
-            marker=dict(
+            marker=dict(
+                colors=[colors_map[channel_name] for channel_name in channels_list]
+            ),
+            hole=0.3,
        ),
-        row=1,
+        row=1,
+        col=1,
    )
 
    # Add trace for Revenue pie chart
@@ -553,144 +772,196 @@ def create_contribution_pie():
        go.Pie(
            labels=[channel_name for channel_name in channels_list],
            values=revenue_values,
-            marker=dict(
+            marker=dict(
+                colors=[colors_map[channel_name] for channel_name in channels_list]
+            ),
+            hole=0.3,
        ),
-        row=1,
+        row=1,
+        col=2,
    )
 
-    total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
-    total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
+    total_contribution_fig.update_traces(
+        textposition="inside", texttemplate="%{percent:.1%}"
+    )
+    total_contribution_fig.update_layout(
+        uniformtext_minsize=12, title="Channel contribution", uniformtext_mode="hide"
    )
    return total_contribution_fig
 
+
 def create_contribuion_stacked_plot(scenario):
-    weekly_contribution_fig = make_subplots(
+    weekly_contribution_fig = make_subplots(
+        rows=1,
+        cols=2,
+        subplot_titles=["Spends", "Revenue"],
+        specs=[[{"type": "bar"}, {"type": "bar"}]],
+    )
+    raw_df = st.session_state["raw_df"]
+    df = raw_df.sort_values(by="Date")
    x = df.Date
    weekly_spends_data = []
    weekly_sales_data = []
 
-    for i, channel_name in enumerate(st.session_state[
+    for i, channel_name in enumerate(st.session_state["channels_list"]):
        color = color_palette[i % len(color_palette)]
 
-        weekly_spends_data.append(
+        weekly_spends_data.append(
+            go.Bar(
+                x=x,
+                y=scenario.channels[channel_name].actual_spends
+                * scenario.channels[channel_name].conversion_rate,
+                name=channel_name_formating(channel_name),
+                hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
+                legendgroup=channel_name,
+                marker_color=color,
+            )
+        )
+
+        weekly_sales_data.append(
+            go.Bar(
+                x=x,
+                y=scenario.channels[channel_name].actual_sales,
+                name=channel_name_formating(channel_name),
+                hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
+                legendgroup=channel_name,
+                showlegend=False,
+                marker_color=color,
+            )
+        )
+
    for _d in weekly_spends_data:
        weekly_contribution_fig.add_trace(_d, row=1, col=1)
    for _d in weekly_sales_data:
        weekly_contribution_fig.add_trace(_d, row=1, col=2)
 
-    weekly_contribution_fig.add_trace(
+    weekly_contribution_fig.add_trace(
+        go.Bar(
+            x=x,
+            y=scenario.constant + scenario.correction,
+            name="Non Media",
+            hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
+            marker_color=color_palette[-1],
+        ),
+        row=1,
+        col=2,
+    )
+
+    weekly_contribution_fig.update_layout(
+        barmode="stack", title="Channel contribution by week", xaxis_title="Date"
+    )
    weekly_contribution_fig.update_xaxes(showgrid=False)
    weekly_contribution_fig.update_yaxes(showgrid=False)
    return weekly_contribution_fig
 
+
 def create_channel_spends_sales_plot(channel):
    if channel is not None:
        x = channel.dates
        _spends = channel.actual_spends * channel.conversion_rate
        _sales = channel.actual_sales
        channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
-        channel_sales_spends_fig.add_trace(
+        channel_sales_spends_fig.add_trace(
+            go.Bar(
+                x=x,
+                y=_sales,
+                marker_color=color_palette[
+                    3
+                ],  # You can choose a color from the palette
+                name="Revenue",
+                hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
+            ),
+            secondary_y=False,
+        )
+
+        channel_sales_spends_fig.add_trace(
+            go.Scatter(
+                x=x,
+                y=_spends,
+                line=dict(
+                    color=color_palette[2]
+                ),  # You can choose another color from the palette
+                name="Spends",
+                hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
+            ),
+            secondary_y=True,
+        )
+
+        channel_sales_spends_fig.update_layout(
+            xaxis_title="Date",
+            yaxis_title="Revenue",
+            yaxis2_title="Spends ($)",
+            title="Channel spends and Revenue week-wise",
+        )
        channel_sales_spends_fig.update_xaxes(showgrid=False)
        channel_sales_spends_fig.update_yaxes(showgrid=False)
    else:
-        raw_df = st.session_state[
-        df = raw_df.sort_values(by=
+        raw_df = st.session_state["raw_df"]
+        df = raw_df.sort_values(by="Date")
        x = df.Date
-        scenario = class_from_dict(st.session_state[
+        scenario = class_from_dict(st.session_state["default_scenario_dict"])
        _sales = scenario.constant + scenario.correction
        channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
-        channel_sales_spends_fig.add_trace(
+        channel_sales_spends_fig.add_trace(
+            go.Bar(
+                x=x,
+                y=_sales,
+                marker_color=color_palette[
+                    0
+                ],  # You can choose a color from the palette
+                name="Revenue",
+                hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
+            ),
+            secondary_y=False,
+        )
+
+        channel_sales_spends_fig.update_layout(
+            xaxis_title="Date",
+            yaxis_title="Revenue",
+            yaxis2_title="Spends ($)",
+            title="Channel spends and Revenue week-wise",
+        )
        channel_sales_spends_fig.update_xaxes(showgrid=False)
        channel_sales_spends_fig.update_yaxes(showgrid=False)
 
    return channel_sales_spends_fig
 
 
+def format_numbers(value, n_decimals=1, include_indicator=True):
    if include_indicator:
-        return f
+        return f"{CURRENCY_INDICATOR} {numerize(value,n_decimals)}"
    else:
-        return f
+        return f"{numerize(value,n_decimals)}"
 
 
-def decimal_formater(num_string,n_decimals=1):
-    parts = num_string.split(
+def decimal_formater(num_string, n_decimals=1):
+    parts = num_string.split(".")
    if len(parts) == 1:
-        return num_string+
+        return num_string + "." + "0" * n_decimals
    else:
        to_be_padded = n_decimals - len(parts[-1])
-        if to_be_padded > 0
-            return num_string+
+        if to_be_padded > 0:
+            return num_string + "0" * to_be_padded
        else:
            return num_string
 
 
 def channel_name_formating(channel_name):
-    name_mod = channel_name.replace(
-    if name_mod.lower().endswith(
-        name_mod = name_mod.replace(
-    elif name_mod.lower().endswith(
-        name_mod = name_mod.replace(
+    name_mod = channel_name.replace("_", " ")
+    if name_mod.lower().endswith(" imp"):
+        name_mod = name_mod.replace("Imp", "Spend")
+    elif name_mod.lower().endswith(" clicks"):
+        name_mod = name_mod.replace("Clicks", "Spend")
    return name_mod
 
 
-def send_email(email,message):
-    s = smtplib.SMTP(
+def send_email(email, message):
+    s = smtplib.SMTP("smtp.gmail.com", 587)
    s.starttls()
    s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
    s.sendmail("geethu4444@gmail.com", email, message)
    s.quit()
 
+
 if __name__ == "__main__":
    initialize_data()
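The core of the new initialize_data is the per-channel response-curve fit: spends are rescaled by a power of ten, then curve_fit estimates (K, b, a, x0) with box bounds that cap K at 3x the observed contribution. A self-contained sketch of the same fit on synthetic data (the numbers are illustrative, not from the workbook):

# Sketch: the per-channel s-curve fit from initialize_data, on synthetic data.
import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

rng = np.random.default_rng(0)
x = np.linspace(1.0, 100.0, 120)                                    # scaled spends
y = s_curve(x, 80.0, 50.0, 0.1, 40.0) + rng.normal(0, 1.0, x.size)  # contribution

bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
params, _ = curve_fit(
    s_curve, x, y,
    p0=(2 * y.max(), 0.01, 1e-5, x.max()),
    bounds=bounds,
    maxfev=int(1e5),
)
print(dict(zip(["K", "b", "a", "x0"], params)))
print("R2:", r2_score(y, s_curve(x, *params)))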
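panel_level, added above and reused by the new panel-aware pages, collapses a multi-panel frame to one row per date by summing the numeric columns; non-numeric columns such as Panel drop out automatically, which is why the "Aggregated" branch can pass the raw frames straight in. A usage sketch on made-up data:

# Sketch: panel_level() as defined in the diff above, on a tiny frame.
import pandas as pd

def panel_level(input_df, date_column="Date"):
    if date_column not in input_df.index.names:
        input_df = input_df.set_index(date_column)
    numeric_columns_df = input_df.select_dtypes(include="number")
    return numeric_columns_df.groupby(input_df.index).sum().reset_index()

df = pd.DataFrame(
    {
        "Date": ["2023-01-02", "2023-01-02", "2023-01-09"],
        "Panel": ["A", "B", "A"],
        "spends": [100.0, 50.0, 80.0],
    }
)
print(panel_level(df))
#          Date  spends
# 0  2023-01-02   150.0
# 1  2023-01-09    80.0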
utilities_with_panel.py
ADDED
@@ -0,0 +1,1018 @@
from numerize.numerize import numerize
import streamlit as st
import pandas as pd
import json
from classes import Channel, Scenario
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from classes import class_to_dict
from collections import OrderedDict
import io
import plotly
from pathlib import Path
import pickle
import streamlit_authenticator as stauth
import yaml
from yaml import SafeLoader
from streamlit.components.v1 import html
import smtplib
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score
from classes import class_from_dict
import os
import base64


color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']

CURRENCY_INDICATOR = '$'


def load_authenticator():
    with open('config.yaml') as file:
        config = yaml.load(file, Loader=SafeLoader)
        st.session_state['config'] = config
    authenticator = stauth.Authenticate(
        config['credentials'],
        config['cookie']['name'],
        config['cookie']['key'],
        config['cookie']['expiry_days'],
        config['preauthorized']
    )
    st.session_state['authenticator'] = authenticator
    return authenticator

def nav_page(page_name, timeout_secs=3):
    nav_script = """
        <script type="text/javascript">
            function attempt_nav_page(page_name, start_time, timeout_secs) {
                var links = window.parent.document.getElementsByTagName("a");
                for (var i = 0; i < links.length; i++) {
                    if (links[i].href.toLowerCase().endsWith("/" + page_name.toLowerCase())) {
                        links[i].click();
                        return;
                    }
                }
                var elapsed = new Date() - start_time;
                if (elapsed < timeout_secs * 1000) {
                    setTimeout(attempt_nav_page, 100, page_name, start_time, timeout_secs);
                } else {
                    alert("Unable to navigate to page '" + page_name + "' after " + timeout_secs + " second(s).");
                }
            }
            window.addEventListener("load", function() {
                attempt_nav_page("%s", new Date(), %d);
            });
        </script>
    """ % (page_name, timeout_secs)
    html(nav_script)

# def load_local_css(file_name):
#     with open(file_name) as f:
#         st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


# def set_header():
#     return st.markdown(f"""<div class='main-header'>
#                 <h1>MMM LiME</h1>
#                 <img src="https://assets-global.website-files.com/64c8fffb0e95cbc525815b79/64df84637f83a891c1473c51_Vector%20(Stroke).svg ">
#         </div>""", unsafe_allow_html=True)

path = os.path.dirname(__file__)

with open(f"{path}/mastercard_logo.png", "rb") as file_:
    contents = file_.read()
data_url = base64.b64encode(contents).decode("utf-8")

DATA_PATH = './data'
IMAGES_PATH = './data/images_224_224'

# New - Sprint 2
if 'bin_dict' not in st.session_state:
    with open("data_import.pkl", "rb") as f:
        data = pickle.load(f)
    st.session_state['bin_dict'] = data["bin_dict"]

# Set the panel column: sanitize the first "Panel Level 1" column name so it
# matches the cleaned column names used in the modelling data.
panel_col = [
    col.lower().replace('.', '_').replace('@', '_').replace(" ", "_")
       .replace('-', '').replace(':', '').replace("__", "_")
    for col in st.session_state['bin_dict']['Panel Level 1']
][0]

is_panel = len(panel_col) > 0

date_col = 'Date'
# is_panel = False  # flag; if set to True, build panel-level response curves

def load_local_css(file_name):
    with open(file_name) as f:
        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)


# def set_header():
#     return st.markdown(f"""<div class='main-header'>
#                     <h1>H & M Recommendations</h1>
#                     <img src="data:image;base64,{data_url}", alt="Logo">
#             </div>""", unsafe_allow_html=True)

path1 = os.path.dirname(__file__)

with open(f"{path}/willbank.png", "rb") as file_1:
    contents1 = file_1.read()
data_url1 = base64.b64encode(contents1).decode("utf-8")

DATA_PATH1 = './data'
IMAGES_PATH1 = './data/images_224_224'

def set_header():
    return st.markdown(f"""<div class='main-header'>
            <!-- <h1></h1> -->
            <div>
                <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
            </div>
            <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
    </div>""", unsafe_allow_html=True)

# def set_header():
#     logo_path = "./path/to/your/local/LIME_logo.png"  # Replace with the actual file path
#     text = "LiME"
#     return st.markdown(f"""<div class='main-header'>
#             <img src="data:image/png;base64,{data_url}" alt="Logo" style="float: left; margin-right: 10px; width: 100px; height: auto;">
#             <h1>{text}</h1>
#         </div>""", unsafe_allow_html=True)

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))
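
# Illustrative sketch (not part of the original upload): how the s_curve above is
# fit to a spend/response series with scipy's curve_fit, mirroring the call made
# in initialize_data below. The synthetic data and function name are hypothetical.
def _example_s_curve_fit():
    rng = np.random.default_rng(0)
    x = np.linspace(0, 10, 50)
    # true parameters K=100, b=5, a=0.9, x0=4, plus a little noise
    y = s_curve(x, 100, 5, 0.9, 4) + rng.normal(0, 1, x.size)
    params, _ = curve_fit(s_curve, x, y,
                          p0=(2 * y.max(), 0.01, 1e-5, x.max()),
                          bounds=((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max())),
                          maxfev=int(1e5))
    return dict(zip(['K', 'b', 'a', 'x0'], params))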


def overview_test_data_prep_panel(X, df, spends_X, date_col, panel_col, target_col):
    '''
    Create the data consumed by the initialize_data function.
    X        : X test with contributions
    df       : originally uploaded (media) data containing the raw variables
    spends_X : spends for the dates in X test
    '''
    # define channels
    channels = {'paid_search': ['paid_search_impressions', 'paid_search_clicks'],
                'fb_level_achieved_tier_1': ['fb_level_achieved_tier_1_impressions'],  # , 'fb:_level_achieved_-_tier_1_clicks'],
                'fb_level_achieved_tier_2': ['fb:_level_achieved_tier_2_impressions',
                                             'fb_level_achieved_tier_2_clicks'],
                'paid_social_others': ['paid_social_others_impressions', 'paid_social_others_clicks'],
                'ga_app': ['ga_app_impressions', 'ga_app_clicks'],
                'digital_tactic_others': ['digital_tactic_others_impressions', 'digital_tactic_others_clicks'],
                'kwai': ['kwai_impressions', 'kwai_clicks'],
                'programmatic': ['programmatic_impressions', 'programmatic_clicks'],
                # 'affiliates': ['affiliates_clicks'],
                # "indicacao": ['indicacao_clicks'],
                # "infleux": ['infleux_clicks'],
                # "influencer": ['influencer_clicks']
                }

    channel_list = list(channels.keys())

    # Map each transformed variable to its raw variable name & channel name,
    # e.g. paid_search_clicks_lag_2 (transformed) --> paid_search_clicks (raw) --> paid_search (channel)
    variables = {}
    channel_and_variables = {}
    new_variables = {}
    new_channels_and_variables = {}

    for transformed_var in [col for col in
                            X.drop(columns=[date_col, panel_col, target_col, 'pred', 'panel_effect']).columns if
                            "_contr" not in col]:
        if len([col for col in df.columns if col in transformed_var]) == 1:
            raw_var = [col for col in df.columns if col in transformed_var][0]
            variables[transformed_var] = raw_var
            channel_and_variables[raw_var] = [channel for channel, raw_vars in channels.items() if raw_var in raw_vars][0]
        else:
            new_variables[transformed_var] = transformed_var
            new_channels_and_variables[transformed_var] = 'base'

    # Raw DF
    raw_X = pd.merge(X[[date_col, panel_col]], df[[date_col, panel_col] + list(variables.values())], how='left',
                     on=[date_col, panel_col])
    assert len(raw_X) == len(X)

    raw_X_cols = []
    for i in raw_X.columns:
        if i in channel_and_variables.keys():
            raw_X_cols.append(channel_and_variables[i])
        else:
            raw_X_cols.append(i)
    raw_X.columns = raw_X_cols

    # Contribution DF
    contr_X = X[[date_col, panel_col, 'panel_effect'] + [col for col in X.columns if
                                                         "_contr" in col and "sum_" not in col]].copy()
    new_variables = [col for col in contr_X.columns if
                     "_flag" in col.lower() or "trend" in col.lower() or "sine" in col.lower()]
    if len(new_variables) > 0:
        contr_X['const'] = contr_X[['panel_effect'] + new_variables].sum(axis=1)
        contr_X.drop(columns=['panel_effect'], inplace=True)
        contr_X.drop(columns=new_variables, inplace=True)
    else:
        contr_X.rename(columns={'panel_effect': 'const'}, inplace=True)

    new_contr_X_cols = []
    for col in contr_X.columns:
        col_clean = col.replace("_contr", "")
        new_contr_X_cols.append(col_clean)
    contr_X.columns = new_contr_X_cols

    contr_X_cols = []
    for i in contr_X.columns:
        if i in variables.keys():
            contr_X_cols.append(channel_and_variables[variables[i]])
        else:
            contr_X_cols.append(i)
    contr_X.columns = contr_X_cols

    # Spends DF
    spends_X.columns = [col.replace("_cost", "") for col in spends_X.columns]

    raw_X.rename(columns={"date": "Date"}, inplace=True)
    contr_X.rename(columns={"date": "Date"}, inplace=True)
    spends_X.rename(columns={'date': 'Week'}, inplace=True)

    # Create excel
    file_name = "data_test_overview_panel_#" + target_col + ".xlsx"
    with pd.ExcelWriter(file_name) as writer:
        raw_X.to_excel(writer, sheet_name="RAW DATA MMM", index=False)
        contr_X.to_excel(writer, sheet_name="CONTRIBUTION MMM", index=False)
        spends_X.to_excel(writer, sheet_name="SPEND INPUT", index=False)
+
|
287 |
+
def overview_test_data_prep_nonpanel(X, df, spends_X, date_col, target_col):
|
288 |
+
'''
|
289 |
+
function to create the data which is used in initialize data fn
|
290 |
+
X : X test with contributions
|
291 |
+
df : originally uploaded data (media data) which has raw vars
|
292 |
+
spends_X : spends of dates in X test
|
293 |
+
'''
|
294 |
+
# define channels
|
295 |
+
channels = {'paid_search': ['paid_search_impressions', 'paid_search_clicks'],
|
296 |
+
|
297 |
+
'fb_level_achieved_tier_1': ['fb_level_achieved_tier_1_impressions', 'fb_level_achieved_tier_1_clicks'],
|
298 |
+
|
299 |
+
'fb_level_achieved_tier_2': ['fb_level_achieved_tier_2_impressions',
|
300 |
+
'fb_level_achieved_tier_2_clicks'],
|
301 |
+
|
302 |
+
'paid_social_others' : ['paid_social_others_impressions', 'paid_social_others_clicks'],
|
303 |
+
|
304 |
+
'ga_app_will_and_cid_pequena_baixo_risco': ['ga_app_will_and_cid_pequena_baixo_risco_impressions', 'ga_app_will_and_cid_pequena_baixo_risco_clicks'],
|
305 |
+
|
306 |
+
'digital_tactic_others': ['digital_tactic_others_impressions', 'digital_tactic_others_clicks'],
|
307 |
+
|
308 |
+
'kwai': ['kwai_impressions', 'kwai_clicks'],
|
309 |
+
|
310 |
+
'programmatic': ['programmatic_impressions', 'programmatic_clicks'],
|
311 |
+
|
312 |
+
'affiliates':['affiliates_clicks', 'affiliates_impressions'],
|
313 |
+
|
314 |
+
"indicacao":['indicacao_clicks', 'indicacao_impressions'],
|
315 |
+
|
316 |
+
"infleux":['infleux_clicks', 'infleux_impressions'],
|
317 |
+
|
318 |
+
"influencer":['influencer_clicks', 'influencer_impressions']
|
319 |
+
}
|
320 |
+
|
321 |
+
channel_list = list(channels.keys())
|
322 |
+
|
323 |
+
# map transformed variable to raw variable name & channel name
|
324 |
+
# mapping eg : paid_search_clicks_lag_2 (transformed var) --> paid_search_clicks (raw var) --> paid_search (channel)
|
325 |
+
variables = {}
|
326 |
+
channel_and_variables = {}
|
327 |
+
new_variables = {}
|
328 |
+
new_channels_and_variables = {}
|
329 |
+
|
330 |
+
cols_to_del = list(set([date_col, target_col, 'pred']).intersection((set(X.columns))))
|
331 |
+
for transformed_var in [col for col in
|
332 |
+
X.drop(columns=cols_to_del).columns if
|
333 |
+
"_contr" not in col]: # also has 'const'
|
334 |
+
if len([col for col in df.columns if col in transformed_var]) == 1: # col is raw var
|
335 |
+
raw_var = [col for col in df.columns if col in transformed_var][0]
|
336 |
+
variables[transformed_var] = raw_var
|
337 |
+
channel_and_variables[raw_var] = [channel for channel, raw_vars in channels.items() if raw_var in raw_vars][0]
|
338 |
+
else: # when no corresponding raw var then base
|
339 |
+
new_variables[transformed_var] = transformed_var
|
340 |
+
new_channels_and_variables[transformed_var] = 'base'
|
341 |
+
|
342 |
+
# Raw DF
|
343 |
+
raw_X = pd.merge(X[[date_col]], df[[date_col] + list(variables.values())], how='left',
|
344 |
+
on=[date_col])
|
345 |
+
assert len(raw_X) == len(X)
|
346 |
+
|
347 |
+
raw_X_cols = []
|
348 |
+
for i in raw_X.columns:
|
349 |
+
if i in channel_and_variables.keys():
|
350 |
+
raw_X_cols.append(channel_and_variables[i])
|
351 |
+
else:
|
352 |
+
raw_X_cols.append(i)
|
353 |
+
raw_X.columns = raw_X_cols
|
354 |
+
|
355 |
+
# Contribution DF
|
356 |
+
contr_X = X[[date_col] + [col for col in X.columns if "_contr" in col and "sum_" not in col]].copy()
|
357 |
+
# st.write(contr_X.columns)
|
358 |
+
new_variables = [col for col in contr_X.columns if
|
359 |
+
"_flag" in col.lower() or "trend" in col.lower() or "sine" in col.lower()]
|
360 |
+
if len(new_variables) > 0: # if new vars are available, their contributions should be added to base (called const)
|
361 |
+
contr_X['const_contr'] = contr_X[['const_contr'] + new_variables].sum(axis=1)
|
362 |
+
contr_X.drop(columns=new_variables, inplace=True)
|
363 |
+
|
364 |
+
|
365 |
+
new_contr_X_cols = []
|
366 |
+
for col in contr_X.columns:
|
367 |
+
col_clean = col.replace("_contr", "")
|
368 |
+
new_contr_X_cols.append(col_clean)
|
369 |
+
contr_X.columns = new_contr_X_cols
|
370 |
+
|
371 |
+
contr_X_cols = []
|
372 |
+
for i in contr_X.columns:
|
373 |
+
if i in variables.keys():
|
374 |
+
contr_X_cols.append(channel_and_variables[variables[i]])
|
375 |
+
else:
|
376 |
+
contr_X_cols.append(i)
|
377 |
+
contr_X.columns = contr_X_cols
|
378 |
+
|
379 |
+
# Spends DF
|
380 |
+
spends_X.columns = [col.replace("_cost", "").replace("_spends", '').replace("_spend", "") for col in spends_X.columns]
|
381 |
+
|
382 |
+
raw_X.rename(columns={"date": "Date"}, inplace=True)
|
383 |
+
contr_X.rename(columns={"date": "Date"}, inplace=True)
|
384 |
+
spends_X.rename(columns={'date': 'Week'}, inplace=True)
|
385 |
+
|
386 |
+
# Create excel
|
387 |
+
file_name = "data_test_overview_panel_#" + target_col + ".xlsx"
|
388 |
+
with pd.ExcelWriter(file_name) as writer:
|
389 |
+
raw_X.to_excel(writer, sheet_name="RAW DATA MMM", index=False)
|
390 |
+
contr_X.to_excel(writer, sheet_name="CONTRIBUTION MMM", index=False)
|
391 |
+
spends_X.to_excel(writer, sheet_name="SPEND INPUT", index=False)
|
392 |
+
|
393 |
+
|

def initialize_data(target_col):
    # uopx_conv_rates = {'streaming_impressions': 0.007, 'digital_impressions': 0.007, 'search_clicks': 0.00719, 'tv_impressions': 0.000173,
    #                    "digital_clicks": 0.005, "streaming_clicks": 0.004, 'streaming_spends': 1, "tv_spends": 1, "search_spends": 1,
    #                    "digital_spends": 1}
    # print('State initialized')
    # excel = pd.read_excel("data_test_overview_panel.xlsx", sheet_name=None)
    excel = pd.read_excel("data_test_overview_panel_#" + target_col + ".xlsx", sheet_name=None)

    raw_df = excel['RAW DATA MMM']
    spend_df = excel['SPEND INPUT']
    contri_df = excel['CONTRIBUTION MMM']
    # Revenue_df = excel['Revenue']

    ## remove seasonalities, indices etc ...
    exclude_columns = ['Date', 'Week',
                       'Region',
                       'Controls_Grammarly_Index_SeasonalAVG',
                       'Controls_Quillbot_Index',
                       'Daily_Positive_Outliers',
                       'External_RemoteClass_Index',
                       'Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802',
                       'Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206',
                       'Intervals ON 20201005-20201019',
                       'Promotion_PercentOff',
                       'Promotion_TimeBased',
                       'Seasonality_Indicator_Chirstmas',
                       'Seasonality_Indicator_NewYears_Days',
                       'Seasonality_Indicator_Thanksgiving',
                       'Trend 20200302 / 20200803',
                       date_col, panel_col
                       ]

    # Aggregate all 3 dfs to date level (from date-panel level)
    raw_df[date_col] = pd.to_datetime(raw_df[date_col])
    raw_df_aggregations = {c: 'sum' for c in raw_df.columns if c not in exclude_columns}
    raw_df = raw_df.groupby(date_col).agg(raw_df_aggregations).reset_index()

    contri_df[date_col] = pd.to_datetime(contri_df[date_col])
    contri_df_aggregations = {c: 'sum' for c in contri_df.columns if c not in exclude_columns}
    contri_df = contri_df.groupby(date_col).agg(contri_df_aggregations).reset_index()

    input_df = raw_df.sort_values(by=[date_col])
    output_df = contri_df.sort_values(by=[date_col])

    spend_df['Week'] = pd.to_datetime(spend_df['Week'], format='%Y-%m-%d', errors='coerce')
    spend_df_aggregations = {c: 'sum' for c in spend_df.columns if c not in exclude_columns}
    spend_df = spend_df.groupby('Week').agg(spend_df_aggregations).reset_index()
    # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
    # spend_df = spend_df.sort_values(by='Week')

    channel_list = [col for col in input_df.columns if col not in exclude_columns]

    response_curves = {}
    mapes = {}
    rmses = {}
    upper_limits = {}
    powers = {}
    r2 = {}
    conv_rates = {}
    output_cols = []
    channels = {}
    sales = None
    dates = input_df.Date.values
    actual_output_dic = {}
    actual_input_dic = {}

    # ONLY FOR TESTING
    # channel_list = ['programmatic']
    infeasible_channels = [c for c in contri_df.select_dtypes(include=['float', 'int']).columns if contri_df[c].sum() <= 0]
    # st.write(infeasible_channels)
    channel_list = list(set(channel_list) - set(infeasible_channels))

    for inp_col in channel_list:
        st.write(inp_col)

        # # New - Sprint 2
        # if is_panel:
        #     input_df1 = input_df.groupby([date_col]).agg({inp_col: 'sum'}).reset_index()  # aggregate spends on date
        #     spends = input_df1[inp_col].values
        # else:
        #     spends = input_df[inp_col].values
        spends = spend_df[inp_col].values

        x = spends.copy()
        # upper limit for penalty
        upper_limits[inp_col] = 2 * x.max()

        # contribution
        # New - Sprint 2
        out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
        if is_panel:
            output_df1 = output_df.groupby([date_col]).agg({out_col: 'sum'}).reset_index()
            y = output_df1[out_col].values.copy()
        else:
            y = output_df[out_col].values.copy()

        actual_output_dic[inp_col] = y.copy()
        actual_input_dic[inp_col] = x.copy()
        ## output cols aggregation
        output_cols.append(out_col)

        ## scale the input
        power = (np.ceil(np.log(x.max()) / np.log(10)) - 3)
        if power >= 0:
            x = x / 10 ** power
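        # Scaling example (added note): for weekly spends peaking at 5,000,000,
        # power = ceil(log10(5e6)) - 3 = 7 - 3 = 4, so x is divided by 10**4 and
        # the fit sees values of at most ~500; 'powers' stores the factor so the
        # curves can later be mapped back to the original spend scale.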

        x = x.astype('float64')
        y = y.astype('float64')

        # st.write(y.max(), x.max())
        print(y.max(), x.max())
        if y.max() <= 0.01:
            if x.max() <= 0.01:
                st.write("here-here")
                bounds = ((0, 0, 0, 0), (3 * 0.01, 1000, 1, 0.01))
            else:
                st.write("here")
                bounds = ((0, 0, 0, 0), (3 * 0.01, 1000, 1, 0.01))
        else:
            bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
        # bounds = ((y.max(), 3 * y.max()), (0, 1000), (0, 1), (0, x.max()))
        params, _ = curve_fit(s_curve, x, y, p0=(2 * y.max(), 0.01, 1e-5, x.max()),
                              bounds=bounds,
                              maxfev=int(1e5))
        mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
        rmse = np.sqrt(((y - s_curve(x, *params)) ** 2).mean())
        r2_ = r2_score(y, s_curve(x, *params))
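        # Fit diagnostics (added note) are computed on the same (scaled) training
        # points: MAPE against y clipped to a floor of 1 to avoid division by
        # zero, RMSE of the residuals, and R^2 of the fitted s-curve.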
        response_curves[inp_col] = {'K': params[0], 'b': params[1], 'a': params[2], 'x0': params[3]}
        mapes[inp_col] = mape
        rmses[inp_col] = rmse
        r2[inp_col] = r2_
        powers[inp_col] = power

        ## conversion rates
        spend_col = [_col for _col in spend_df.columns if _col.startswith(inp_col.rsplit('_', 1)[0])][0]

        conv = (spend_df.set_index('Week')[spend_col] / input_df.set_index('Date')[inp_col].clip(lower=1)).reset_index()
        conv.rename(columns={'index': 'Week'}, inplace=True)
        conv['year'] = conv.Week.dt.year
        conv_rates[inp_col] = list(conv.drop('Week', axis=1).mean().to_dict().values())[0]
        ## print('Before', conv_rates[inp_col])
        # conv_rates[inp_col] = uopx_conv_rates[inp_col]
        ## print('After', conv_rates[inp_col])

        channel = Channel(name=inp_col, dates=dates,
                          spends=spends,
                          # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
                          conversion_rate=conv_rates[inp_col],
                          response_curve_type='s-curve',
                          response_curve_params={'K': params[0], 'b': params[1], 'a': params[2], 'x0': params[3]},
                          bounds=np.array([-10, 10]))
        channels[inp_col] = channel
        if sales is None:
            sales = channel.actual_sales
        else:
            sales += channel.actual_sales
        # st.write(inp_col, channel.actual_sales)

    # st.write(output_cols)
    other_contributions = output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only=True).values
    correction = output_df.drop(['Date'], axis=1).sum(axis=1).values - (sales + other_contributions)

    scenario_test_df = pd.DataFrame(columns=['other_contributions', 'correction', 'sales'])
    scenario_test_df['other_contributions'] = other_contributions
    scenario_test_df['correction'] = correction
    scenario_test_df['sales'] = sales
    scenario_test_df.to_csv("test/scenario_test_df.csv", index=False)
    output_df.to_csv("test/output_df.csv", index=False)

    scenario = Scenario(name='default', channels=channels, constant=other_contributions, correction=correction)
    ## setting session variables
    st.session_state['initialized'] = True
    st.session_state['actual_df'] = input_df
    st.session_state['raw_df'] = raw_df
    st.session_state['contri_df'] = output_df
    default_scenario_dict = class_to_dict(scenario)
    st.session_state['default_scenario_dict'] = default_scenario_dict
    st.session_state['scenario'] = scenario
    st.session_state['channels_list'] = channel_list
    st.session_state['optimization_channels'] = {channel_name: False for channel_name in channel_list}
    st.session_state['rcs'] = response_curves
    st.session_state['powers'] = powers
    st.session_state['actual_contribution_df'] = pd.DataFrame(actual_output_dic)
    st.session_state['actual_input_df'] = pd.DataFrame(actual_input_dic)

    for channel in channels.values():
        st.session_state[channel.name] = numerize(channel.actual_total_spends * channel.conversion_rate, 1)

    st.session_state['xlsx_buffer'] = io.BytesIO()

    if Path('../saved_scenarios.pkl').exists():
        with open('../saved_scenarios.pkl', 'rb') as f:
            st.session_state['saved_scenarios'] = pickle.load(f)
    else:
        st.session_state['saved_scenarios'] = OrderedDict()

    st.session_state['total_spends_change'] = 0
    st.session_state['optimization_channels'] = {channel_name: False for channel_name in channel_list}
    st.session_state['disable_download_button'] = True

# def initialize_data():
#     # fetch data from excel
#     output = pd.read_excel('data.xlsx', sheet_name=None)
#     raw_df = output['RAW DATA MMM']
#     contribution_df = output['CONTRIBUTION MMM']
#     Revenue_df = output['Revenue']
#
#     ## channels to be shown
#     channel_list = []
#     for col in raw_df.columns:
#         if 'click' in col.lower() or 'spend' in col.lower() or 'imp' in col.lower():
#             ## print(col)
#             channel_list.append(col)
#         else:
#             pass
#
#     ## NOTE : Considered only Desktop spends for all calculations
#     acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
#     ## NOTE : Considered one year of data
#     acutal_df = acutal_df[acutal_df.Date > '2020-12-31']
#     actual_df = acutal_df.drop('Region', axis=1).sort_values(by='Date')[[*channel_list, 'Date']]
#
#     ## load response curves
#     with open('./grammarly_response_curves.json', 'r') as f:
#         response_curves = json.load(f)
#
#     ## create channel dict for scenario creation
#     dates = actual_df.Date.values
#     channels = {}
#     rcs = {}
#     constant = 0.
#     for i, info_dict in enumerate(response_curves):
#         name = info_dict.get('name')
#         response_curve_type = info_dict.get('response_curve')
#         response_curve_params = info_dict.get('params')
#         rcs[name] = response_curve_params
#         if name != 'constant':
#             spends = actual_df[name].values
#             channel = Channel(name=name, dates=dates,
#                               spends=spends,
#                               response_curve_type=response_curve_type,
#                               response_curve_params=response_curve_params,
#                               bounds=np.array([-30, 30]))
#             channels[name] = channel
#         else:
#             constant = info_dict.get('value', 0.) * len(dates)
#
#     ## create scenario
#     scenario = Scenario(name='default', channels=channels, constant=constant)
#     default_scenario_dict = class_to_dict(scenario)
#
#     ## setting session variables
#     st.session_state['initialized'] = True
#     st.session_state['actual_df'] = actual_df
#     st.session_state['raw_df'] = raw_df
#     st.session_state['default_scenario_dict'] = default_scenario_dict
#     st.session_state['scenario'] = scenario
#     st.session_state['channels_list'] = channel_list
#     st.session_state['optimization_channels'] = {channel_name: False for channel_name in channel_list}
#     st.session_state['rcs'] = rcs
#     for channel in channels.values():
#         if channel.name not in st.session_state:
#             st.session_state[channel.name] = float(channel.actual_total_spends)
#
#     if 'xlsx_buffer' not in st.session_state:
#         st.session_state['xlsx_buffer'] = io.BytesIO()
#
#     ## for saving scenarios
#     if 'saved_scenarios' not in st.session_state:
#         if Path('../saved_scenarios.pkl').exists():
#             with open('../saved_scenarios.pkl', 'rb') as f:
#                 st.session_state['saved_scenarios'] = pickle.load(f)
#         else:
#             st.session_state['saved_scenarios'] = OrderedDict()
#
#     if 'total_spends_change' not in st.session_state:
#         st.session_state['total_spends_change'] = 0
#
#     if 'optimization_channels' not in st.session_state:
#         st.session_state['optimization_channels'] = {channel_name: False for channel_name in channel_list}
#
#     if 'disable_download_button' not in st.session_state:
#         st.session_state['disable_download_button'] = True


def create_channel_summary(scenario):
    summary_columns = []
    actual_spends_rows = []
    actual_sales_rows = []
    actual_roi_rows = []

    for channel in scenario.channels.values():
        name_mod = channel.name.replace('_', ' ')
        if name_mod.lower().endswith(' imp'):
            name_mod = name_mod.replace('Imp', ' Impressions')

        print(name_mod, channel.actual_total_spends, channel.conversion_rate,
              channel.actual_total_spends * channel.conversion_rate)

        summary_columns.append(name_mod)
        actual_spends_rows.append(format_numbers(float(channel.actual_total_spends * channel.conversion_rate)))
        actual_sales_rows.append(format_numbers(float(channel.actual_total_sales)))
        actual_roi_rows.append(decimal_formater(
            format_numbers(channel.actual_total_sales / (channel.actual_total_spends * channel.conversion_rate),
                           include_indicator=False, n_decimals=4), n_decimals=4))

    actual_summary_df = pd.DataFrame([summary_columns, actual_spends_rows, actual_sales_rows, actual_roi_rows]).T
    actual_summary_df.columns = ['Channel', 'Spends', 'Prospects', 'ROI']
    actual_summary_df['Prospects'] = actual_summary_df['Prospects'].map(lambda x: str(x)[1:])
    return actual_summary_df


# def create_channel_summary(scenario):
#
#     # Provided data
#     data = {
#         'Channel': ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer'],
#         'Spends': ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K'],
#         'Revenue': ['558.0K', '3.5M', '5.2M', '3.1M', '3.1M', '2.1M', '20.8M', '1.6M', '728.4K', '22.9M', '4.8M']
#     }
#
#     # Create DataFrame
#     df = pd.DataFrame(data)
#
#     # Convert currency strings to numeric values
#     df['Spends'] = df['Spends'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
#     df['Revenue'] = df['Revenue'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
#
#     # Calculate ROI
#     df['ROI'] = ((df['Revenue'] - df['Spends']) / df['Spends'])
#
#     # Format columns
#     format_currency = lambda x: f"${x:,.1f}"
#     format_roi = lambda x: f"{x:.1f}"
#
#     df['Spends'] = ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K']
#     df['Revenue'] = ['$ 536.3K', '$ 3.4M', '$ 5M', '$ 3M', '$ 3M', '$ 2M', '$ 20M', '$ 1.5M', '$ 7.1M', '$ 22M', '$ 4.6M']
#     df['ROI'] = df['ROI'].apply(format_roi)
#
#     return df

@st.cache(allow_output_mutation=True)
def create_contribution_pie(scenario):
    # c1f7dc
    colors_map = {col: color for col, color in zip(st.session_state['channels_list'], plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'), 23))}
    total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
    total_contribution_fig.add_trace(
        go.Pie(labels=[channel_name_formating(channel_name) for channel_name in st.session_state['channels_list']] + ['Non Media'],
               values=[round(scenario.channels[channel_name].actual_total_spends * scenario.channels[channel_name].conversion_rate, 1) for channel_name in st.session_state['channels_list']] + [0],
               marker=dict(colors=[plotly.colors.label_rgb(colors_map[channel_name]) for channel_name in st.session_state['channels_list']] + ['#F0F0F0']),
               hole=0.3),
        row=1, col=1)

    total_contribution_fig.add_trace(
        go.Pie(labels=[channel_name_formating(channel_name) for channel_name in st.session_state['channels_list']] + ['Non Media'],
               values=[scenario.channels[channel_name].actual_total_sales for channel_name in st.session_state['channels_list']] + [scenario.correction.sum() + scenario.constant.sum()],
               hole=0.3),
        row=1, col=2)

    total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
    total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
    return total_contribution_fig

# The decorator below applies to create_contribuion_stacked_plot, defined after
# the commented-out blocks (comments between a decorator and its def are legal).
@st.cache(allow_output_mutation=True)
# def create_contribuion_stacked_plot(scenario):
#     weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
#     raw_df = st.session_state['raw_df']
#     df = raw_df.sort_values(by='Date')
#     x = df.Date
#     weekly_spends_data = []
#     weekly_sales_data = []
#     for channel_name in st.session_state['channels_list']:
#         weekly_spends_data.append(go.Bar(x=x,
#                                          y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
#                                          name=channel_name_formating(channel_name),
#                                          hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
#                                          legendgroup=channel_name))
#         weekly_sales_data.append(go.Bar(x=x,
#                                         y=scenario.channels[channel_name].actual_sales,
#                                         name=channel_name_formating(channel_name),
#                                         hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
#                                         legendgroup=channel_name, showlegend=False))
#     for _d in weekly_spends_data:
#         weekly_contribution_fig.add_trace(_d, row=1, col=1)
#     for _d in weekly_sales_data:
#         weekly_contribution_fig.add_trace(_d, row=1, col=2)
#     weekly_contribution_fig.add_trace(go.Bar(x=x,
#                                              y=scenario.constant + scenario.correction,
#                                              name='Non Media',
#                                              hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
#     weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
#     weekly_contribution_fig.update_xaxes(showgrid=False)
#     weekly_contribution_fig.update_yaxes(showgrid=False)
#     return weekly_contribution_fig

# @st.cache(allow_output_mutation=True)
# def create_channel_spends_sales_plot(channel):
#     if channel is not None:
#         x = channel.dates
#         _spends = channel.actual_spends * channel.conversion_rate
#         _sales = channel.actual_sales
#         channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
#         channel_sales_spends_fig.add_trace(go.Bar(x=x, y=_sales, marker_color='#c1f7dc', name='Revenue', hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), secondary_y=False)
#         channel_sales_spends_fig.add_trace(go.Scatter(x=x, y=_spends, line=dict(color='#005b96'), name='Spends', hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}"), secondary_y=True)
#         channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week wise')
#         channel_sales_spends_fig.update_xaxes(showgrid=False)
#         channel_sales_spends_fig.update_yaxes(showgrid=False)
#     else:
#         raw_df = st.session_state['raw_df']
#         df = raw_df.sort_values(by='Date')
#         x = df.Date
#         scenario = class_from_dict(st.session_state['default_scenario_dict'])
#         _sales = scenario.constant + scenario.correction
#         channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
#         channel_sales_spends_fig.add_trace(go.Bar(x=x, y=_sales, marker_color='#c1f7dc', name='Revenue', hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), secondary_y=False)
#         # channel_sales_spends_fig.add_trace(go.Scatter(x=x, y=_spends, line=dict(color='#15C39A'), name='Spends', hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}"), secondary_y=True)
#         channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week wise')
#         channel_sales_spends_fig.update_xaxes(showgrid=False)
#         channel_sales_spends_fig.update_yaxes(showgrid=False)
#     return channel_sales_spends_fig


# Define a shared color palette

# def create_contribution_pie():
#     color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
#     total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
#
#     channels_list = ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer', 'Non Media']
#
#     # Assign colors from the limited palette to channels
#     colors_map = {col: color_palette[i % len(color_palette)] for i, col in enumerate(channels_list)}
#     colors_map['Non Media'] = color_palette[5]  # Assign a fixed color for 'Non Media'
#
#     # Hardcoded values for Spends and Revenue
#     spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
#     revenue_values = [1, 4, 5, 3, 3, 2, 50.8, 1.5, 0.7, 13, 0, 16]
#
#     # Add trace for Spends pie chart
#     total_contribution_fig.add_trace(
#         go.Pie(
#             labels=[channel_name for channel_name in channels_list],
#             values=spends_values,
#             marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
#             hole=0.3
#         ),
#         row=1, col=1
#     )
#
#     # Add trace for Revenue pie chart
#     total_contribution_fig.add_trace(
#         go.Pie(
#             labels=[channel_name for channel_name in channels_list],
#             values=revenue_values,
#             marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
#             hole=0.3
#         ),
#         row=1, col=2
#     )
#
#     total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
#     total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
#     return total_contribution_fig

def create_contribuion_stacked_plot(scenario):
|
889 |
+
weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
|
890 |
+
raw_df = st.session_state['raw_df']
|
891 |
+
df = raw_df.sort_values(by='Date')
|
892 |
+
x = df.Date
|
893 |
+
weekly_spends_data = []
|
894 |
+
weekly_sales_data = []
|
895 |
+
|
896 |
+
for i, channel_name in enumerate(st.session_state['channels_list']):
|
897 |
+
color = color_palette[i % len(color_palette)]
|
898 |
+
|
899 |
+
weekly_spends_data.append(go.Bar(
|
900 |
+
x=x,
|
901 |
+
y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
|
902 |
+
name=channel_name_formating(channel_name),
|
903 |
+
hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
|
904 |
+
legendgroup=channel_name,
|
905 |
+
marker_color=color,
|
906 |
+
))
|
907 |
+
|
908 |
+
weekly_sales_data.append(go.Bar(
|
909 |
+
x=x,
|
910 |
+
y=scenario.channels[channel_name].actual_sales,
|
911 |
+
name=channel_name_formating(channel_name),
|
912 |
+
hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
|
913 |
+
legendgroup=channel_name,
|
914 |
+
showlegend=False,
|
915 |
+
marker_color=color,
|
916 |
+
))
|
917 |
+
|
918 |
+
for _d in weekly_spends_data:
|
919 |
+
weekly_contribution_fig.add_trace(_d, row=1, col=1)
|
920 |
+
for _d in weekly_sales_data:
|
921 |
+
weekly_contribution_fig.add_trace(_d, row=1, col=2)
|
922 |
+
|
923 |
+
weekly_contribution_fig.add_trace(go.Bar(
|
924 |
+
x=x,
|
925 |
+
y=scenario.constant + scenario.correction,
|
926 |
+
name='Non Media',
|
927 |
+
hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
|
928 |
+
marker_color=color_palette[-1],
|
929 |
+
), row=1, col=2)
|
930 |
+
|
931 |
+
weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
|
932 |
+
weekly_contribution_fig.update_xaxes(showgrid=False)
|
933 |
+
weekly_contribution_fig.update_yaxes(showgrid=False)
|
934 |
+
return weekly_contribution_fig
|
935 |
+
|

def create_channel_spends_sales_plot(channel):
    if channel is not None:
        x = channel.dates
        _spends = channel.actual_spends * channel.conversion_rate
        _sales = channel.actual_sales
        channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
        channel_sales_spends_fig.add_trace(go.Bar(
            x=x,
            y=_sales,
            marker_color=color_palette[3],  # color taken from the shared palette
            name='Revenue',
            hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
        ), secondary_y=False)

        channel_sales_spends_fig.add_trace(go.Scatter(
            x=x,
            y=_spends,
            line=dict(color=color_palette[2]),  # a second palette color for the spend line
            name='Spends',
            hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
        ), secondary_y=True)

        channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
        channel_sales_spends_fig.update_xaxes(showgrid=False)
        channel_sales_spends_fig.update_yaxes(showgrid=False)
    else:
        raw_df = st.session_state['raw_df']
        df = raw_df.sort_values(by='Date')
        x = df.Date
        scenario = class_from_dict(st.session_state['default_scenario_dict'])
        _sales = scenario.constant + scenario.correction
        channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
        channel_sales_spends_fig.add_trace(go.Bar(
            x=x,
            y=_sales,
            marker_color=color_palette[0],
            name='Revenue',
            hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
        ), secondary_y=False)

        channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
        channel_sales_spends_fig.update_xaxes(showgrid=False)
        channel_sales_spends_fig.update_yaxes(showgrid=False)

    return channel_sales_spends_fig

def format_numbers(value, n_decimals=1, include_indicator=True):
    if include_indicator:
        return f'{CURRENCY_INDICATOR} {numerize(value, n_decimals)}'
    else:
        return f'{numerize(value, n_decimals)}'
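# Formatting examples (assuming numerize's usual abbreviations):
#   format_numbers(1234567)                           ->  '$ 1.2M'
#   format_numbers(1234567, include_indicator=False)  ->  '1.2M'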


def decimal_formater(num_string, n_decimals=1):
    parts = num_string.split('.')
    if len(parts) == 1:
        return num_string + '.' + '0' * n_decimals
    else:
        to_be_padded = n_decimals - len(parts[-1])
        if to_be_padded > 0:
            return num_string + '0' * to_be_padded
        else:
            return num_string
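# Padding examples (behaviour of the helper above):
#   decimal_formater('12', n_decimals=2)     ->  '12.00'
#   decimal_formater('12.3', n_decimals=2)   ->  '12.30'
#   decimal_formater('12.345', n_decimals=2) ->  '12.345'  (never truncates)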


def channel_name_formating(channel_name):
    name_mod = channel_name.replace('_', ' ')
    if name_mod.lower().endswith(' imp'):
        name_mod = name_mod.replace('Imp', 'Spend')
    elif name_mod.lower().endswith(' clicks'):
        name_mod = name_mod.replace('Clicks', 'Spend')
    return name_mod


def send_email(email, message):
    s = smtplib.SMTP('smtp.gmail.com', 587)
    s.starttls()
    s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
    s.sendmail("geethu4444@gmail.com", email, message)
    s.quit()


if __name__ == "__main__":
    # NOTE: initialize_data() in this module takes a target_col argument, so
    # running the file directly fails unless one is supplied.
    initialize_data()