BlendMMM committed on
Commit 8f35613
1 Parent(s): 2c56ebc

Upload 78 files

Files changed (46)
  1. .gitattributes +4 -5
  2. Data_Import (1).py +995 -0
  3. Data_Import .py +1019 -0
  4. Data_prep_functions.py +72 -59
  5. Model/model_0.pkl +3 -0
  6. Model/model_1.pkl +3 -0
  7. Model/model_2.pkl +3 -0
  8. Model/model_3.pkl +3 -0
  9. Model/model_4.pkl +3 -0
  10. Overview_data_test_panel@#app_installs.xlsx +0 -0
  11. Overview_data_test_panel@#revenue.xlsx +0 -0
  12. Overview_data_test_panelreplace_meapp_installs.xlsx +0 -0
  13. README.md +1 -1
  14. Test/merged_df_contri.csv +0 -0
  15. Test/output_df.csv +16 -0
  16. Test/scenario_test_df.csv +16 -0
  17. Test/x_test_contribution.csv +0 -0
  18. Test/x_test_to_save.csv +0 -0
  19. Test/x_train_contribution.csv +0 -0
  20. Test/x_train_to_save.csv +0 -0
  21. best_models.pkl +2 -2
  22. classes.py +130 -106
  23. data_import.pkl +3 -0
  24. data_test_overview_panel_#total_approved_accounts_revenue.xlsx +3 -0
  25. final_df_transformed.pkl +3 -0
  26. metrics_level_data/Overview_data_test_panel@#app_installs.xlsx +0 -0
  27. metrics_level_data/Overview_data_test_panel@#revenue.xlsx +0 -0
  28. model_output.csv +6 -11
  29. pages/10_Optimized_Result_Analysis.py +23 -77
  30. pages/1_Data_Validation.py +158 -188
  31. pages/2_Transformations.py +522 -0
  32. pages/4_Model_Build.py +826 -0
  33. pages/4_Saved_Model_Results.py +461 -267
  34. pages/5_Model_Tuning_with_panel.py +527 -0
  35. pages/6_Model_Result_Overview.py +348 -0
  36. pages/7_Build_Response_Curves.py +185 -0
  37. pages/8_Scenario_Planner.py +458 -167
  38. requirements.txt +94 -102
  39. summary_df.pkl +1 -1
  40. tuned_model.pkl +3 -0
  41. upf_data_converted_old.csv +0 -0
  42. upf_data_converted_old.xlsx +3 -0
  43. upf_data_converted_randomized_resp_metrics.csv +0 -0
  44. upf_data_converted_randomized_resp_metrics.xlsx +3 -0
  45. utilities.py +534 -263
  46. utilities_with_panel.py +1018 -0
.gitattributes CHANGED
@@ -33,9 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
- E0DAF720 filter=lfs diff=lfs merge=lfs -text
- Profile_Report.html filter=lfs diff=lfs merge=lfs -text
- raw_data_nov7_combined.xlsx filter=lfs diff=lfs merge=lfs -text
- raw_data_nov7_combined1.xlsx filter=lfs diff=lfs merge=lfs -text
- upf_data_converted.xlsx filter=lfs diff=lfs merge=lfs -text
+ data_test_overview_panel_\#total_approved_accounts_revenue.xlsx filter=lfs diff=lfs merge=lfs -text
  Pickle_files/main_df filter=lfs diff=lfs merge=lfs -text
+ upf_data_converted_old.xlsx filter=lfs diff=lfs merge=lfs -text
+ upf_data_converted_randomized_resp_metrics.xlsx filter=lfs diff=lfs merge=lfs -text
+ upf_data_converted.xlsx filter=lfs diff=lfs merge=lfs -text
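The added lines follow the standard Git LFS attribute pattern: each tracked path or glob gets a line of the form "PATH filter=lfs diff=lfs merge=lfs -text". For illustration only, a hypothetical new workbook would be tracked by appending a line such as "another_workbook.xlsx filter=lfs diff=lfs merge=lfs -text" to .gitattributes; no such line is part of this commit.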
Data_Import (1).py ADDED
@@ -0,0 +1,995 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Data Import",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import pickle
12
+ import pandas as pd
13
+ from utilities import set_header, load_local_css, authentication
14
+
15
+ load_local_css("styles.css")
16
+ set_header()
17
+
18
+
19
+ # Check for authentication status
20
+ authenticator, name, authentication_status, username = authentication()
21
+ if authentication_status != True:
22
+ st.stop()
23
+ else:
24
+ authenticator.logout("Logout", "main")
25
+
26
+
27
+ # Function to validate date column in dataframe
28
+ def validate_date_column(df):
29
+ try:
30
+ # Attempt to convert the 'Date' column to datetime
31
+ df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
32
+ return True
33
+ except:
34
+ return False
35
+
36
+
37
+ # Function to determine data interval
38
+ def determine_data_interval(common_freq):
39
+ if common_freq == 1:
40
+ return "daily"
41
+ elif common_freq == 7:
42
+ return "weekly"
43
+ elif 28 <= common_freq <= 31:
44
+ return "monthly"
45
+ else:
46
+ return "irregular"
47
+
48
+
49
+ # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
+ @st.cache_resource(show_spinner=False)
+ def files_to_dataframes(uploaded_files):
54
+ df_dict = {}
55
+ for uploaded_file in uploaded_files:
56
+ # Extract file name without extension
57
+ file_name = uploaded_file.name.rsplit(".", 1)[0]
58
+
59
+ # Check for duplicate file names
60
+ if file_name in df_dict:
61
+ st.warning(
62
+ f"Duplicate File: {file_name}. This file will be skipped.",
63
+ icon="⚠️",
64
+ )
65
+ continue
66
+
67
+ # Read the file into a DataFrame
68
+ df = pd.read_excel(uploaded_file)
69
+
70
+ # Convert all column names to lowercase
71
+ df.columns = df.columns.str.lower().str.strip()
72
+
73
+ # Separate numeric and non-numeric columns
74
+ numeric_cols = list(df.select_dtypes(include=["number"]).columns)
75
+ non_numeric_cols = [
76
+ col
77
+ for col in df.select_dtypes(exclude=["number"]).columns
78
+ if col.lower() != "date"
79
+ ]
80
+
81
+ # Check for 'Date' column
82
+ if not (validate_date_column(df) and len(numeric_cols) > 0):
83
+ st.warning(
84
+ f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
85
+ icon="⚠️",
86
+ )
87
+ continue
88
+
89
+ # Check for interval
90
+ common_freq = (
91
+ pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
92
+ )
93
+ # Calculate the data interval (daily, weekly, monthly or irregular)
94
+ interval = determine_data_interval(common_freq)
95
+ if interval == "irregular":
96
+ st.warning(
97
+ f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
98
+ icon="⚠️",
99
+ )
100
+ continue
101
+
102
+ # Store both DataFrames in the dictionary under their respective keys
103
+ df_dict[file_name] = {
104
+ "numeric": numeric_cols,
105
+ "non_numeric": non_numeric_cols,
106
+ "interval": interval,
107
+ "df": df,
108
+ }
109
+
110
+ return df_dict
111
+
112
+
113
+ # Function to adjust dataframe granularity
114
+ def adjust_dataframe_granularity(df, current_granularity, target_granularity):
115
+ # Set index
116
+ df.set_index("date", inplace=True)
117
+
118
+ # Define aggregation rules for resampling
119
+ aggregation_rules = {
120
+ col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
121
+ for col in df.columns
122
+ }
123
+
124
+ # Initialize resampled_df
125
+ resampled_df = df
126
+ if current_granularity == "daily" and target_granularity == "weekly":
127
+ resampled_df = df.resample("W-MON", closed="left", label="left").agg(
128
+ aggregation_rules
129
+ )
130
+
131
+ elif current_granularity == "daily" and target_granularity == "monthly":
132
+ resampled_df = df.resample("MS", closed="left", label="left").agg(
133
+ aggregation_rules
134
+ )
135
+
136
+ elif current_granularity == "daily" and target_granularity == "daily":
137
+ resampled_df = df.resample("D").agg(aggregation_rules)
138
+
139
+ elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
140
+ # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
141
+ expanded_data = []
142
+ for _, row in df.iterrows():
143
+ if current_granularity == "weekly":
144
+ period_range = pd.date_range(start=row.name, periods=7)
145
+ elif current_granularity == "monthly":
146
+ period_range = pd.date_range(
147
+ start=row.name, periods=row.name.days_in_month
148
+ )
149
+
150
+ for date in period_range:
151
+ new_row = {}
152
+ for col in df.columns:
153
+ if pd.api.types.is_numeric_dtype(df[col]):
154
+ if current_granularity == "weekly":
155
+ new_row[col] = row[col] / 7
156
+ elif current_granularity == "monthly":
157
+ new_row[col] = row[col] / row.name.days_in_month
158
+ else:
159
+ new_row[col] = row[col]
160
+ expanded_data.append((date, new_row))
161
+
162
+ resampled_df = pd.DataFrame(
163
+ [data for _, data in expanded_data],
164
+ index=[date for date, _ in expanded_data],
165
+ )
166
+
167
+ # Reset index
168
+ resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
169
+
170
+ return resampled_df
171
+
172
+
173
+ # Function to clean and extract unique values of Panel_1 and Panel_2
174
+ @st.cache_resource(show_spinner=False)
+ def clean_and_extract_unique_values(files_dict, selections):
178
+ all_panel1_values = set()
179
+ all_panel2_values = set()
180
+
181
+ for file_name, file_data in files_dict.items():
182
+ df = file_data["df"]
183
+
184
+ # 'Panel_1' and 'Panel_2' selections
185
+ selected_panel1 = selections[file_name].get("Panel_1")
186
+ selected_panel2 = selections[file_name].get("Panel_2")
187
+
188
+ # Clean and standardize Panel_1 column if it exists and is selected
189
+ if (
190
+ selected_panel1
191
+ and selected_panel1 != "N/A"
192
+ and selected_panel1 in df.columns
193
+ ):
194
+ df[selected_panel1] = (
195
+ df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
196
+ )
197
+ all_panel1_values.update(df[selected_panel1].dropna().unique())
198
+
199
+ # Clean and standardize Panel_2 column if it exists and is selected
200
+ if (
201
+ selected_panel2
202
+ and selected_panel2 != "N/A"
203
+ and selected_panel2 in df.columns
204
+ ):
205
+ df[selected_panel2] = (
206
+ df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
207
+ )
208
+ all_panel2_values.update(df[selected_panel2].dropna().unique())
209
+
210
+ # Update the processed DataFrame back in the dictionary
211
+ files_dict[file_name]["df"] = df
212
+
213
+ return all_panel1_values, all_panel2_values
214
+
215
+
216
+ # Function to format values for display
217
+ @st.cache_resource(show_spinner=False)
+ def format_values_for_display(values_list):
221
+ # Capitalize the first letter of each word and replace underscores with spaces
222
+ formatted_list = [value.replace("_", " ").title() for value in values_list]
223
+ # Join values with commas and 'and' before the last value
224
+ if len(formatted_list) > 1:
225
+ return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
226
+ elif formatted_list:
227
+ return formatted_list[0]
228
+ return "No values available"
229
+
230
+
231
+ # Function to normalize all data within files_dict to a daily granularity
+ @st.cache(show_spinner=False, allow_output_mutation=True)
+ def standardize_data_to_daily(files_dict, selections):
236
+ # Normalize all data to a daily granularity using a provided function
237
+ files_dict = apply_granularity_to_all(files_dict, "daily", selections)
238
+
239
+ # Update the "interval" attribute for each dataset to indicate the new granularity
240
+ for files_name, files_data in files_dict.items():
241
+ files_data["interval"] = "daily"
242
+
243
+ return files_dict
244
+
245
+
246
+ # Function to apply granularity transformation to all DataFrames in files_dict
247
+ @st.cache_resource(show_spinner=False)
+ def apply_granularity_to_all(files_dict, granularity_selection, selections):
251
+ for file_name, file_data in files_dict.items():
252
+ df = file_data["df"].copy()
253
+
254
+ # Handling when Panel_1 or Panel_2 might be 'N/A'
255
+ selected_panel1 = selections[file_name].get("Panel_1")
256
+ selected_panel2 = selections[file_name].get("Panel_2")
257
+
258
+ # Correcting the segment selection logic & handling 'N/A'
259
+ if selected_panel1 != "N/A" and selected_panel2 != "N/A":
260
+ unique_combinations = df[
261
+ [selected_panel1, selected_panel2]
262
+ ].drop_duplicates()
263
+ elif selected_panel1 != "N/A":
264
+ unique_combinations = df[[selected_panel1]].drop_duplicates()
265
+ selected_panel2 = None # Ensure Panel_2 is ignored if N/A
266
+ elif selected_panel2 != "N/A":
267
+ unique_combinations = df[[selected_panel2]].drop_duplicates()
268
+ selected_panel1 = None # Ensure Panel_1 is ignored if N/A
269
+ else:
270
+ # If both are 'N/A', process the entire dataframe as is
271
+ df = adjust_dataframe_granularity(
272
+ df, file_data["interval"], granularity_selection
273
+ )
274
+ files_dict[file_name]["df"] = df
275
+ continue # Skip to the next file
276
+
277
+ transformed_segments = []
278
+ for _, combo in unique_combinations.iterrows():
279
+ if selected_panel1 and selected_panel2:
280
+ segment = df[
281
+ (df[selected_panel1] == combo[selected_panel1])
282
+ & (df[selected_panel2] == combo[selected_panel2])
283
+ ]
284
+ elif selected_panel1:
285
+ segment = df[df[selected_panel1] == combo[selected_panel1]]
286
+ elif selected_panel2:
287
+ segment = df[df[selected_panel2] == combo[selected_panel2]]
288
+
289
+ # Adjust granularity of the segment
290
+ transformed_segment = adjust_dataframe_granularity(
291
+ segment, file_data["interval"], granularity_selection
292
+ )
293
+ transformed_segments.append(transformed_segment)
294
+
295
+ # Combine all transformed segments into a single DataFrame for this file
296
+ transformed_df = pd.concat(transformed_segments, ignore_index=True)
297
+ files_dict[file_name]["df"] = transformed_df
298
+
299
+ return files_dict
300
+
301
+
302
+ # Function to create main dataframe structure
303
+ @st.cache_resource(show_spinner=False)
+ def create_main_dataframe(
307
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
308
+ ):
309
+ # Determine the global start and end dates across all DataFrames
310
+ global_start = min(df["df"]["date"].min() for df in files_dict.values())
311
+ global_end = max(df["df"]["date"].max() for df in files_dict.values())
312
+
313
+ # Adjust the date_range generation based on the granularity_selection
314
+ if granularity_selection == "weekly":
315
+ # Generate a weekly range, with weeks starting on Monday
316
+ date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
317
+ elif granularity_selection == "monthly":
318
+ # Generate a monthly range, starting from the first day of each month
319
+ date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
320
+ else: # Default to daily if not weekly or monthly
321
+ date_range = pd.date_range(start=global_start, end=global_end, freq="D")
322
+
323
+ # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
324
+ all_panel1s = all_panel1_values
325
+ all_panel2s = all_panel2_values
326
+
327
+ # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
328
+ dimensions, merge_keys = [], []
329
+ if all_panel1s:
330
+ dimensions.append(all_panel1s)
331
+ merge_keys.append("Panel_1")
332
+ if all_panel2s:
333
+ dimensions.append(all_panel2s)
334
+ merge_keys.append("Panel_2")
335
+
336
+ dimensions.append(date_range) # Date range is always included
337
+ merge_keys.append("date") # Date range is always included
338
+
339
+ # Create a main DataFrame template with the dimensions
340
+ main_df = pd.MultiIndex.from_product(
341
+ dimensions,
342
+ names=[name for name, _ in zip(merge_keys, dimensions)],
343
+ ).to_frame(index=False)
344
+
345
+ return main_df.reset_index(drop=True)
346
+
347
+
348
+ # Function to prepare and merge DataFrames
+ @st.cache_resource(show_spinner=False)
+ def merge_into_main_df(main_df, files_dict, selections):
353
+ for file_name, file_data in files_dict.items():
354
+ df = file_data["df"].copy()
355
+
356
+ # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
357
+ selected_panel1 = selections[file_name].get("Panel_1", "N/A")
358
+ selected_panel2 = selections[file_name].get("Panel_2", "N/A")
359
+ if selected_panel1 != "N/A":
360
+ df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
361
+ if selected_panel2 != "N/A":
362
+ df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
363
+
364
+ # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
365
+ merge_keys = ["date"]
366
+ if "Panel_1" in df.columns:
367
+ merge_keys.append("Panel_1")
368
+ if "Panel_2" in df.columns:
369
+ merge_keys.append("Panel_2")
370
+ main_df = pd.merge(main_df, df, on=merge_keys, how="left")
371
+
372
+ # After all merges, sort by 'date' and reset index for cleanliness
373
+ sort_by = ["date"]
374
+ if "Panel_1" in main_df.columns:
375
+ sort_by.append("Panel_1")
376
+ if "Panel_2" in main_df.columns:
377
+ sort_by.append("Panel_2")
378
+ main_df.sort_values(by=sort_by, inplace=True)
379
+ main_df.reset_index(drop=True, inplace=True)
380
+
381
+ return main_df
382
+
383
+
384
+ # Function to categorize column
385
+ def categorize_column(column_name):
386
+ # Define keywords for each category
387
+ internal_keywords = [
388
+ "Price",
389
+ "Discount",
390
+ "product_price",
391
+ "cost",
392
+ "margin",
393
+ "inventory",
394
+ "sales",
395
+ "revenue",
396
+ "turnover",
397
+ "expense",
398
+ ]
399
+ exogenous_keywords = [
400
+ "GDP",
401
+ "Tax",
402
+ "Inflation",
403
+ "interest_rate",
404
+ "employment_rate",
405
+ "exchange_rate",
406
+ "consumer_spending",
407
+ "retail_sales",
408
+ "oil_prices",
409
+ "weather",
410
+ ]
411
+
412
+ # Check if the column name matches any of the keywords for Internal or Exogenous categories
413
+ for keyword in internal_keywords:
414
+ if keyword.lower() in column_name.lower():
415
+ return "Internal"
416
+ for keyword in exogenous_keywords:
417
+ if keyword.lower() in column_name.lower():
418
+ return "Exogenous"
419
+
420
+ # Default to Media if no match found
421
+ return "Media"
422
+
423
+
424
+ # Function to calculate missing stats and prepare for editable DataFrame
425
+ @st.cache_resource(show_spinner=False)
+ def prepare_missing_stats_df(df):
429
+ missing_stats = []
430
+ for column in df.columns:
431
+ if (
432
+ column == "date" or column == "Panel_2" or column == "Panel_1"
433
+ ): # Skip Date, Panel_1 and Panel_2 column
434
+ continue
435
+
436
+ missing = df[column].isnull().sum()
437
+ pct_missing = round((missing / len(df)) * 100, 2)
438
+
439
+ # Dynamically assign category based on column name
440
+ category = categorize_column(column)
441
+ # category = "Media" # Keep default bin as Media
442
+
443
+ missing_stats.append(
444
+ {
445
+ "Column": column,
446
+ "Missing Values": missing,
447
+ "Missing Percentage": pct_missing,
448
+ "Impute Method": "Fill with 0", # Default value
449
+ "Category": category,
450
+ }
451
+ )
452
+ stats_df = pd.DataFrame(missing_stats)
453
+
454
+ return stats_df
455
+
456
+
457
+ # Function to add API DataFrame details to the files dictionary
458
+ @st.cache_resource(show_spinner=False)
+ def add_api_dataframe_to_dict(main_df, files_dict):
462
+ files_dict["API"] = {
463
+ "numeric": list(main_df.select_dtypes(include=["number"]).columns),
464
+ "non_numeric": [
465
+ col
466
+ for col in main_df.select_dtypes(exclude=["number"]).columns
467
+ if col.lower() != "date"
468
+ ],
469
+ "interval": determine_data_interval(
470
+ pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
471
+ ),
472
+ "df": main_df,
473
+ }
474
+
475
+ return files_dict
476
+
477
+
478
+ # Function to read API data into a DataFrame, parsing specified columns as datetime
479
+ @st.cache_resource(show_spinner=False)
480
+ def read_API_data():
481
+ return pd.read_excel(r".\upf_data_converted.xlsx", parse_dates=["Date"])
482
+
483
+
484
+ # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
485
+ def set_Panel_1_Panel_2_Selected_false():
486
+ st.session_state["Panel_1_Panel_2_Selected"] = False
487
+
488
+
489
+ # Function to serialize and save the objects into a pickle file
490
+ @st.cache_resource(show_spinner=False)
491
+ def save_to_pickle(file_path, final_df, bin_dict):
492
+ # Open the file in write-binary mode and dump the objects
493
+ with open(file_path, "wb") as f:
494
+ pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
495
+ # Data is now saved to file
496
+
497
+
498
+ # Function to process the merged_df DataFrame based on operations defined in edited_df
499
+ @st.cache_resource(show_spinner=False)
500
+ def process_dataframes(merged_df, edited_df, edited_stats_df):
501
+ # Ensure there are operations defined by the user
502
+ if edited_df.empty:
503
+ return merged_df, edited_stats_df # No operations to apply
504
+
505
+ # Perform operations as defined by the user
506
+ for index, row in edited_df.iterrows():
507
+ result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
508
+ col1 = row["Column 1"]
509
+ col2 = row["Column 2"]
510
+ op = row["Operator"]
511
+
512
+ # Apply the specified operation
513
+ if op == "+":
514
+ merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
515
+ elif op == "-":
516
+ merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
517
+ elif op == "*":
518
+ merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
519
+ elif op == "/":
520
+ merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
521
+ 0, 1e-9
522
+ )
523
+
524
+ # Add summary of operation to edited_stats_df
525
+ new_row = {
526
+ "Column": result_column_name,
527
+ "Missing Values": None,
528
+ "Missing Percentage": None,
529
+ "Impute Method": None,
530
+ "Category": row["Category"],
531
+ }
532
+ new_row_df = pd.DataFrame([new_row])
533
+
534
+ # Use pd.concat to add the new_row_df to edited_stats_df
535
+ edited_stats_df = pd.concat(
536
+ [edited_stats_df, new_row_df], ignore_index=True, axis=0
537
+ )
538
+
539
+ # Combine column names from edited_df for cleanup
540
+ combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
541
+
542
+ # Filter out rows in edited_stats_df and drop columns from merged_df
543
+ edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
544
+ merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
545
+
546
+ return merged_df, edited_stats_df
547
+
548
+
549
+ # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
550
+ @st.cache_resource(show_spinner=False)
+ def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
554
+ # Get columns categorized as 'Response Metrics'
555
+ columns_response_metrics = edited_stats_df[
556
+ edited_stats_df["Category"] == "Response Metrics"
557
+ ]["Column"].tolist()
558
+
559
+ # Filter numeric columns, excluding those categorized as 'Response Metrics'
560
+ numeric_columns = [
561
+ col
562
+ for col in merged_df.select_dtypes(include=["number"]).columns
563
+ if col not in columns_response_metrics
564
+ ]
565
+
566
+ # Define the structure of the empty DataFrame
567
+ data = {
568
+ "Column 1": pd.Series([], dtype="str"),
569
+ "Operator": pd.Series([], dtype="str"),
570
+ "Column 2": pd.Series([], dtype="str"),
571
+ "Category": pd.Series([], dtype="str"),
572
+ }
573
+ default_df = pd.DataFrame(data)
574
+
575
+ return numeric_columns, default_df
576
+
577
+
578
+ # Initialize 'final_df' in session state
579
+ if "final_df" not in st.session_state:
580
+ st.session_state["final_df"] = pd.DataFrame()
581
+
582
+ # Initialize 'bin_dict' in session state
583
+ if "bin_dict" not in st.session_state:
584
+ st.session_state["bin_dict"] = {}
585
+
586
+ # Initialize 'Panel_1_Panel_2_Selected' in session state
587
+ if "Panel_1_Panel_2_Selected" not in st.session_state:
588
+ st.session_state["Panel_1_Panel_2_Selected"] = False
589
+
590
+
591
+ # Page Title
592
+ st.write("") # Top padding
593
+ st.title("Data Import")
594
+
595
+
596
+ #########################################################################################################################################################
597
+ # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
598
+ #########################################################################################################################################################
599
+
600
+
601
+ # Read the Excel file, parsing 'Date' column as datetime
602
+ main_df = read_API_data()
603
+
604
+ # Convert all column names to lowercase
605
+ main_df.columns = main_df.columns.str.lower().str.strip()
606
+
607
+ # File uploader
608
+ uploaded_files = st.file_uploader(
609
+ "Upload additional data",
610
+ type=["xlsx"],
611
+ accept_multiple_files=True,
612
+ on_change=set_Panel_1_Panel_2_Selected_false,
613
+ )
614
+
615
+ # Custom HTML for upload instructions
616
+ recommendation_html = f"""
617
+ <div style="text-align: justify;">
618
+ <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
619
+ </div>
620
+ """
621
+ st.markdown(recommendation_html, unsafe_allow_html=True)
622
+
623
+ # Choose Desired Granularity
624
+ st.markdown("#### Choose Desired Granularity")
625
+ # Granularity Selection
626
+ granularity_selection = st.selectbox(
627
+ "Choose Date Granularity",
628
+ ["Daily", "Weekly", "Monthly"],
629
+ label_visibility="collapsed",
630
+ on_change=set_Panel_1_Panel_2_Selected_false,
631
+ )
632
+ granularity_selection = str(granularity_selection).lower()
633
+
634
+ # Convert files to dataframes
635
+ files_dict = files_to_dataframes(uploaded_files)
636
+
637
+ # Add API Dataframe
638
+ if main_df is not None:
639
+ files_dict = add_api_dataframe_to_dict(main_df, files_dict)
640
+
641
+ # Display a warning message if no files have been uploaded and halt further execution
642
+ if not files_dict:
643
+ st.warning(
644
+ "Please upload at least one file to proceed.",
645
+ icon="⚠️",
646
+ )
647
+ st.stop() # Halts further execution until file is uploaded
648
+
649
+
650
+ # Select Panel_1 and Panel_2 columns
651
+ st.markdown("#### Select Panel columns")
652
+ selections = {}
653
+ with st.expander("Select Panel columns", expanded=False):
654
+ count = 0 # Initialize counter to manage the visibility of labels and keys
655
+ for file_name, file_data in files_dict.items():
656
+ # Determine visibility of the label based on the count
657
+ if count == 0:
658
+ label_visibility = "visible"
659
+ else:
660
+ label_visibility = "collapsed"
661
+
662
+ # Extract non-numeric columns
663
+ non_numeric_cols = file_data["non_numeric"]
664
+
665
+ # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
666
+ panel1_values = non_numeric_cols + ["N/A"]
667
+ panel2_values = non_numeric_cols + ["N/A"]
668
+
669
+ # Skip if only one option is available
670
+ if len(panel1_values) == 1 and len(panel2_values) == 1:
671
+ selected_panel1, selected_panel2 = "N/A", "N/A"
672
+ # Update the selections for Panel_1 and Panel_2 for the current file
673
+ selections[file_name] = {
674
+ "Panel_1": selected_panel1,
675
+ "Panel_2": selected_panel2,
676
+ }
677
+ continue
678
+
679
+ # Create layout columns for File Name, Panel_2, and Panel_1 selections
680
+ file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
681
+
682
+ with file_name_col:
683
+ # Display "File Name" label only for the first file
684
+ if count == 0:
685
+ st.write("File Name")
686
+ else:
687
+ st.write("")
688
+ st.write(file_name) # Display the file name
689
+
690
+ with Panel_1_col:
691
+ # Display a selectbox for Panel_1 values
692
+ selected_panel1 = st.selectbox(
693
+ "Select Panel Level 1",
694
+ panel1_values,
695
+ on_change=set_Panel_1_Panel_2_Selected_false,
696
+ label_visibility=label_visibility, # Control visibility of the label
697
+ key=f"Panel_1_selectbox{count}", # Ensure unique key for each selectbox
698
+ )
699
+
700
+ with Panel_2_col:
701
+ # Display a selectbox for Panel_2 values
702
+ selected_panel2 = st.selectbox(
703
+ "Select Panel Level 2",
704
+ panel2_values,
705
+ on_change=set_Panel_1_Panel_2_Selected_false,
706
+ label_visibility=label_visibility, # Control visibility of the label
707
+ key=f"Panel_2_selectbox{count}", # Ensure unique key for each selectbox
708
+ )
709
+
710
+ # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
711
+ if selected_panel2 == selected_panel1 and not (
712
+ selected_panel2 == "N/A" and selected_panel1 == "N/A"
713
+ ):
714
+ st.warning(
715
+ f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
716
+ )
717
+ selected_panel1, selected_panel2 = "N/A", "N/A"
718
+ st.stop()
719
+
720
+ # Update the selections for Panel_1 and Panel_2 for the current file
721
+ selections[file_name] = {
722
+ "Panel_1": selected_panel1,
723
+ "Panel_2": selected_panel2,
724
+ }
725
+
726
+ count += 1 # Increment the counter after processing each file
727
+
728
+ # Accept Panel_1 and Panel_2 selection
729
+ if st.button("Accept and Process", use_container_width=True):
730
+
731
+ # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
732
+ with st.spinner("Processing..."):
733
+ files_dict = standardize_data_to_daily(files_dict, selections)
734
+
735
+ # Convert all data to daily level granularity
736
+ files_dict = apply_granularity_to_all(
737
+ files_dict, granularity_selection, selections
738
+ )
739
+
740
+ # Update the 'files_dict' in the session state
741
+ st.session_state["files_dict"] = files_dict
742
+
743
+ # Set a flag in the session state to indicate that selection has been made
744
+ st.session_state["Panel_1_Panel_2_Selected"] = True
745
+
746
+
747
+ #########################################################################################################################################################
748
+ # Display unique Panel_1 and Panel_2 values
749
+ #########################################################################################################################################################
750
+
751
+
752
+ # Halts further execution until Panel_1 and Panel_2 columns are selected
753
+ if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
754
+ files_dict = st.session_state["files_dict"]
755
+ else:
756
+ st.stop()
757
+
758
+ # Set to store unique values of Panel_1 and Panel_2
759
+ with st.spinner("Fetching Panel values..."):
760
+ all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
761
+ files_dict, selections
762
+ )
763
+
764
+ # List of Panel_1 and Panel_2 columns unique values
765
+ list_of_all_panel1_values = list(all_panel1_values)
766
+ list_of_all_panel2_values = list(all_panel2_values)
767
+
768
+ # Format Panel_1 and Panel_2 values for display
769
+ formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
770
+ formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
771
+
772
+ # Unique Panel_1 and Panel_2 values
773
+ st.markdown("#### Unique Panel values")
774
+ # Display Panel_1 and Panel_2 values
775
+ with st.expander("Unique Panel values"):
776
+ st.write("")
777
+ st.markdown(
778
+ f"""
779
+ <style>
780
+ .justify-text {{
781
+ text-align: justify;
782
+ }}
783
+ </style>
784
+ <div class="justify-text">
785
+ <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
786
+ <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
787
+ </div>
788
+ """,
789
+ unsafe_allow_html=True,
790
+ )
791
+
792
+ # Display total Panel_1 and Panel_2
793
+ st.write("")
794
+ st.markdown(
795
+ f"""
796
+ <div style="text-align: justify;">
797
+ <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
798
+ <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
799
+ </div>
800
+ """,
801
+ unsafe_allow_html=True,
802
+ )
803
+ st.write("")
804
+
805
+
806
+ #########################################################################################################################################################
807
+ # Merge all DataFrames
808
+ #########################################################################################################################################################
809
+
810
+
811
+ # Merge all DataFrames selected
812
+ main_df = create_main_dataframe(
813
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
814
+ )
815
+ merged_df = merge_into_main_df(main_df, files_dict, selections)
816
+
817
+
818
+ #########################################################################################################################################################
819
+ # Categorize Variables and Impute Missing Values
820
+ #########################################################################################################################################################
821
+
822
+
823
+ # Create an editable DataFrame in Streamlit
824
+ st.markdown("#### Select Variables Category & Impute Missing Values")
825
+
826
+ # Prepare missing stats DataFrame for editing
827
+ missing_stats_df = prepare_missing_stats_df(merged_df)
828
+
829
+ edited_stats_df = st.data_editor(
830
+ missing_stats_df,
831
+ column_config={
832
+ "Impute Method": st.column_config.SelectboxColumn(
833
+ options=[
834
+ "Drop Column",
835
+ "Fill with Mean",
836
+ "Fill with Median",
837
+ "Fill with 0",
838
+ ],
839
+ required=True,
840
+ default="Fill with 0",
841
+ ),
842
+ "Category": st.column_config.SelectboxColumn(
843
+ options=[
844
+ "Media",
845
+ "Exogenous",
846
+ "Internal",
847
+ "Response Metrics",
848
+ ],
849
+ required=True,
850
+ default="Media",
851
+ ),
852
+ },
853
+ disabled=["Column", "Missing Values", "Missing Percentage"],
854
+ hide_index=True,
855
+ use_container_width=True,
856
+ )
857
+
858
+ # Apply changes based on edited DataFrame
859
+ for i, row in edited_stats_df.iterrows():
860
+ column = row["Column"]
861
+ if row["Impute Method"] == "Drop Column":
862
+ merged_df.drop(columns=[column], inplace=True)
863
+
864
+ elif row["Impute Method"] == "Fill with Mean":
865
+ merged_df[column].fillna(merged_df[column].mean(), inplace=True)
866
+
867
+ elif row["Impute Method"] == "Fill with Median":
868
+ merged_df[column].fillna(merged_df[column].median(), inplace=True)
869
+
870
+ elif row["Impute Method"] == "Fill with 0":
871
+ merged_df[column].fillna(0, inplace=True)
872
+
873
+
874
+ #########################################################################################################################################################
875
+ # Group columns
876
+ #########################################################################################################################################################
877
+
878
+
879
+ # Display Group columns header
880
+ st.markdown("#### Feature engineering")
881
+
882
+ # Prepare the numeric columns and an empty DataFrame for user input
883
+ numeric_columns, default_df = prepare_numeric_columns_and_default_df(
884
+ merged_df, edited_stats_df
885
+ )
886
+
887
+ # Display editable Dataframe
888
+ edited_df = st.data_editor(
889
+ default_df,
890
+ column_config={
891
+ "Column 1": st.column_config.SelectboxColumn(
892
+ options=numeric_columns,
893
+ required=True,
894
+ default=numeric_columns[0],
895
+ width=400,
896
+ ),
897
+ "Operator": st.column_config.SelectboxColumn(
898
+ options=["+", "-", "*", "/"],
899
+ required=True,
900
+ default="+",
901
+ width=100,
902
+ ),
903
+ "Column 2": st.column_config.SelectboxColumn(
904
+ options=numeric_columns,
905
+ required=True,
906
+ default=numeric_columns[0],
907
+ width=400,
908
+ ),
909
+ "Category": st.column_config.SelectboxColumn(
910
+ options=[
911
+ "Media",
912
+ "Exogenous",
913
+ "Internal",
914
+ "Response Metrics",
915
+ ],
916
+ required=True,
917
+ default="Media",
918
+ width=200,
919
+ ),
920
+ },
921
+ num_rows="dynamic",
922
+ )
923
+
924
+ # Process the DataFrame based on user inputs and operations specified in edited_df
925
+ final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
926
+
927
+
928
+ #########################################################################################################################################################
929
+ # Display the Final DataFrame and variables
930
+ #########################################################################################################################################################
931
+
932
+
933
+ # Display the Final DataFrame and variables
934
+ st.markdown("#### Final DataFrame")
935
+ st.dataframe(final_df, hide_index=True)
936
+
937
+ # Initialize an empty dictionary to hold categories and their variables
938
+ category_dict = {}
939
+
940
+ # Iterate over each row in the edited DataFrame to populate the dictionary
941
+ for i, row in edited_stats_df.iterrows():
942
+ column = row["Column"]
943
+ category = row["Category"] # The category chosen by the user for this variable
944
+
945
+ # Check if the category already exists in the dictionary
946
+ if category not in category_dict:
947
+ # If not, initialize it with the current column as its first element
948
+ category_dict[category] = [column]
949
+ else:
950
+ # If it exists, append the current column to the list of variables under this category
951
+ category_dict[category].append(column)
952
+
953
+ # Add Date, Panel_1 and Panel_2 to the category dictionary
954
+ category_dict.update({"Date": ["date"]})
955
+ if "Panel_1" in final_df.columns:
956
+ category_dict["Panel Level 1"] = ["Panel_1"]
957
+ if "Panel_2" in final_df.columns:
958
+ category_dict["Panel Level 2"] = ["Panel_2"]
959
+
960
+ # Display the dictionary
961
+ st.markdown("#### Variable Category")
962
+ for category, variables in category_dict.items():
963
+ # Check if there are multiple variables to handle "and" insertion correctly
964
+ if len(variables) > 1:
965
+ # Join all but the last variable with ", ", then add " and " before the last variable
966
+ variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
967
+ else:
968
+ # If there's only one variable, no need for "and"
969
+ variables_str = variables[0]
970
+
971
+ # Display the category and its variables in the desired format
972
+ st.markdown(
973
+ f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
974
+ unsafe_allow_html=True,
975
+ )
976
+
977
+ # Function to check if Response Metrics is selected
978
+ st.write("")
979
+ response_metrics_col = category_dict.get("Response Metrics", [])
980
+ if len(response_metrics_col) == 0:
981
+ st.warning("Please select Response Metrics column", icon="⚠️")
982
+ st.stop()
983
+ # elif len(response_metrics_col) > 1:
984
+ # st.warning("Please select only one Response Metrics column", icon="⚠️")
985
+ # st.stop()
986
+
987
+ # Store final dataframe and bin dictionary into session state
988
+ st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
989
+
990
+ # Save the DataFrame and dictionary from the session state to the pickle file
991
+ if st.button("Accept and Save", use_container_width=True):
992
+ save_to_pickle(
993
+ "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
994
+ )
995
+ st.toast("💾 Saved Successfully!")
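On "Accept and Save" the script above serializes the final DataFrame and the category dictionary to data_import.pkl via save_to_pickle. For context, a minimal sketch of how a downstream page could read that file back, assuming the same path and keys used in this script (the snippet below is illustrative, not part of the commit):

import pickle

# Minimal sketch: load the objects written by save_to_pickle ("data_import.pkl")
with open("data_import.pkl", "rb") as f:
    saved = pickle.load(f)

final_df = saved["final_df"]  # merged, imputed dataset
bin_dict = saved["bin_dict"]  # category name -> list of column names

# Example: list the columns tagged as Response Metrics
print(bin_dict.get("Response Metrics", []))
print(final_df.shape)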
Data_Import .py ADDED
@@ -0,0 +1,1019 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Data Import",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import pickle
12
+ import pandas as pd
13
+ from utilities import set_header, load_local_css
14
+ import streamlit_authenticator as stauth
15
+ import yaml
16
+ from yaml import SafeLoader
17
+
18
+ load_local_css("styles.css")
19
+ set_header()
20
+
21
+
22
+ for k, v in st.session_state.items():
23
+ if k not in ["logout", "login", "config"] and not k.startswith(
24
+ "FormSubmitter"
25
+ ):
26
+ st.session_state[k] = v
27
+ with open("config.yaml") as file:
28
+ config = yaml.load(file, Loader=SafeLoader)
29
+ st.session_state["config"] = config
30
+ authenticator = stauth.Authenticate(
31
+ config["credentials"],
32
+ config["cookie"]["name"],
33
+ config["cookie"]["key"],
34
+ config["cookie"]["expiry_days"],
35
+ config["preauthorized"],
36
+ )
37
+ st.session_state["authenticator"] = authenticator
38
+ name, authentication_status, username = authenticator.login("Login", "main")
39
+ auth_status = st.session_state.get("authentication_status")
40
+
41
+ if auth_status == True:
42
+ authenticator.logout("Logout", "main")
43
+ is_state_initialized = st.session_state.get("initialized", False)
44
+
45
+ if not is_state_initialized:
46
+
47
+ if 'session_name' not in st.session_state:
48
+ st.session_state['session_name']=None
49
+
50
+
51
+ # Function to validate date column in dataframe
52
+ def validate_date_column(df):
53
+ try:
54
+ # Attempt to convert the 'Date' column to datetime
55
+ df["date"] = pd.to_datetime(df["date"], format="%d-%m-%Y")
56
+ return True
57
+ except:
58
+ return False
59
+
60
+
61
+ # Function to determine data interval
62
+ def determine_data_interval(common_freq):
63
+ if common_freq == 1:
64
+ return "daily"
65
+ elif common_freq == 7:
66
+ return "weekly"
67
+ elif 28 <= common_freq <= 31:
68
+ return "monthly"
69
+ else:
70
+ return "irregular"
71
+
72
+
73
+ # Function to read each uploaded Excel file into a pandas DataFrame and store them in a dictionary
+ @st.cache_resource(show_spinner=False)
+ def files_to_dataframes(uploaded_files):
78
+ df_dict = {}
79
+ for uploaded_file in uploaded_files:
80
+ # Extract file name without extension
81
+ file_name = uploaded_file.name.rsplit(".", 1)[0]
82
+
83
+ # Check for duplicate file names
84
+ if file_name in df_dict:
85
+ st.warning(
86
+ f"Duplicate File: {file_name}. This file will be skipped.",
87
+ icon="⚠️",
88
+ )
89
+ continue
90
+
91
+ # Read the file into a DataFrame
92
+ df = pd.read_excel(uploaded_file)
93
+
94
+ # Convert all column names to lowercase
95
+ df.columns = df.columns.str.lower().str.strip()
96
+
97
+ # Separate numeric and non-numeric columns
98
+ numeric_cols = list(df.select_dtypes(include=["number"]).columns)
99
+ non_numeric_cols = [
100
+ col
101
+ for col in df.select_dtypes(exclude=["number"]).columns
102
+ if col.lower() != "date"
103
+ ]
104
+
105
+ # Check for 'Date' column
106
+ if not (validate_date_column(df) and len(numeric_cols) > 0):
107
+ st.warning(
108
+ f"File Name: {file_name} ➜ Please upload data with Date column in 'DD-MM-YYYY' format and at least one media/exogenous column. This file will be skipped.",
109
+ icon="⚠️",
110
+ )
111
+ continue
112
+
113
+ # Check for interval
114
+ common_freq = (
115
+ pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
116
+ )
117
+ # Calculate the data interval (daily, weekly, monthly or irregular)
118
+ interval = determine_data_interval(common_freq)
119
+ if interval == "irregular":
120
+ st.warning(
121
+ f"File Name: {file_name} ➜ Please upload data in daily, weekly or monthly interval. This file will be skipped.",
122
+ icon="⚠️",
123
+ )
124
+ continue
125
+
126
+ # Store both DataFrames in the dictionary under their respective keys
127
+ df_dict[file_name] = {
128
+ "numeric": numeric_cols,
129
+ "non_numeric": non_numeric_cols,
130
+ "interval": interval,
131
+ "df": df,
132
+ }
133
+
134
+ return df_dict
135
+
136
+
137
+ # Function to adjust dataframe granularity
138
+ def adjust_dataframe_granularity(df, current_granularity, target_granularity):
139
+ # Set index
140
+ df.set_index("date", inplace=True)
141
+
142
+ # Define aggregation rules for resampling
143
+ aggregation_rules = {
144
+ col: "sum" if pd.api.types.is_numeric_dtype(df[col]) else "first"
145
+ for col in df.columns
146
+ }
147
+
148
+ # Initialize resampled_df
149
+ resampled_df = df
150
+ if current_granularity == "daily" and target_granularity == "weekly":
151
+ resampled_df = df.resample("W-MON", closed="left", label="left").agg(
152
+ aggregation_rules
153
+ )
154
+
155
+ elif current_granularity == "daily" and target_granularity == "monthly":
156
+ resampled_df = df.resample("MS", closed="left", label="left").agg(
157
+ aggregation_rules
158
+ )
159
+
160
+ elif current_granularity == "daily" and target_granularity == "daily":
161
+ resampled_df = df.resample("D").agg(aggregation_rules)
162
+
163
+ elif current_granularity in ["weekly", "monthly"] and target_granularity == "daily":
164
+ # For higher to lower granularity, distribute numeric and replicate non-numeric values equally across the new period
165
+ expanded_data = []
166
+ for _, row in df.iterrows():
167
+ if current_granularity == "weekly":
168
+ period_range = pd.date_range(start=row.name, periods=7)
169
+ elif current_granularity == "monthly":
170
+ period_range = pd.date_range(
171
+ start=row.name, periods=row.name.days_in_month
172
+ )
173
+
174
+ for date in period_range:
175
+ new_row = {}
176
+ for col in df.columns:
177
+ if pd.api.types.is_numeric_dtype(df[col]):
178
+ if current_granularity == "weekly":
179
+ new_row[col] = row[col] / 7
180
+ elif current_granularity == "monthly":
181
+ new_row[col] = row[col] / row.name.days_in_month
182
+ else:
183
+ new_row[col] = row[col]
184
+ expanded_data.append((date, new_row))
185
+
186
+ resampled_df = pd.DataFrame(
187
+ [data for _, data in expanded_data],
188
+ index=[date for date, _ in expanded_data],
189
+ )
190
+
191
+ # Reset index
192
+ resampled_df = resampled_df.reset_index().rename(columns={"index": "date"})
193
+
194
+ return resampled_df
195
+
196
+
197
+ # Function to clean and extract unique values of Panel_1 and Panel_2
198
+ @st.cache_resource(show_spinner=False)
+ def clean_and_extract_unique_values(files_dict, selections):
202
+ all_panel1_values = set()
203
+ all_panel2_values = set()
204
+
205
+ for file_name, file_data in files_dict.items():
206
+ df = file_data["df"]
207
+
208
+ # 'Panel_1' and 'Panel_2' selections
209
+ selected_panel1 = selections[file_name].get("Panel_1")
210
+ selected_panel2 = selections[file_name].get("Panel_2")
211
+
212
+ # Clean and standardize Panel_1 column if it exists and is selected
213
+ if (
214
+ selected_panel1
215
+ and selected_panel1 != "N/A"
216
+ and selected_panel1 in df.columns
217
+ ):
218
+ df[selected_panel1] = (
219
+ df[selected_panel1].str.lower().str.strip().str.replace("_", " ")
220
+ )
221
+ all_panel1_values.update(df[selected_panel1].dropna().unique())
222
+
223
+ # Clean and standardize Panel_2 column if it exists and is selected
224
+ if (
225
+ selected_panel2
226
+ and selected_panel2 != "N/A"
227
+ and selected_panel2 in df.columns
228
+ ):
229
+ df[selected_panel2] = (
230
+ df[selected_panel2].str.lower().str.strip().str.replace("_", " ")
231
+ )
232
+ all_panel2_values.update(df[selected_panel2].dropna().unique())
233
+
234
+ # Update the processed DataFrame back in the dictionary
235
+ files_dict[file_name]["df"] = df
236
+
237
+ return all_panel1_values, all_panel2_values
238
+
239
+
240
+ # Function to format values for display
241
+ @st.cache_resource(show_spinner=False)
+ def format_values_for_display(values_list):
245
+ # Capitalize the first letter of each word and replace underscores with spaces
246
+ formatted_list = [value.replace("_", " ").title() for value in values_list]
247
+ # Join values with commas and 'and' before the last value
248
+ if len(formatted_list) > 1:
249
+ return ", ".join(formatted_list[:-1]) + ", and " + formatted_list[-1]
250
+ elif formatted_list:
251
+ return formatted_list[0]
252
+ return "No values available"
253
+
254
+
255
+ # Function to normalize all data within files_dict to a daily granularity
+ @st.cache(show_spinner=False, allow_output_mutation=True)
+ def standardize_data_to_daily(files_dict, selections):
260
+ # Normalize all data to a daily granularity using a provided function
261
+ files_dict = apply_granularity_to_all(files_dict, "daily", selections)
262
+
263
+ # Update the "interval" attribute for each dataset to indicate the new granularity
264
+ for files_name, files_data in files_dict.items():
265
+ files_data["interval"] = "daily"
266
+
267
+ return files_dict
268
+
269
+
270
+ # Function to apply granularity transformation to all DataFrames in files_dict
271
+ @st.cache_resource(show_spinner=False)
+ def apply_granularity_to_all(files_dict, granularity_selection, selections):
275
+ for file_name, file_data in files_dict.items():
276
+ df = file_data["df"].copy()
277
+
278
+ # Handling when Panel_1 or Panel_2 might be 'N/A'
279
+ selected_panel1 = selections[file_name].get("Panel_1")
280
+ selected_panel2 = selections[file_name].get("Panel_2")
281
+
282
+ # Correcting the segment selection logic & handling 'N/A'
283
+ if selected_panel1 != "N/A" and selected_panel2 != "N/A":
284
+ unique_combinations = df[
285
+ [selected_panel1, selected_panel2]
286
+ ].drop_duplicates()
287
+ elif selected_panel1 != "N/A":
288
+ unique_combinations = df[[selected_panel1]].drop_duplicates()
289
+ selected_panel2 = None # Ensure Panel_2 is ignored if N/A
290
+ elif selected_panel2 != "N/A":
291
+ unique_combinations = df[[selected_panel2]].drop_duplicates()
292
+ selected_panel1 = None # Ensure Panel_1 is ignored if N/A
293
+ else:
294
+ # If both are 'N/A', process the entire dataframe as is
295
+ df = adjust_dataframe_granularity(
296
+ df, file_data["interval"], granularity_selection
297
+ )
298
+ files_dict[file_name]["df"] = df
299
+ continue # Skip to the next file
300
+
301
+ transformed_segments = []
302
+ for _, combo in unique_combinations.iterrows():
303
+ if selected_panel1 and selected_panel2:
304
+ segment = df[
305
+ (df[selected_panel1] == combo[selected_panel1])
306
+ & (df[selected_panel2] == combo[selected_panel2])
307
+ ]
308
+ elif selected_panel1:
309
+ segment = df[df[selected_panel1] == combo[selected_panel1]]
310
+ elif selected_panel2:
311
+ segment = df[df[selected_panel2] == combo[selected_panel2]]
312
+
313
+ # Adjust granularity of the segment
314
+ transformed_segment = adjust_dataframe_granularity(
315
+ segment, file_data["interval"], granularity_selection
316
+ )
317
+ transformed_segments.append(transformed_segment)
318
+
319
+ # Combine all transformed segments into a single DataFrame for this file
320
+ transformed_df = pd.concat(transformed_segments, ignore_index=True)
321
+ files_dict[file_name]["df"] = transformed_df
322
+
323
+ return files_dict
324
+
325
+
326
+ # Function to create main dataframe structure
327
+ @st.cache_resource(show_spinner=False)
+ def create_main_dataframe(
331
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
332
+ ):
333
+ # Determine the global start and end dates across all DataFrames
334
+ global_start = min(df["df"]["date"].min() for df in files_dict.values())
335
+ global_end = max(df["df"]["date"].max() for df in files_dict.values())
336
+
337
+ # Adjust the date_range generation based on the granularity_selection
338
+ if granularity_selection == "weekly":
339
+ # Generate a weekly range, with weeks starting on Monday
340
+ date_range = pd.date_range(start=global_start, end=global_end, freq="W-MON")
341
+ elif granularity_selection == "monthly":
342
+ # Generate a monthly range, starting from the first day of each month
343
+ date_range = pd.date_range(start=global_start, end=global_end, freq="MS")
344
+ else: # Default to daily if not weekly or monthly
345
+ date_range = pd.date_range(start=global_start, end=global_end, freq="D")
346
+
347
+ # Collect all unique Panel_1 and Panel_2 values, excluding 'N/A'
348
+ all_panel1s = all_panel1_values
349
+ all_panel2s = all_panel2_values
350
+
351
+ # Dynamically build the list of dimensions (Panel_1, Panel_2) to include in the main DataFrame based on availability
352
+ dimensions, merge_keys = [], []
353
+ if all_panel1s:
354
+ dimensions.append(all_panel1s)
355
+ merge_keys.append("Panel_1")
356
+ if all_panel2s:
357
+ dimensions.append(all_panel2s)
358
+ merge_keys.append("Panel_2")
359
+
360
+ dimensions.append(date_range) # Date range is always included
361
+ merge_keys.append("date") # Date range is always included
362
+
363
+ # Create a main DataFrame template with the dimensions
364
+ main_df = pd.MultiIndex.from_product(
365
+ dimensions,
366
+ names=[name for name, _ in zip(merge_keys, dimensions)],
367
+ ).to_frame(index=False)
368
+
369
+ return main_df.reset_index(drop=True)
370
+
371
+
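For intuition, the MultiIndex.from_product call above builds a cartesian scaffold of every panel value against every date in the global range, which the later merges then fill in. A small illustration with hypothetical panel values:

import pandas as pd

panel1_values = ["us", "uk"]  # hypothetical Panel_1 values
date_range = pd.date_range("2023-01-02", periods=3, freq="W-MON")
scaffold = pd.MultiIndex.from_product(
    [panel1_values, date_range], names=["Panel_1", "date"]
).to_frame(index=False)
# scaffold has 2 x 3 = 6 rows: every Panel_1 value paired with every week start.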
372
+ # Function to prepare and merge dataFrames
373
+ @st.cache_resource(show_spinner=False)
374
+
375
+
376
+ def merge_into_main_df(main_df, files_dict, selections):
377
+ for file_name, file_data in files_dict.items():
378
+ df = file_data["df"].copy()
379
+
380
+ # Rename selected Panel_1 and Panel_2 columns if not 'N/A'
381
+ selected_panel1 = selections[file_name].get("Panel_1", "N/A")
382
+ selected_panel2 = selections[file_name].get("Panel_2", "N/A")
383
+ if selected_panel1 != "N/A":
384
+ df.rename(columns={selected_panel1: "Panel_1"}, inplace=True)
385
+ if selected_panel2 != "N/A":
386
+ df.rename(columns={selected_panel2: "Panel_2"}, inplace=True)
387
+
388
+ # Merge current DataFrame into main_df based on 'date', and where applicable, 'Panel_1' and 'Panel_2'
389
+ merge_keys = ["date"]
390
+ if "Panel_1" in df.columns:
391
+ merge_keys.append("Panel_1")
392
+ if "Panel_2" in df.columns:
393
+ merge_keys.append("Panel_2")
394
+ main_df = pd.merge(main_df, df, on=merge_keys, how="left")
395
+
396
+ # After all merges, sort by 'date' and reset index for cleanliness
397
+ sort_by = ["date"]
398
+ if "Panel_1" in main_df.columns:
399
+ sort_by.append("Panel_1")
400
+ if "Panel_2" in main_df.columns:
401
+ sort_by.append("Panel_2")
402
+ main_df.sort_values(by=sort_by, inplace=True)
403
+ main_df.reset_index(drop=True, inplace=True)
404
+
405
+ return main_df
406
+
407
+
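Each uploaded file is left-joined onto that scaffold, so panel/date combinations a file does not cover surface as NaN and are handled by the imputation step further down. A toy example of the join behaviour, with hypothetical column names:

import pandas as pd

scaffold = pd.DataFrame({
    "date": pd.to_datetime(["2023-01-02", "2023-01-02", "2023-01-09", "2023-01-09"]),
    "Panel_1": ["us", "uk", "us", "uk"],
})
spend = pd.DataFrame({
    "date": pd.to_datetime(["2023-01-02"]),
    "Panel_1": ["us"],
    "tv_spend": [100.0],
})
merged = pd.merge(scaffold, spend, on=["date", "Panel_1"], how="left")
# Only the (2023-01-02, us) row gets tv_spend = 100.0; the other rows stay NaN.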
408
+ # Function to categorize column
409
+ def categorize_column(column_name):
410
+ # Define keywords for each category
411
+ internal_keywords = [
412
+ "Price",
413
+ "Discount",
414
+ "product_price",
415
+ "cost",
416
+ "margin",
417
+ "inventory",
418
+ "sales",
419
+ "revenue",
420
+ "turnover",
421
+ "expense",
422
+ ]
423
+ exogenous_keywords = [
424
+ "GDP",
425
+ "Tax",
426
+ "Inflation",
427
+ "interest_rate",
428
+ "employment_rate",
429
+ "exchange_rate",
430
+ "consumer_spending",
431
+ "retail_sales",
432
+ "oil_prices",
433
+ "weather",
434
+ ]
435
+
436
+ # Check if the column name matches any of the keywords for Internal or Exogenous categories
437
+ for keyword in internal_keywords:
438
+ if keyword.lower() in column_name.lower():
439
+ return "Internal"
440
+ for keyword in exogenous_keywords:
441
+ if keyword.lower() in column_name.lower():
442
+ return "Exogenous"
443
+
444
+ # Default to Media if no match found
445
+ return "Media"
446
+
447
+
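The bucketing above is a case-insensitive substring match against the keyword lists, so for example:

categorize_column("avg_product_price")     # -> "Internal"  ("price" matches)
categorize_column("gdp_index")             # -> "Exogenous" ("gdp" matches)
categorize_column("facebook_impressions")  # -> "Media"     (no keyword hit, falls through to the default)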
448
+ # Function to calculate missing stats and prepare for editable DataFrame
449
+ @st.cache_resource(show_spinner=False)
450
+
451
+
452
+ def prepare_missing_stats_df(df):
453
+ missing_stats = []
454
+ for column in df.columns:
455
+ if (
456
+ column == "date" or column == "Panel_2" or column == "Panel_1"
457
+ ): # Skip Date, Panel_1 and Panel_2 column
458
+ continue
459
+
460
+ missing = df[column].isnull().sum()
461
+ pct_missing = round((missing / len(df)) * 100, 2)
462
+
463
+ # Dynamically assign category based on column name
464
+ category = categorize_column(column)
465
+ # category = "Media" # Keep default bin as Media
466
+
467
+ missing_stats.append(
468
+ {
469
+ "Column": column,
470
+ "Missing Values": missing,
471
+ "Missing Percentage": pct_missing,
472
+ "Impute Method": "Fill with 0", # Default value
473
+ "Category": category,
474
+ }
475
+ )
476
+ stats_df = pd.DataFrame(missing_stats)
477
+
478
+ return stats_df
479
+
480
+
481
+ # Function to add API DataFrame details to the files dictionary
482
+ @st.cache_resource(show_spinner=False)
483
+
484
+
485
+ def add_api_dataframe_to_dict(main_df, files_dict):
486
+ files_dict["API"] = {
487
+ "numeric": list(main_df.select_dtypes(include=["number"]).columns),
488
+ "non_numeric": [
489
+ col
490
+ for col in main_df.select_dtypes(exclude=["number"]).columns
491
+ if col.lower() != "date"
492
+ ],
493
+ "interval": determine_data_interval(
494
+ pd.Series(main_df["date"].unique()).diff().dt.days.dropna().mode()[0]
495
+ ),
496
+ "df": main_df,
497
+ }
498
+
499
+ return files_dict
500
+
501
+
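The interval recorded here is inferred from the most common gap between consecutive unique dates, via determine_data_interval (defined earlier in the file; assumed to map a 1-day gap to daily, 7 days to weekly, and so on). A small illustration of the inference input:

import pandas as pd

dates = pd.Series(pd.to_datetime(["2023-01-02", "2023-01-09", "2023-01-16", "2023-01-23"]))
most_common_gap = dates.diff().dt.days.dropna().mode()[0]  # 7 -> treated as weekly data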
502
+ # Function to read the API data into a DataFrame, parsing specified columns as datetime
503
+ @st.cache_resource(show_spinner=False)
504
+ def read_API_data():
505
+ return pd.read_excel(r".\upf_data_converted_randomized_resp_metrics.xlsx", parse_dates=["Date"])
506
+
507
+
508
+ # Function to set the 'Panel_1_Panel_2_Selected' session state variable to False
509
+ def set_Panel_1_Panel_2_Selected_false():
510
+ st.session_state["Panel_1_Panel_2_Selected"] = False
511
+
512
+
513
+ # Function to serialize and save the objects into a pickle file
514
+ @st.cache_resource(show_spinner=False)
515
+ def save_to_pickle(file_path, final_df, bin_dict):
516
+ # Open the file in write-binary mode and dump the objects
517
+ with open(file_path, "wb") as f:
518
+ pickle.dump({"final_df": final_df, "bin_dict": bin_dict}, f)
519
+ # Data is now saved to file
520
+
521
+
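Downstream pages can read this bundle back with the mirror image of the call above; a minimal sketch, assuming the same file path and keys:

import pickle

with open("data_import.pkl", "rb") as f:
    bundle = pickle.load(f)
final_df, bin_dict = bundle["final_df"], bundle["bin_dict"]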
522
+ # Function to process the merged_df DataFrame based on operations defined in edited_df
523
+ @st.cache_resource(show_spinner=False)
524
+ def process_dataframes(merged_df, edited_df, edited_stats_df):
525
+ # Ensure there are operations defined by the user
526
+ if edited_df.empty:
527
+ return merged_df, edited_stats_df # No operations to apply
528
+
529
+ # Perform operations as defined by the user
530
+ for index, row in edited_df.iterrows():
531
+ result_column_name = f"{row['Column 1']}{row['Operator']}{row['Column 2']}"
532
+ col1 = row["Column 1"]
533
+ col2 = row["Column 2"]
534
+ op = row["Operator"]
535
+
536
+ # Apply the specified operation
537
+ if op == "+":
538
+ merged_df[result_column_name] = merged_df[col1] + merged_df[col2]
539
+ elif op == "-":
540
+ merged_df[result_column_name] = merged_df[col1] - merged_df[col2]
541
+ elif op == "*":
542
+ merged_df[result_column_name] = merged_df[col1] * merged_df[col2]
543
+ elif op == "/":
544
+ merged_df[result_column_name] = merged_df[col1] / merged_df[col2].replace(
545
+ 0, 1e-9
546
+ )
547
+
548
+ # Add summary of operation to edited_stats_df
549
+ new_row = {
550
+ "Column": result_column_name,
551
+ "Missing Values": None,
552
+ "Missing Percentage": None,
553
+ "Impute Method": None,
554
+ "Category": row["Category"],
555
+ }
556
+ new_row_df = pd.DataFrame([new_row])
557
+
558
+ # Use pd.concat to add the new_row_df to edited_stats_df
559
+ edited_stats_df = pd.concat(
560
+ [edited_stats_df, new_row_df], ignore_index=True, axis=0
561
+ )
562
+
563
+ # Combine column names from edited_df for cleanup
564
+ combined_columns = set(edited_df["Column 1"]).union(set(edited_df["Column 2"]))
565
+
566
+ # Filter out rows in edited_stats_df and drop columns from merged_df
567
+ edited_stats_df = edited_stats_df[~edited_stats_df["Column"].isin(combined_columns)]
568
+ merged_df.drop(columns=list(combined_columns), errors="ignore", inplace=True)
569
+
570
+ return merged_df, edited_stats_df
571
+
572
+
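In other words, each row of edited_df turns two source columns into one derived column named after the expression, and the source columns are then dropped. For a hypothetical row ("spend_a", "/", "spend_b"):

import pandas as pd

df = pd.DataFrame({"spend_a": [10.0, 5.0], "spend_b": [2.0, 0.0]})
# Division guards against zero denominators the same way as above.
df["spend_a/spend_b"] = df["spend_a"] / df["spend_b"].replace(0, 1e-9)
df = df.drop(columns=["spend_a", "spend_b"])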
573
+ # Function to prepare a list of numeric column names and initialize an empty DataFrame with predefined structure
574
+ @st.cache_resource(show_spinner=False)
575
+
576
+
577
+ def prepare_numeric_columns_and_default_df(merged_df, edited_stats_df):
578
+ # Get columns categorized as 'Response Metrics'
579
+ columns_response_metrics = edited_stats_df[
580
+ edited_stats_df["Category"] == "Response Metrics"
581
+ ]["Column"].tolist()
582
+
583
+ # Filter numeric columns, excluding those categorized as 'Response Metrics'
584
+ numeric_columns = [
585
+ col
586
+ for col in merged_df.select_dtypes(include=["number"]).columns
587
+ if col not in columns_response_metrics
588
+ ]
589
+
590
+ # Define the structure of the empty DataFrame
591
+ data = {
592
+ "Column 1": pd.Series([], dtype="str"),
593
+ "Operator": pd.Series([], dtype="str"),
594
+ "Column 2": pd.Series([], dtype="str"),
595
+ "Category": pd.Series([], dtype="str"),
596
+ }
597
+ default_df = pd.DataFrame(data)
598
+
599
+ return numeric_columns, default_df
600
+
601
+
602
+ # Initialize 'final_df' in session state
603
+ if "final_df" not in st.session_state:
604
+ st.session_state["final_df"] = pd.DataFrame()
605
+
606
+ # Initialize 'bin_dict' in session state
607
+ if "bin_dict" not in st.session_state:
608
+ st.session_state["bin_dict"] = {}
609
+
610
+ # Initialize 'Panel_1_Panel_2_Selected' in session state
611
+ if "Panel_1_Panel_2_Selected" not in st.session_state:
612
+ st.session_state["Panel_1_Panel_2_Selected"] = False
613
+
614
+
615
+ # Page Title
616
+ st.write("") # Top padding
617
+ st.title("Data Import")
618
+
619
+
620
+ #########################################################################################################################################################
621
+ # Create a dictionary to hold all DataFrames and collect user input to specify "Panel_2" and "Panel_1" columns for each file
622
+ #########################################################################################################################################################
623
+
624
+
625
+ # Read the Excel file, parsing 'Date' column as datetime
626
+ main_df = read_API_data()
627
+
628
+ # Convert all column names to lowercase
629
+ main_df.columns = main_df.columns.str.lower().str.strip()
630
+
631
+ # File uploader
632
+ uploaded_files = st.file_uploader(
633
+ "Upload additional data",
634
+ type=["xlsx"],
635
+ accept_multiple_files=True,
636
+ on_change=set_Panel_1_Panel_2_Selected_false,
637
+ )
638
+
639
+ # Custom HTML for upload instructions
640
+ recommendation_html = f"""
641
+ <div style="text-align: justify;">
642
+ <strong>Recommendation:</strong> For optimal processing, please ensure that all uploaded datasets (panel, media, internal, and exogenous data) adhere to the following guidelines: each dataset must include a <code>Date</code> column formatted as <code>DD-MM-YYYY</code> and be free of missing values.
643
+ </div>
644
+ """
645
+ st.markdown(recommendation_html, unsafe_allow_html=True)
646
+
647
+ # Choose Desired Granularity
648
+ st.markdown("#### Choose Desired Granularity")
649
+ # Granularity Selection
650
+ granularity_selection = st.selectbox(
651
+ "Choose Date Granularity",
652
+ ["Daily", "Weekly", "Monthly"],
653
+ label_visibility="collapsed",
654
+ on_change=set_Panel_1_Panel_2_Selected_false,
655
+ )
656
+ granularity_selection = str(granularity_selection).lower()
657
+
658
+ # Convert files to dataframes
659
+ files_dict = files_to_dataframes(uploaded_files)
660
+
661
+ # Add API Dataframe
662
+ if main_df is not None:
663
+ files_dict = add_api_dataframe_to_dict(main_df, files_dict)
664
+
665
+ # Display a warning message if no files have been uploaded and halt further execution
666
+ if not files_dict:
667
+ st.warning(
668
+ "Please upload at least one file to proceed.",
669
+ icon="⚠️",
670
+ )
671
+ st.stop() # Halts further execution until file is uploaded
672
+
673
+
674
+ # Select Panel_1 and Panel_2 columns
675
+ st.markdown("#### Select Panel columns")
676
+ selections = {}
677
+ with st.expander("Select Panel columns", expanded=False):
678
+ count = 0 # Initialize counter to manage the visibility of labels and keys
679
+ for file_name, file_data in files_dict.items():
680
+ # Determine visibility of the label based on the count
681
+ if count == 0:
682
+ label_visibility = "visible"
683
+ else:
684
+ label_visibility = "collapsed"
685
+
686
+ # Extract non-numeric columns
687
+ non_numeric_cols = file_data["non_numeric"]
688
+
689
+ # Prepare Panel_1 and Panel_2 values for dropdown, adding "N/A" as an option
690
+ panel1_values = non_numeric_cols + ["N/A"]
691
+ panel2_values = non_numeric_cols + ["N/A"]
692
+
693
+ # Skip if only one option is available
694
+ if len(panel1_values) == 1 and len(panel2_values) == 1:
695
+ selected_panel1, selected_panel2 = "N/A", "N/A"
696
+ # Update the selections for Panel_1 and Panel_2 for the current file
697
+ selections[file_name] = {
698
+ "Panel_1": selected_panel1,
699
+ "Panel_2": selected_panel2,
700
+ }
701
+ continue
702
+
703
+ # Create layout columns for File Name, Panel_2, and Panel_1 selections
704
+ file_name_col, Panel_1_col, Panel_2_col = st.columns([2, 4, 4])
705
+
706
+ with file_name_col:
707
+ # Display "File Name" label only for the first file
708
+ if count == 0:
709
+ st.write("File Name")
710
+ else:
711
+ st.write("")
712
+ st.write(file_name) # Display the file name
713
+
714
+ with Panel_1_col:
715
+ # Display a selectbox for Panel_1 values
716
+ selected_panel1 = st.selectbox(
717
+ "Select Panel Level 1",
718
+ panel1_values,
719
+ on_change=set_Panel_1_Panel_2_Selected_false,
720
+ label_visibility=label_visibility, # Control visibility of the label
721
+ key=f"Panel_1_selectbox{count}", # Ensure unique key for each selectbox
722
+ )
723
+
724
+ with Panel_2_col:
725
+ # Display a selectbox for Panel_2 values
726
+ selected_panel2 = st.selectbox(
727
+ "Select Panel Level 2",
728
+ panel2_values,
729
+ on_change=set_Panel_1_Panel_2_Selected_false,
730
+ label_visibility=label_visibility, # Control visibility of the label
731
+ key=f"Panel_2_selectbox{count}", # Ensure unique key for each selectbox
732
+ )
733
+
734
+ # Skip processing if the same column is selected for both Panel_1 and Panel_2 due to potential data integrity issues
735
+ if selected_panel2 == selected_panel1 and not (
736
+ selected_panel2 == "N/A" and selected_panel1 == "N/A"
737
+ ):
738
+ st.warning(
739
+ f"File: {file_name} → The same column cannot serve as both Panel_1 and Panel_2. Please adjust your selections.",
740
+ )
741
+ selected_panel1, selected_panel2 = "N/A", "N/A"
742
+ st.stop()
743
+
744
+ # Update the selections for Panel_1 and Panel_2 for the current file
745
+ selections[file_name] = {
746
+ "Panel_1": selected_panel1,
747
+ "Panel_2": selected_panel2,
748
+ }
749
+
750
+ count += 1 # Increment the counter after processing each file
751
+
752
+ # Accept Panel_1 and Panel_2 selection
753
+ if st.button("Accept and Process", use_container_width=True):
754
+
755
+ # Normalize all data to a daily granularity. This initial standardization simplifies subsequent conversions to other levels of granularity
756
+ with st.spinner("Processing..."):
757
+ files_dict = standardize_data_to_daily(files_dict, selections)
758
+
759
+ # Convert all data to the selected granularity
760
+ files_dict = apply_granularity_to_all(
761
+ files_dict, granularity_selection, selections
762
+ )
763
+
764
+ # Update the 'files_dict' in the session state
765
+ st.session_state["files_dict"] = files_dict
766
+
767
+ # Set a flag in the session state to indicate that selection has been made
768
+ st.session_state["Panel_1_Panel_2_Selected"] = True
769
+
770
+
771
+ #########################################################################################################################################################
772
+ # Display unique Panel_1 and Panel_2 values
773
+ #########################################################################################################################################################
774
+
775
+
776
+ # Halts further execution until Panel_1 and Panel_2 columns are selected
777
+ if "files_dict" in st.session_state and st.session_state["Panel_1_Panel_2_Selected"]:
778
+ files_dict = st.session_state["files_dict"]
779
+ else:
780
+ st.stop()
781
+
782
+ # Set to store unique values of Panel_1 and Panel_2
783
+ with st.spinner("Fetching Panel values..."):
784
+ all_panel1_values, all_panel2_values = clean_and_extract_unique_values(
785
+ files_dict, selections
786
+ )
787
+
788
+ # List of Panel_1 and Panel_2 columns unique values
789
+ list_of_all_panel1_values = list(all_panel1_values)
790
+ list_of_all_panel2_values = list(all_panel2_values)
791
+
792
+ # Format Panel_1 and Panel_2 values for display
793
+ formatted_panel1_values = format_values_for_display(list_of_all_panel1_values)
794
+ formatted_panel2_values = format_values_for_display(list_of_all_panel2_values)
795
+
796
+ # Unique Panel_1 and Panel_2 values
797
+ st.markdown("#### Unique Panel values")
798
+ # Display Panel_1 and Panel_2 values
799
+ with st.expander("Unique Panel values"):
800
+ st.write("")
801
+ st.markdown(
802
+ f"""
803
+ <style>
804
+ .justify-text {{
805
+ text-align: justify;
806
+ }}
807
+ </style>
808
+ <div class="justify-text">
809
+ <strong>Panel Level 1 Values:</strong> {formatted_panel1_values}<br>
810
+ <strong>Panel Level 2 Values:</strong> {formatted_panel2_values}
811
+ </div>
812
+ """,
813
+ unsafe_allow_html=True,
814
+ )
815
+
816
+ # Display total Panel_1 and Panel_2
817
+ st.write("")
818
+ st.markdown(
819
+ f"""
820
+ <div style="text-align: justify;">
821
+ <strong>Number of Level 1 Panels detected:</strong> {len(list_of_all_panel1_values)}<br>
822
+ <strong>Number of Level 2 Panels detected:</strong> {len(list_of_all_panel2_values)}
823
+ </div>
824
+ """,
825
+ unsafe_allow_html=True,
826
+ )
827
+ st.write("")
828
+
829
+
830
+ #########################################################################################################################################################
831
+ # Merge all DataFrames
832
+ #########################################################################################################################################################
833
+
834
+
835
+ # Merge all DataFrames selected
836
+ main_df = create_main_dataframe(
837
+ files_dict, all_panel1_values, all_panel2_values, granularity_selection
838
+ )
839
+ merged_df = merge_into_main_df(main_df, files_dict, selections)
840
+
841
+
842
+ #########################################################################################################################################################
843
+ # Categorize Variables and Impute Missing Values
844
+ #########################################################################################################################################################
845
+
846
+
847
+ # Create an editable DataFrame in Streamlit
848
+ st.markdown("#### Select Variables Category & Impute Missing Values")
849
+
850
+ # Prepare missing stats DataFrame for editing
851
+ missing_stats_df = prepare_missing_stats_df(merged_df)
852
+
853
+ edited_stats_df = st.data_editor(
854
+ missing_stats_df,
855
+ column_config={
856
+ "Impute Method": st.column_config.SelectboxColumn(
857
+ options=[
858
+ "Drop Column",
859
+ "Fill with Mean",
860
+ "Fill with Median",
861
+ "Fill with 0",
862
+ ],
863
+ required=True,
864
+ default="Fill with 0",
865
+ ),
866
+ "Category": st.column_config.SelectboxColumn(
867
+ options=[
868
+ "Media",
869
+ "Exogenous",
870
+ "Internal",
871
+ "Response Metrics",
872
+ ],
873
+ required=True,
874
+ default="Media",
875
+ ),
876
+ },
877
+ disabled=["Column", "Missing Values", "Missing Percentage"],
878
+ hide_index=True,
879
+ use_container_width=True,
880
+ )
881
+
882
+ # Apply changes based on edited DataFrame
883
+ for i, row in edited_stats_df.iterrows():
884
+ column = row["Column"]
885
+ if row["Impute Method"] == "Drop Column":
886
+ merged_df.drop(columns=[column], inplace=True)
887
+
888
+ elif row["Impute Method"] == "Fill with Mean":
889
+ merged_df[column].fillna(merged_df[column].mean(), inplace=True)
890
+
891
+ elif row["Impute Method"] == "Fill with Median":
892
+ merged_df[column].fillna(merged_df[column].median(), inplace=True)
893
+
894
+ elif row["Impute Method"] == "Fill with 0":
895
+ merged_df[column].fillna(0, inplace=True)
896
+
897
+
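The three fill strategies above reduce to standard pandas calls; a minimal sketch on a toy series:

import pandas as pd

s = pd.Series([1.0, None, 3.0])
s.fillna(s.mean())     # Fill with Mean   -> 1.0, 2.0, 3.0
s.fillna(s.median())   # Fill with Median -> 1.0, 2.0, 3.0
s.fillna(0)            # Fill with 0      -> 1.0, 0.0, 3.0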
898
+ #########################################################################################################################################################
899
+ # Group columns
900
+ #########################################################################################################################################################
901
+
902
+
903
+ # Display Feature engineering header
904
+ st.markdown("#### Feature engineering")
905
+
906
+ # Prepare the numeric columns and an empty DataFrame for user input
907
+ numeric_columns, default_df = prepare_numeric_columns_and_default_df(
908
+ merged_df, edited_stats_df
909
+ )
910
+
911
+ # Display editable Dataframe
912
+ edited_df = st.data_editor(
913
+ default_df,
914
+ column_config={
915
+ "Column 1": st.column_config.SelectboxColumn(
916
+ options=numeric_columns,
917
+ required=True,
918
+ default=numeric_columns[0],
919
+ width=400,
920
+ ),
921
+ "Operator": st.column_config.SelectboxColumn(
922
+ options=["+", "-", "*", "/"],
923
+ required=True,
924
+ default="+",
925
+ width=100,
926
+ ),
927
+ "Column 2": st.column_config.SelectboxColumn(
928
+ options=numeric_columns,
929
+ required=True,
930
+ default=numeric_columns[0],
931
+ width=400,
932
+ ),
933
+ "Category": st.column_config.SelectboxColumn(
934
+ options=[
935
+ "Media",
936
+ "Exogenous",
937
+ "Internal",
938
+ "Response Metrics",
939
+ ],
940
+ required=True,
941
+ default="Media",
942
+ width=200,
943
+ ),
944
+ },
945
+ num_rows="dynamic",
946
+ )
947
+
948
+ # Process the DataFrame based on user inputs and operations specified in edited_df
949
+ final_df, edited_stats_df = process_dataframes(merged_df, edited_df, edited_stats_df)
950
+
951
+
952
+ #########################################################################################################################################################
953
+ # Display the Final DataFrame and variables
954
+ #########################################################################################################################################################
955
+
956
+
957
+ # Display the Final DataFrame and variables
958
+ st.markdown("#### Final DataFrame")
959
+ st.dataframe(final_df, hide_index=True)
960
+
961
+ # Initialize an empty dictionary to hold categories and their variables
962
+ category_dict = {}
963
+
964
+ # Iterate over each row in the edited DataFrame to populate the dictionary
965
+ for i, row in edited_stats_df.iterrows():
966
+ column = row["Column"]
967
+ category = row["Category"] # The category chosen by the user for this variable
968
+
969
+ # Check if the category already exists in the dictionary
970
+ if category not in category_dict:
971
+ # If not, initialize it with the current column as its first element
972
+ category_dict[category] = [column]
973
+ else:
974
+ # If it exists, append the current column to the list of variables under this category
975
+ category_dict[category].append(column)
976
+
977
+ # Add Date, Panel_1 and Panel_2 to the category dictionary
978
+ category_dict.update({"Date": ["date"]})
979
+ if "Panel_1" in final_df.columns:
980
+ category_dict["Panel Level 1"] = ["Panel_1"]
981
+ if "Panel_2" in final_df.columns:
982
+ category_dict["Panel Level 2"] = ["Panel_2"]
983
+
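The mapping that ends up saved as bin_dict therefore looks roughly like the following; the column names here are hypothetical, only the key structure is fixed by the code above:

example_bin_dict = {
    "Media": ["paid_search_clicks", "kwai_clicks"],
    "Exogenous": ["gdp_index"],
    "Response Metrics": ["app_installs"],
    "Date": ["date"],
    "Panel Level 1": ["Panel_1"],
}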
984
+ # Display the dictionary
985
+ st.markdown("#### Variable Category")
986
+ for category, variables in category_dict.items():
987
+ # Check if there are multiple variables to handle "and" insertion correctly
988
+ if len(variables) > 1:
989
+ # Join all but the last variable with ", ", then add " and " before the last variable
990
+ variables_str = ", ".join(variables[:-1]) + " and " + variables[-1]
991
+ else:
992
+ # If there's only one variable, no need for "and"
993
+ variables_str = variables[0]
994
+
995
+ # Display the category and its variables in the desired format
996
+ st.markdown(
997
+ f"<div style='text-align: justify;'><strong>{category}:</strong> {variables_str}</div>",
998
+ unsafe_allow_html=True,
999
+ )
1000
+
1001
+ # Check that a Response Metrics column has been selected
1002
+ st.write("")
1003
+ response_metrics_col = category_dict.get("Response Metrics", [])
1004
+ if len(response_metrics_col) == 0:
1005
+ st.warning("Please select Response Metrics column", icon="⚠️")
1006
+ st.stop()
1007
+ # elif len(response_metrics_col) > 1:
1008
+ # st.warning("Please select only one Response Metrics column", icon="⚠️")
1009
+ # st.stop()
1010
+
1011
+ # Store final dataframe and bin dictionary into session state
1012
+ st.session_state["final_df"], st.session_state["bin_dict"] = final_df, category_dict
1013
+
1014
+ # Save the DataFrame and dictionary from the session state to the pickle file
1015
+ if st.button("Accept and Save", use_container_width=True):
1016
+ save_to_pickle(
1017
+ "data_import.pkl", st.session_state["final_df"], st.session_state["bin_dict"]
1018
+ )
1019
+ st.toast("💾 Saved Successfully!")
Data_prep_functions.py CHANGED
@@ -86,76 +86,89 @@ def create_dual_axis_line_chart(date_series, promo_price_series, non_promo_price
86
  def to_percentage(value):
87
  return f'{value * 100:.1f}%'
88
 
89
-
90
- def plot_actual_vs_predicted(date, y, predicted_values, model, target_column=None, flag=None, repeat_all_years=False, is_panel=False):
91
- if flag is not None:
92
- fig = make_subplots(specs=[[{"secondary_y": True}]])
93
- else:
94
- fig = go.Figure()
95
-
96
- if is_panel:
97
- df = pd.DataFrame()
98
- df['date'] = date
99
- df['Actual'] = y
100
- df['Predicted'] = predicted_values
101
- df_agg = df.groupby('date').agg({'Actual': 'sum', 'Predicted': 'sum'}).reset_index()
102
- df_agg.columns = ['date', 'Actual', 'Predicted']
103
- assert len(df_agg) == pd.Series(date).nunique()
104
-
105
- fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
106
- fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))
107
-
108
- else:
109
- fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
110
- fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))
111
-
112
- line_values = []
 
 
 
113
  if flag:
114
- min_date, max_date = flag[0], flag[1]
115
- min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U")
116
- max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U")
117
- month = pd.to_datetime(min_date).month
118
- day = pd.to_datetime(min_date).day
119
-
120
- if repeat_all_years:
121
- line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >= int(min_week)) & (pd.Timestamp(x).week <= int(max_week)) else 0))
122
- assert len(line_values) == len(date)
123
- fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
124
- else:
125
- line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0))
126
- fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')), secondary_y=True)
127
-
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  mape = mean_absolute_percentage_error(y, predicted_values)
 
 
129
  r2 = r2_score(y, predicted_values)
130
  adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1)
131
 
 
132
  metrics_table = pd.DataFrame({
133
- 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
134
- 'Value': [mape, r2, adjr2]
135
  })
136
-
137
- # Convert date to datetime
138
- date = pd.to_datetime(date)
139
-
140
- # Calculate the number of days between each tick based on the date range
141
- date_range = (max(date) - min(date)).days
142
- #x_axis_tick_spacing = max(1, date_range // 50) # Divide the date range by 14 to get approximately 15 ticks
143
-
144
  fig.update_layout(
145
- xaxis=dict(title='Date', tickangle=-30),
146
- yaxis=dict(title=target_column),
 
147
  )
148
-
149
  fig.add_annotation(
150
- text=f"MAPE: {mape * 100:0.1f}%, Adjr2: {adjr2 * 100:.1f}%",
151
- xref="paper",
152
- yref="paper",
153
- x=0.95,
154
- y=1.2,
155
- showarrow=False,
156
  )
157
-
158
- return metrics_table, line_values, fig
 
159
 
160
  def plot_residual_predicted(actual, predicted, df):
161
  df_=df.copy()
 
86
  def to_percentage(value):
87
  return f'{value * 100:.1f}%'
88
 
89
+ def plot_actual_vs_predicted(date, y, predicted_values, model,target_column=None, flag=None, repeat_all_years=False, is_panel=False):
90
+ if flag is not None :
91
+ fig = make_subplots(specs=[[{"secondary_y": True}]])
92
+ else :
93
+ fig = go.Figure()
94
+
95
+ if is_panel :
96
+ df=pd.DataFrame()
97
+ df['date'] = date
98
+ df['Actual'] = y
99
+ df['Predicted'] = predicted_values
100
+ df_agg = df.groupby('date').agg({'Actual':'sum', 'Predicted':'sum'}).reset_index()
101
+ df_agg.columns = ['date', 'Actual', 'Predicted']
102
+ assert len(df_agg) == pd.Series(date).nunique()
103
+ # date = df_agg['date']
104
+ # y = df_agg['Actual']
105
+ # predicted_values = df_agg['Predicted']
106
+ # ymax = df_agg['Actual'].max() # Sprint3 - ymax to set y value for flag
107
+
108
+ fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Actual'], mode='lines', name='Actual', line=dict(color='#08083B')))
109
+ fig.add_trace(go.Scatter(x=df_agg['date'], y=df_agg['Predicted'], mode='lines', name='Predicted', line=dict(color='#11B6BD')))
110
+
111
+ else :
112
+ fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='#08083B')))
113
+ fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='#11B6BD')))
114
+
115
+ line_values=[]
116
  if flag:
117
+ min_date, max_date = flag[0], flag[1]
118
+ min_week = datetime.strptime(str(min_date), "%Y-%m-%d").strftime("%U")
119
+ max_week = datetime.strptime(str(max_date), "%Y-%m-%d").strftime("%U")
120
+ month=pd.to_datetime(min_date).month
121
+ day=pd.to_datetime(min_date).day
122
+ #st.write(pd.to_datetime(min_date).week)
123
+ #st.write(min_week)
124
+ # Initialize an empty list to store line values
125
+
126
+ # Sprint3 change : put flags to secondary axis, & made their y value to 1 instead of 5M
127
+ if repeat_all_years:
128
+ #line_values=list(pd.to_datetime((pd.Series(date)).dt.week).map(lambda x: 10000 if x==min_week else 0 ))
129
+ #st.write(pd.Series(date).map(lambda x: pd.Timestamp(x).week))
130
+ line_values=list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x).week >=int(min_week)) & (pd.Timestamp(x).week <=int(max_week)) else 0))
131
+ assert len(line_values) == len(date)
132
+ #st.write(line_values)
133
+ fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')),secondary_y=True)
134
+ else:
135
+ line_values = []
136
+
137
+ line_values = list(pd.Series(date).map(lambda x: 1 if (pd.Timestamp(x) >= pd.Timestamp(min_date)) and (pd.Timestamp(x) <= pd.Timestamp(max_date)) else 0))
138
+
139
+ #st.write(line_values)
140
+ fig.add_trace(go.Scatter(x=date, y=line_values, mode='lines', name='Flag', line=dict(color='#FF5733')),secondary_y=True)
141
+
142
+
143
+ # Calculate MAPE
144
  mape = mean_absolute_percentage_error(y, predicted_values)
145
+
146
+ # Calculate AdjR2 # Assuming X is your feature matrix
147
  r2 = r2_score(y, predicted_values)
148
  adjr2 = 1 - (1 - r2) * (len(y) - 1) / (len(y) - len(model.fe_params) - 1)
149
 
150
+ # Create a table to display the metrics
151
  metrics_table = pd.DataFrame({
152
+ 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
153
+ 'Value': [mape, r2, adjr2]
154
  })
155
+ # st.write(metrics_table)
 
 
 
 
 
 
 
156
  fig.update_layout(
157
+ xaxis=dict(title='Date'),
158
+ yaxis=dict(title=target_column),
159
+ xaxis_tickangle=-30
160
  )
 
161
  fig.add_annotation(
162
+ text=f"MAPE: {mape*100:0.1f}%, Adjr2: {adjr2 *100:.1f}%",
163
+ xref="paper",
164
+ yref="paper",
165
+ x=0.95, # Adjust these values to position the annotation
166
+ y=1.2,
167
+ showarrow=False,
168
  )
169
+ # print("{}{}"*20, len(line_values))
170
+ #metrics_table.set_index(['Metric'],inplace=True)
171
+ return metrics_table,line_values, fig
172
 
173
  def plot_residual_predicted(actual, predicted, df):
174
  df_=df.copy()
Model/model_0.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e25f247a6804043e242b2a688b9b5ca840bce3da95bfd52863f33cd1a83ce2e2
3
+ size 3160085
Model/model_1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8179ad18c0894ab80fc5bc7daf85da4c29a0d79989a04fdfb3fe448bae00c582
3
+ size 3160085
Model/model_2.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc881f53a6a3dbca759c116f200606d946a48a1342dbabf75c84802df9cacd0d
3
+ size 3160100
Model/model_3.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e70ce7759767772d382d41e509022338fb35efc361367d488d876494ff0a915e
3
+ size 3160100
Model/model_4.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4227165221399f4430d82db0afdb68af986789d38efee3cabbba07db2b286759
3
+ size 3160079
Overview_data_test_panel@#app_installs.xlsx ADDED
Binary file (28.1 kB).
 
Overview_data_test_panel@#revenue.xlsx ADDED
Binary file (28.1 kB).
 
Overview_data_test_panelreplace_meapp_installs.xlsx ADDED
Binary file (28.1 kB).
 
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: indigo
5
  colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.32.1
8
- app_file: Data_Import.py
9
  pinned: false
10
  ---
11
 
 
5
  colorTo: pink
6
  sdk: streamlit
7
  sdk_version: 1.32.1
8
+ app_file: app.py
9
  pinned: false
10
  ---
11
 
Test/merged_df_contri.csv ADDED
The diff for this file is too large to render.
 
Test/output_df.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Date,ga_app,kwai,fb_level_achieved_tier_1,fb_level_achieved_tier_2,paid_search,programmatic,digital_tactic_others,const
2
+ 2023-08-28,-0.35397136801899115,66.71033836253882,174.81135610042415,3594.2897643195533,150.11933145463811,297.6127449335578,3.5410008622400904,17088.40000435223
3
+ 2023-09-04,-0.27103315177908194,72.74975974406102,194.2113426820265,3860.2811580984967,158.86380529049313,335.6495710921645,3.925829952759959,18800.68227530415
4
+ 2023-09-11,-0.18972320624535735,73.56577516680593,203.22388059872446,3779.8710519990336,154.802475164174,311.6912405544196,4.084632969053109,18923.5133215299
5
+ 2023-09-18,-0.13280624437175015,72.56271222233786,200.12443813391138,3681.8929038825913,155.7098287205689,311.05112450305245,4.051292175357099,19046.344367755646
6
+ 2023-09-25,-0.09296437106022509,73.70727325917034,205.74779198138953,3858.65253006403,155.02816278138727,312.86845990465827,3.9265582664040735,19169.175413981393
7
+ 2023-10-02,-0.06507505974215756,72.34256892327214,205.58713073299748,3726.8536377627233,159.8242700571235,315.5994755570924,3.955202754813552,19292.006460207143
8
+ 2023-10-09,-0.045552541819510295,74.38740927114137,207.0439308259877,3845.7054965140105,166.1387318784968,318.3770263805087,3.9670100767811185,19414.83750643289
9
+ 2023-10-16,-0.031886779273657205,73.92804257031634,209.0350517896794,3749.259107713571,158.5179131618084,308.27664915352324,3.935545074442725,19537.66855265864
10
+ 2023-10-23,-0.02232074549156004,74.2265721786869,214.96921278574305,3766.838626589657,155.11867956784573,298.7838125908522,3.8717920437881834,19660.499598884388
11
+ 2023-10-30,-0.015624521844092026,73.13776666139266,215.11994117361186,3861.8716038759217,150.99199274844668,305.8173177680258,3.8593412414854895,19783.330645110138
12
+ 2023-11-06,-0.010937165290864418,73.92209125196376,208.19044332496705,3939.163063071122,155.63698971642444,320.41327017703395,3.844088730158042,19906.161691335885
13
+ 2023-11-13,-0.007656015703605092,75.65843124761166,208.86440994169482,3793.1062744683286,156.5242431409553,320.3204189984107,4.021312960163909,20028.99273756163
14
+ 2023-11-20,-0.005359210992523565,73.88051276100926,218.40774072300528,3684.900260569517,163.258344706366,322.7402649826382,4.0473156754345965,20151.823783787382
15
+ 2023-11-27,-0.0037514476947664945,72.1846283175467,213.20545855013495,3856.792298375503,167.13396999671053,332.60329700992924,3.949159871187085,20274.65483001313
16
+ 2023-12-04,-0.002626013386336546,72.23564873518644,203.08444230779233,3848.078121929866,167.24638929455608,325.2003051931162,3.9989148636147225,20397.485876238876
Test/scenario_test_df.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ other_contributions,correction,sales
2
+ 17088.04603298421,-215.4682810582599,4502.552817091212
3
+ 18800.41124215237,74.62945753481836,4551.0520093251835
4
+ 18923.323598323655,-24.472395662971394,4551.711452115181
5
+ 19046.211561511274,-125.71083540064501,4551.1031350384665
6
+ 19169.082449610334,59.723662814169074,4550.207113442869
7
+ 19291.941385147402,-62.72601966545335,4546.888305453476
8
+ 19414.791953891072,67.80597281407609,4547.8136321328475
9
+ 19537.636665879367,-49.327276753389015,4552.279586216728
10
+ 19660.477278138897,-34.96735624499706,4548.776052001568
11
+ 19783.315020588292,63.3505618488889,4547.4474016199965
12
+ 19906.150754170594,157.53118273497603,4543.63876353669
13
+ 20028.98508154593,8.48155599979873,4550.013534757365
14
+ 20151.81842457639,-76.79487376436737,4544.029313182335
15
+ 20274.651078565435,90.96984069810424,4554.898971422908
16
+ 20397.48325022549,65.02213269566346,4554.821689628467
Test/x_test_contribution.csv ADDED
The diff for this file is too large to render.
 
Test/x_test_to_save.csv CHANGED
The diff for this file is too large to render.
 
Test/x_train_contribution.csv ADDED
The diff for this file is too large to render.
 
Test/x_train_to_save.csv CHANGED
The diff for this file is too large to render.
 
best_models.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dff906988316f9b0e935e828d967162a5f23b402d69a1de1fcd884225cd6a349
3
- size 3755214
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63e3de089d3f2a199a396228c6c0cf7f5db60c36fe3b7a6fb5cf3e74a92ae304
3
+ size 4095026
classes.py CHANGED
@@ -16,21 +16,15 @@ def class_to_dict(class_instance):
16
  attr_dict["modified_spends"] = class_instance.modified_spends
17
  attr_dict["modified_sales"] = class_instance.modified_sales
18
  attr_dict["response_curve_type"] = class_instance.response_curve_type
19
- attr_dict["response_curve_params"] = (
20
- class_instance.response_curve_params
21
- )
22
  attr_dict["penalty"] = class_instance.penalty
23
  attr_dict["bounds"] = class_instance.bounds
24
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
25
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
26
- attr_dict["modified_total_spends"] = (
27
- class_instance.modified_total_spends
28
- )
29
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
30
  attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
31
- attr_dict["modified_mroi"] = class_instance.get_marginal_roi(
32
- "modified"
33
- )
34
 
35
  elif isinstance(class_instance, Scenario):
36
  attr_dict["type"] = "Scenario"
@@ -43,9 +37,7 @@ def class_to_dict(class_instance):
43
  attr_dict["correction"] = class_instance.correction
44
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
45
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
46
- attr_dict["modified_total_spends"] = (
47
- class_instance.modified_total_spends
48
- )
49
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
50
 
51
  return attr_dict
@@ -95,9 +87,7 @@ class Channel:
95
  self.modified_sales = self.calculate_sales()
96
  self.modified_total_spends = self.modified_spends.sum()
97
  self.modified_total_sales = self.modified_sales.sum()
98
- self.delta_spends = (
99
- self.modified_total_spends - self.actual_total_spends
100
- )
101
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
102
 
103
  def update_penalty(self, penalty):
@@ -119,8 +109,7 @@ class Channel:
119
  x = np.where(
120
  x < self.upper_limit,
121
  x,
122
- self.upper_limit
123
- + (x - self.upper_limit) * self.upper_limit / x,
124
  )
125
  if self.response_curve_type == "s-curve":
126
  if self.power >= 0:
@@ -169,9 +158,7 @@ class Channel:
169
  self.modified_sales = self.calculate_sales()
170
  self.modified_total_spends = self.modified_spends.sum()
171
  self.modified_total_sales = self.modified_sales.sum()
172
- self.delta_spends = (
173
- self.modified_total_spends - self.actual_total_spends
174
- )
175
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
176
 
177
  def intialize(self):
@@ -208,9 +195,7 @@ class Scenario:
208
  self.actual_total_sales = self.calculate_actual_total_sales()
209
  self.modified_total_sales = self.calculate_modified_total_sales()
210
  self.modified_total_spends = self.calculate_modified_total_spends()
211
- self.delta_spends = (
212
- self.modified_total_spends - self.actual_total_spends
213
- )
214
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
215
 
216
  def update_penalty(self, value):
@@ -220,9 +205,7 @@ class Scenario:
220
  def calculate_modified_total_spends(self):
221
  total_actual_spends = 0.0
222
  for channel in self.channels.values():
223
- total_actual_spends += (
224
- channel.actual_total_spends * channel.conversion_rate
225
- )
226
  return total_actual_spends
227
 
228
  def calculate_modified_total_spends(self):
@@ -251,12 +234,47 @@ class Scenario:
251
  self.channels[channel_name].update(modified_spends)
252
  self.modified_total_sales = self.calculate_modified_total_sales()
253
  self.modified_total_spends = self.calculate_modified_total_spends()
254
- self.delta_spends = (
255
- self.modified_total_spends - self.actual_total_spends
256
- )
257
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
258
 
259
- def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
261
 
262
  def constraint(x):
@@ -285,7 +303,7 @@ class Scenario:
285
  x0=initial_point,
286
  constraints=constraints,
287
  method=algo,
288
- options={"maxiter": int(2e7), "catol": 1},
289
  )
290
 
291
  for channel_name, modified_spends in zip(channels_list, res.x):
@@ -317,14 +335,11 @@ class Scenario:
317
  for channel_name in channels_list:
318
  _channel_class = self.channels[channel_name]
319
  channel_bounds = _channel_class.bounds
320
- channel_actual_total_spends = (
321
- _channel_class.actual_total_spends
322
- * ((1 + spends_percent / 100))
323
  )
324
  old_spends.append(channel_actual_total_spends)
325
- bounds.append(
326
- (1 + channel_bounds / 100) * channel_actual_total_spends
327
- )
328
 
329
  def objective_function(x):
330
  for channel_name, modified_spends in zip(channels_list, x):
@@ -332,12 +347,12 @@ class Scenario:
332
  return -1 * self.modified_total_sales
333
 
334
  res = minimize(
335
- lambda x : objective_function(x) / 1e8,
336
  method="trust-constr",
337
  x0=old_spends,
338
  constraints=constraint,
339
  bounds=bounds,
340
- options={"maxiter": int(1e7), 'xtol' : 100},
341
  )
342
  # res = dual_annealing(
343
  # objective_function,
@@ -361,81 +376,91 @@ class Scenario:
361
  channel_data = []
362
 
363
  summary_rows = []
364
- actual_list.append({
365
- "name": "Total",
366
- "Spends": self.actual_total_spends,
367
- "Sales": self.actual_total_sales,
368
- })
369
- modified_list.append({
370
- "name": "Total",
371
- "Spends": self.modified_total_spends,
372
- "Sales": self.modified_total_sales,
373
- })
 
 
 
 
374
  for channel in self.channels.values():
375
  name_mod = channel.name.replace("_", " ")
376
  if name_mod.lower().endswith(" imp"):
377
  name_mod = name_mod.replace("Imp", " Impressions")
378
- summary_rows.append([
379
- name_mod,
380
- channel.actual_total_spends,
381
- channel.modified_total_spends,
382
- channel.actual_total_sales,
383
- channel.modified_total_sales,
384
- round(
385
- channel.actual_total_sales / channel.actual_total_spends, 2
386
- ),
387
- round(
388
- channel.modified_total_sales
389
- / channel.modified_total_spends,
390
- 2,
391
- ),
392
- channel.get_marginal_roi("actual"),
393
- channel.get_marginal_roi("modified"),
394
- ])
395
  data[channel.name] = channel.modified_spends
396
  data["Date"] = channel.dates
397
  data["Sales"] = (
398
  data.get("Sales", np.zeros((len(channel.dates),)))
399
  + channel.modified_sales
400
  )
401
- actual_list.append({
402
- "name": channel.name,
403
- "Spends": channel.actual_total_spends,
404
- "Sales": channel.actual_total_sales,
405
- "ROI": round(
406
- channel.actual_total_sales / channel.actual_total_spends, 2
407
- ),
408
- })
409
- modified_list.append({
410
- "name": channel.name,
411
- "Spends": channel.modified_total_spends,
412
- "Sales": channel.modified_total_sales,
413
- "ROI": round(
414
- channel.modified_total_sales
415
- / channel.modified_total_spends,
416
- 2,
417
- ),
418
- "Marginal ROI": channel.get_marginal_roi("modified"),
419
- })
420
-
421
- channel_data.append({
422
- "channel": channel.name,
423
- "spends_act": channel.actual_total_spends,
424
- "spends_mod": channel.modified_total_spends,
425
- "sales_act": channel.actual_total_sales,
426
- "sales_mod": channel.modified_total_sales,
427
- })
428
- summary_rows.append([
429
- "Total",
430
- self.actual_total_spends,
431
- self.modified_total_spends,
432
- self.actual_total_sales,
433
- self.modified_total_sales,
434
- round(self.actual_total_sales / self.actual_total_spends, 2),
435
- round(self.modified_total_sales / self.modified_total_spends, 2),
436
- 0.0,
437
- 0.0,
438
- ])
 
 
 
 
 
 
 
439
  details["Actual"] = actual_list
440
  details["Modified"] = modified_list
441
  columns_index = pd.MultiIndex.from_product(
@@ -467,8 +492,7 @@ class Scenario:
467
  def from_dict(cls, attr_dict):
468
  channels_list = attr_dict["channels"]
469
  channels = {
470
- channel["name"]: class_from_dict(channel)
471
- for channel in channels_list
472
  }
473
  return Scenario(
474
  name=attr_dict["name"],
 
16
  attr_dict["modified_spends"] = class_instance.modified_spends
17
  attr_dict["modified_sales"] = class_instance.modified_sales
18
  attr_dict["response_curve_type"] = class_instance.response_curve_type
19
+ attr_dict["response_curve_params"] = class_instance.response_curve_params
 
 
20
  attr_dict["penalty"] = class_instance.penalty
21
  attr_dict["bounds"] = class_instance.bounds
22
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
23
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
24
+ attr_dict["modified_total_spends"] = class_instance.modified_total_spends
 
 
25
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
26
  attr_dict["actual_mroi"] = class_instance.get_marginal_roi("actual")
27
+ attr_dict["modified_mroi"] = class_instance.get_marginal_roi("modified")
 
 
28
 
29
  elif isinstance(class_instance, Scenario):
30
  attr_dict["type"] = "Scenario"
 
37
  attr_dict["correction"] = class_instance.correction
38
  attr_dict["actual_total_spends"] = class_instance.actual_total_spends
39
  attr_dict["actual_total_sales"] = class_instance.actual_total_sales
40
+ attr_dict["modified_total_spends"] = class_instance.modified_total_spends
 
 
41
  attr_dict["modified_total_sales"] = class_instance.modified_total_sales
42
 
43
  return attr_dict
 
87
  self.modified_sales = self.calculate_sales()
88
  self.modified_total_spends = self.modified_spends.sum()
89
  self.modified_total_sales = self.modified_sales.sum()
90
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
91
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
92
 
93
  def update_penalty(self, penalty):
 
109
  x = np.where(
110
  x < self.upper_limit,
111
  x,
112
+ self.upper_limit + (x - self.upper_limit) * self.upper_limit / x,
 
113
  )
114
  if self.response_curve_type == "s-curve":
115
  if self.power >= 0:
 
158
  self.modified_sales = self.calculate_sales()
159
  self.modified_total_spends = self.modified_spends.sum()
160
  self.modified_total_sales = self.modified_sales.sum()
161
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
162
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
163
 
164
  def intialize(self):
 
195
  self.actual_total_sales = self.calculate_actual_total_sales()
196
  self.modified_total_sales = self.calculate_modified_total_sales()
197
  self.modified_total_spends = self.calculate_modified_total_spends()
198
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
199
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
200
 
201
  def update_penalty(self, value):
 
205
  def calculate_modified_total_spends(self):
206
  total_actual_spends = 0.0
207
  for channel in self.channels.values():
208
+ total_actual_spends += channel.actual_total_spends * channel.conversion_rate
 
 
209
  return total_actual_spends
210
 
211
  def calculate_modified_total_spends(self):
 
234
  self.channels[channel_name].update(modified_spends)
235
  self.modified_total_sales = self.calculate_modified_total_sales()
236
  self.modified_total_spends = self.calculate_modified_total_spends()
237
+ self.delta_spends = self.modified_total_spends - self.actual_total_spends
 
 
238
  self.delta_sales = self.modified_total_sales - self.actual_total_sales
239
 
240
+ # def optimize_spends(self, sales_percent, channels_list, algo="COBYLA"):
241
+ # desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
242
+
243
+ # def constraint(x):
244
+ # for ch, spends in zip(channels_list, x):
245
+ # self.update(ch, spends)
246
+ # return self.modified_total_sales - desired_sales
247
+
248
+ # bounds = []
249
+ # for ch in channels_list:
250
+ # bounds.append(
251
+ # (1 + np.array([-50.0, 100.0]) / 100.0)
252
+ # * self.channels[ch].actual_total_spends
253
+ # )
254
+
255
+ # initial_point = []
256
+ # for bound in bounds:
257
+ # initial_point.append(bound[0])
258
+
259
+ # power = np.ceil(np.log(sum(initial_point)) / np.log(10))
260
+
261
+ # constraints = [NonlinearConstraint(constraint, -1.0, 1.0)]
262
+
263
+ # res = minimize(
264
+ # lambda x: sum(x) / 10 ** (power),
265
+ # bounds=bounds,
266
+ # x0=initial_point,
267
+ # constraints=constraints,
268
+ # method=algo,
269
+ # options={"maxiter": int(2e7), "catol": 1},
270
+ # )
271
+
272
+ # for channel_name, modified_spends in zip(channels_list, res.x):
273
+ # self.update(channel_name, modified_spends)
274
+
275
+ # return zip(channels_list, res.x)
276
+
277
+ def optimize_spends(self, sales_percent, channels_list, algo="trust-constr"):
278
  desired_sales = self.actual_total_sales * (1 + sales_percent / 100.0)
279
 
280
  def constraint(x):
 
303
  x0=initial_point,
304
  constraints=constraints,
305
  method=algo,
306
+ options={"maxiter": int(2e7), "xtol": 100},
307
  )
308
 
309
  for channel_name, modified_spends in zip(channels_list, res.x):
 
335
  for channel_name in channels_list:
336
  _channel_class = self.channels[channel_name]
337
  channel_bounds = _channel_class.bounds
338
+ channel_actual_total_spends = _channel_class.actual_total_spends * (
339
+ (1 + spends_percent / 100)
 
340
  )
341
  old_spends.append(channel_actual_total_spends)
342
+ bounds.append((1 + channel_bounds / 100) * channel_actual_total_spends)
 
 
343
 
344
  def objective_function(x):
345
  for channel_name, modified_spends in zip(channels_list, x):
 
347
  return -1 * self.modified_total_sales
348
 
349
  res = minimize(
350
+ lambda x: objective_function(x) / 1e8,
351
  method="trust-constr",
352
  x0=old_spends,
353
  constraints=constraint,
354
  bounds=bounds,
355
+ options={"maxiter": int(1e7), "xtol": 100},
356
  )
357
  # res = dual_annealing(
358
  # objective_function,
 
376
  channel_data = []
377
 
378
  summary_rows = []
379
+ actual_list.append(
380
+ {
381
+ "name": "Total",
382
+ "Spends": self.actual_total_spends,
383
+ "Sales": self.actual_total_sales,
384
+ }
385
+ )
386
+ modified_list.append(
387
+ {
388
+ "name": "Total",
389
+ "Spends": self.modified_total_spends,
390
+ "Sales": self.modified_total_sales,
391
+ }
392
+ )
393
  for channel in self.channels.values():
394
  name_mod = channel.name.replace("_", " ")
395
  if name_mod.lower().endswith(" imp"):
396
  name_mod = name_mod.replace("Imp", " Impressions")
397
+ summary_rows.append(
398
+ [
399
+ name_mod,
400
+ channel.actual_total_spends,
401
+ channel.modified_total_spends,
402
+ channel.actual_total_sales,
403
+ channel.modified_total_sales,
404
+ round(channel.actual_total_sales / channel.actual_total_spends, 2),
405
+ round(
406
+ channel.modified_total_sales / channel.modified_total_spends,
407
+ 2,
408
+ ),
409
+ channel.get_marginal_roi("actual"),
410
+ channel.get_marginal_roi("modified"),
411
+ ]
412
+ )
 
413
  data[channel.name] = channel.modified_spends
414
  data["Date"] = channel.dates
415
  data["Sales"] = (
416
  data.get("Sales", np.zeros((len(channel.dates),)))
417
  + channel.modified_sales
418
  )
419
+ actual_list.append(
420
+ {
421
+ "name": channel.name,
422
+ "Spends": channel.actual_total_spends,
423
+ "Sales": channel.actual_total_sales,
424
+ "ROI": round(
425
+ channel.actual_total_sales / channel.actual_total_spends, 2
426
+ ),
427
+ }
428
+ )
429
+ modified_list.append(
430
+ {
431
+ "name": channel.name,
432
+ "Spends": channel.modified_total_spends,
433
+ "Sales": channel.modified_total_sales,
434
+ "ROI": round(
435
+ channel.modified_total_sales / channel.modified_total_spends,
436
+ 2,
437
+ ),
438
+ "Marginal ROI": channel.get_marginal_roi("modified"),
439
+ }
440
+ )
441
+
442
+ channel_data.append(
443
+ {
444
+ "channel": channel.name,
445
+ "spends_act": channel.actual_total_spends,
446
+ "spends_mod": channel.modified_total_spends,
447
+ "sales_act": channel.actual_total_sales,
448
+ "sales_mod": channel.modified_total_sales,
449
+ }
450
+ )
451
+ summary_rows.append(
452
+ [
453
+ "Total",
454
+ self.actual_total_spends,
455
+ self.modified_total_spends,
456
+ self.actual_total_sales,
457
+ self.modified_total_sales,
458
+ round(self.actual_total_sales / self.actual_total_spends, 2),
459
+ round(self.modified_total_sales / self.modified_total_spends, 2),
460
+ 0.0,
461
+ 0.0,
462
+ ]
463
+ )
464
  details["Actual"] = actual_list
465
  details["Modified"] = modified_list
466
  columns_index = pd.MultiIndex.from_product(
 
492
  def from_dict(cls, attr_dict):
493
  channels_list = attr_dict["channels"]
494
  channels = {
495
+ channel["name"]: class_from_dict(channel) for channel in channels_list
 
496
  }
497
  return Scenario(
498
  name=attr_dict["name"],
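The optimize_spends rewrite above swaps the COBYLA call for scipy's trust-constr with a scaled objective and a loose xtol. A self-contained sketch of the same pattern on a toy response curve (the curve, budgets and bounds here are illustrative, not the repo's channel model):

import numpy as np
from scipy.optimize import LinearConstraint, minimize

def total_sales(spends):
    # Toy diminishing-returns response curve per channel.
    return float(np.sum(3.0 * np.sqrt(spends)))

budget = LinearConstraint(np.ones(3), lb=300.0, ub=300.0)   # fixed total budget
res = minimize(
    lambda x: -total_sales(x) / 1e2,                        # maximise sales, scaled down
    x0=np.array([100.0, 100.0, 100.0]),
    method="trust-constr",
    bounds=[(50.0, 200.0)] * 3,
    constraints=[budget],
    options={"maxiter": 10_000, "xtol": 1e-6},
)
optimised_spends = res.x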
data_import.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d759e0caf40a5cf6ddfe5c391289fa964363652dba2ffe919fa1ab7c6b4399ec
3
+ size 2246178
data_test_overview_panel_#total_approved_accounts_revenue.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:763047805d36dca3502a6ed9c6dcee9a0c99c945ee92bb61a7c0f6647486a96c
3
+ size 1637428
final_df_transformed.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d775eda5ee0172e1511622b69b301023cdf2c5dbe74bb62d79264fe926eee1b
3
+ size 19479046
metrics_level_data/Overview_data_test_panel@#app_installs.xlsx ADDED
Binary file (28.1 kB).
 
metrics_level_data/Overview_data_test_panel@#revenue.xlsx ADDED
Binary file (28.1 kB).
 
model_output.csv CHANGED
@@ -1,11 +1,6 @@
1
- ,Model_object,Model_iteration,Feature_set,MAPE,R2,ADJR2
2
- 0,Model/model_0.pkl,0,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_clicks']",0.2101108376942587,0.8443530956877969,0.8442167683191552
3
- 1,Model/model_1.pkl,1,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_clicks_lag_3']",0.21209032951119616,0.8459839652330053,0.8458490663036549
4
- 2,Model/model_2.pkl,2,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_impressions']",0.21016185105024765,0.8443545867054447,0.8442182606427493
5
- 3,Model/model_3.pkl,3,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks_lag_2', 'programmatic_impressions_lag_3']",0.21224939270932452,0.8462289218635773,0.8460942374858302
6
- 4,Model/model_4.pkl,4,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_clicks']",0.21018683127739526,0.8421437296960563,0.8420054671970414
7
- 5,Model/model_5.pkl,5,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_clicks_lag_3']",0.21221059311555665,0.8436849097221487,0.843547997105539
8
- 6,Model/model_6.pkl,6,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_impressions']",0.21023311688137142,0.8421414101917525,0.8420031456611397
9
- 7,Model/model_7.pkl,7,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_clicks', 'programmatic_impressions_lag_3']",0.21230002407340917,0.8438639613954715,0.843727205605903
10
- 8,Model/model_8.pkl,8,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_impressions_lag_2', 'programmatic_clicks']",0.21138525009178905,0.8446253227642725,0.8444892338327598
11
- 9,Model/model_9.pkl,9,"['paid_search_clicks', 'kwai_clicks', 'fb_level_achieved_tier_2_clicks_lag_2', 'fb_level_achieved_tier_1_impressions', 'ga_app_clicks', 'digital_tactic_others_impressions_lag_2', 'programmatic_clicks_lag_3']",0.2123701406564611,0.8464957579981922,0.8463613073357782
 
1
+ ,Model_object,Model_iteration,Feature_set,MAPE,R2,ADJR2,pos_count
2
+ 0,Model/model_0.pkl,0,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_clicks_adstock_0_7_lag_2']",0.217990735975396,0.8737098317237447,0.8735992172119913,8
3
+ 1,Model/model_1.pkl,1,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_clicks_adstock_0_7_lag_1']",0.2179731139181846,0.873704484501189,0.8735938653059323,8
4
+ 2,Model/model_2.pkl,2,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_impressions_adstock_0_7_lag_2']",0.22282859947602898,0.8741134168513375,0.8740031558300612,7
5
+ 3,Model/model_3.pkl,3,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_impressions_adstock_0_7_lag_1']",0.22288787053617995,0.8740146663445868,0.8739043188301239,8
6
+ 4,Model/model_4.pkl,4,"['paid_search_clicks_adstock_0_7_lag_1', 'kwai_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_2_clicks_adstock_0_7_lag_2', 'fb_level_achieved_tier_1_impressions_adstock_0_7_lag_2', 'ga_app_clicks_adstock_0_7_lag_2', 'digital_tactic_others_clicks_adstock_0_7_lag_2', 'programmatic_cost_adstock_0_7_lag_2']",0.21714189338473494,0.8736897844153089,0.8735791523446015,8
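
Note on the metric columns above: MAPE, R2 and ADJR2 are standard regression diagnostics recorded per model iteration. A minimal sketch of how such values can be computed from a fitted model's predictions (the y_true/y_pred arrays and the feature count k below are invented placeholders, not values from this repository):

import numpy as np
from sklearn.metrics import mean_absolute_percentage_error, r2_score

# Placeholder actuals and predictions for one model iteration
y_true = np.array([120.0, 150.0, 90.0, 200.0, 170.0, 130.0, 110.0, 95.0, 180.0, 160.0])
y_pred = np.array([110.0, 160.0, 100.0, 190.0, 165.0, 140.0, 105.0, 90.0, 175.0, 150.0])
k = 7  # number of predictors in the feature set, e.g. the seven media columns listed above

mape = mean_absolute_percentage_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
n = len(y_true)
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)  # penalises R2 for the number of predictors
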
 
 
 
 
 
pages/10_Optimized_Result_Analysis.py CHANGED
@@ -14,15 +14,7 @@ import plotly.express as px
14
  import numpy as np
15
  import plotly.graph_objects as go
16
  import pandas as pd
17
- from plotly.subplots import make_subplots
18
 
19
- def format_number(x):
20
- if x >= 1_000_000:
21
- return f'{x / 1_000_000:.2f}M'
22
- elif x >= 1_000:
23
- return f'{x / 1_000:.2f}K'
24
- else:
25
- return f'{x:.2f}'
26
 
27
  def summary_plot(data, x, y, title, text_column, color, format_as_percent=False, format_as_decimal=False):
28
  fig = px.bar(data, x=x, y=y, orientation='h',
@@ -104,13 +96,11 @@ spends_data=pd.read_excel('Overview_data_test.xlsx')
104
 
105
  with open('summary_df.pkl', 'rb') as file:
106
  summary_df_sorted = pickle.load(file)
107
- #st.write(summary_df_sorted)
108
 
109
  selected_scenario= st.selectbox('Select Saved Scenarios',['S1','S2'])
110
 
111
  st.header('Optimized Spends Overview')
112
  ___columns=st.columns(3)
113
- summary_df_sorted=summary_df_sorted.sort_values(by=['Optimized_spend'],ascending=False)
114
  with ___columns[2]:
115
  fig=summary_plot(summary_df_sorted, x='Delta_percent', y='Channel_name', title='Delta', text_column='Delta_percent',color='Channel_name')
116
  st.plotly_chart(fig,use_container_width=True)
@@ -344,75 +334,31 @@ with st.expander("Return Forecast by Media Channel"):
344
 
345
  summary_df_sorted=summary_df_sorted.merge(effectiveness_df,left_on="Channel_name",right_on='Channel')
346
 
347
- #
348
- summary_df_sorted['Efficiency'] = summary_df_sorted['ResponseMetricValue'] / summary_df_sorted['Optimized_spend']
349
- summary_df_sorted=summary_df_sorted.sort_values(by='Optimized_spend',ascending=True)
350
- #st.dataframe(summary_df_sorted)
351
-
352
- channel_colors = px.colors.qualitative.Plotly
353
-
354
- fig = make_subplots(rows=1, cols=3, subplot_titles=('Optimized Spends', 'Effectiveness', 'Efficiency'), horizontal_spacing=0.05)
355
-
356
- for i, channel in enumerate(summary_df_sorted['Channel_name'].unique()):
357
- channel_df = summary_df_sorted[summary_df_sorted['Channel_name'] == channel]
358
- channel_color = channel_colors[i % len(channel_colors)]
359
-
360
- fig.add_trace(go.Bar(x=channel_df['Optimized_spend'],
361
- y=channel_df['Channel_name'],
362
- text=channel_df['Optimized_spend'].apply(format_number),
363
- marker_color=channel_color,
364
- orientation='h'), row=1, col=1)
365
-
366
- fig.add_trace(go.Bar(x=channel_df['ResponseMetricValue'],
367
- y=channel_df['Channel_name'],
368
- text=channel_df['ResponseMetricValue'].apply(format_number),
369
- marker_color=channel_color,
370
- orientation='h', showlegend=False), row=1, col=2)
371
-
372
- fig.add_trace(go.Bar(x=channel_df['Efficiency'],
373
- y=channel_df['Channel_name'],
374
- text=channel_df['Efficiency'].apply(format_number),
375
- marker_color=channel_color,
376
- orientation='h', showlegend=False), row=1, col=3)
377
-
378
- fig.update_layout(
379
- height=600,
380
- width=900,
381
- title='Media Channel Performance',
382
- showlegend=False
383
- )
384
-
385
- fig.update_yaxes(showticklabels=False ,row=1, col=2 )
386
- fig.update_yaxes(showticklabels=False, row=1, col=3)
387
-
388
- fig.update_xaxes(showticklabels=False, row=1, col=1)
389
- fig.update_xaxes(showticklabels=False, row=1, col=2)
390
- fig.update_xaxes(showticklabels=False, row=1, col=3)
391
-
392
-
393
- st.plotly_chart(fig, use_container_width=True)
394
-
395
 
396
-
397
- # columns= st.columns(3)
398
- # with columns[0]:
399
- # fig=summary_plot(summary_df_sorted, x='Optimized_spend', y='Channel_name', title='', text_column='Optimized_spend',color='Channel_name')
400
- # st.plotly_chart(fig,use_container_width=True)
401
- # with columns[1]:
402
 
403
- # # effectiveness=(selected_metric.groupby(by=['MediaChannelName'])['ResponseMetricValue'].sum()).values
404
- # # effectiveness_df=pd.DataFrame({'Channel':st.session_state['raw_data']['MediaChannelName'].unique(),"ResponseMetricValue":effectiveness})
405
- # # # effectiveness.reset_index(inplace=True)
406
- # # # st.dataframe(effectiveness.head())
407
-
408
-
409
- # fig=summary_plot(summary_df_sorted, x='ResponseMetricValue', y='Channel_name', title='Effectiveness', text_column='ResponseMetricValue',color='Channel_name')
410
- # st.plotly_chart(fig,use_container_width=True)
411
-
412
- # with columns[2]:
413
- # fig=summary_plot(summary_df_sorted, x='Efficiency', y='Channel_name', title='Efficiency', text_column='Efficiency',color='Channel_name',format_as_decimal=True)
414
- # st.plotly_chart(fig,use_container_width=True)
415
-
 
416
 
417
  # Create figure with subplots
418
  # fig = make_subplots(rows=1, cols=2)
 
14
  import numpy as np
15
  import plotly.graph_objects as go
16
  import pandas as pd
 
17
 
 
 
 
 
 
 
 
18
 
19
  def summary_plot(data, x, y, title, text_column, color, format_as_percent=False, format_as_decimal=False):
20
  fig = px.bar(data, x=x, y=y, orientation='h',
 
96
 
97
  with open('summary_df.pkl', 'rb') as file:
98
  summary_df_sorted = pickle.load(file)
 
99
 
100
  selected_scenario= st.selectbox('Select Saved Scenarios',['S1','S2'])
101
 
102
  st.header('Optimized Spends Overview')
103
  ___columns=st.columns(3)
 
104
  with ___columns[2]:
105
  fig=summary_plot(summary_df_sorted, x='Delta_percent', y='Channel_name', title='Delta', text_column='Delta_percent',color='Channel_name')
106
  st.plotly_chart(fig,use_container_width=True)
 
334
 
335
  summary_df_sorted=summary_df_sorted.merge(effectiveness_df,left_on="Channel_name",right_on='Channel')
336
 
337
+ # st.dataframe(summary_df_sorted.head(2))
338
+ summary_df_sorted['Efficiency']=summary_df_sorted['ResponseMetricValue']/summary_df_sorted['Optimized_spend']
339
+ # # # st.dataframe(summary_df_sorted.head(2))
340
+ # st.dataframe(summary_df_sorted.head(2))
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
+ columns= st.columns(3)
343
+ with columns[0]:
344
+ fig=summary_plot(summary_df_sorted, x='Optimized_spend', y='Channel_name', title='', text_column='Optimized_spend',color='Channel_name')
345
+ st.plotly_chart(fig,use_container_width=True)
346
+ with columns[1]:
 
347
 
348
+ # effectiveness=(selected_metric.groupby(by=['MediaChannelName'])['ResponseMetricValue'].sum()).values
349
+ # effectiveness_df=pd.DataFrame({'Channel':st.session_state['raw_data']['MediaChannelName'].unique(),"ResponseMetricValue":effectiveness})
350
+ # # effectiveness.reset_index(inplace=True)
351
+ # # st.dataframe(effectiveness.head())
352
+ fig=summary_plot(summary_df_sorted, x='ResponseMetricValue', y='Channel_name', title='Effectiveness', text_column='ResponseMetricValue',color='Channel_name')
353
+ st.plotly_chart(fig,use_container_width=True)
354
+
355
+ with columns[2]:
356
+ fig=summary_plot(summary_df_sorted, x='Efficiency', y='Channel_name', title='Efficiency', text_column='Efficiency',color='Channel_name',format_as_decimal=True)
357
+ st.plotly_chart(fig,use_container_width=True)
358
+
359
+ import plotly.express as px
360
+ import plotly.graph_objects as go
361
+ from plotly.subplots import make_subplots
362
 
363
  # Create figure with subplots
364
  # fig = make_subplots(rows=1, cols=2)
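
As a side note, the Efficiency column this page derives is simply response per unit of optimized spend, plotted as a horizontal bar per channel. A self-contained sketch of that calculation and chart (channel names and numbers are invented; the page itself reads them from summary_df_sorted):

import pandas as pd
import plotly.express as px

# Hypothetical channel summary; the real page builds this from summary_df_sorted
df = pd.DataFrame({
    "Channel_name": ["Paid Search", "Programmatic", "Kwai"],
    "Optimized_spend": [120_000.0, 80_000.0, 40_000.0],
    "ResponseMetricValue": [600_000.0, 320_000.0, 100_000.0],
})
df["Efficiency"] = df["ResponseMetricValue"] / df["Optimized_spend"]  # response per unit spend

fig = px.bar(df, x="Efficiency", y="Channel_name", orientation="h",
             title="Efficiency", text="Efficiency", color="Channel_name")
fig.update_traces(texttemplate="%{text:.2f}", textposition="outside")
# In the Streamlit page the figure is rendered with st.plotly_chart(fig, use_container_width=True)
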
pages/1_Data_Validation.py CHANGED
@@ -9,7 +9,7 @@ from streamlit_pandas_profiling import st_profile_report
9
  import streamlit as st
10
  import streamlit.components.v1 as components
11
  import sweetviz as sv
12
- from utilities import set_header,initialize_data,load_local_css
13
  from st_aggrid import GridOptionsBuilder,GridUpdateMode
14
  from st_aggrid import GridOptionsBuilder
15
  from st_aggrid import AgGrid
@@ -17,8 +17,7 @@ import base64
17
  import os
18
  import tempfile
19
  from ydata_profiling import ProfileReport
20
-
21
- from streamlit_pandas_profiling import st_profile_report
22
 
23
  st.set_page_config(
24
  page_title="Data Validation",
@@ -31,68 +30,52 @@ set_header()
31
 
32
 
33
 
34
- #preprocessing
35
- # with open('Categorised_data.pkl', 'rb') as file:
36
- # Categorised_data = pickle.load(file)
37
- # with open("edited_dataframe.pkl", 'rb') as file:
38
-
39
-
40
- # df = pickle.load(file)
41
- # date=df.index
42
- # df.reset_index(inplace=True)
43
- # df['date'] = pd.to_datetime(date)
44
-
45
-
46
- #prospects=pd.read_excel('EDA_Data.xlsx',sheet_name='Prospects')
47
- #spends=pd.read_excel('EDA_Data.xlsx',sheet_name='SPEND INPUT')
48
- #spends.columns=['Week','Streaming (Spends)','TV (Spends)','Search (Spends)','Digital (Spends)']
49
- #df=pd.concat([df,spends],axis=1)
50
 
51
- #df['date'] =pd.to_datetime(df['date']).dt.strftime('%m/%d/%Y')
52
- #df['Prospects']=prospects['Prospects']
53
- #df.drop(['Week'],axis=1,inplace=True)
54
 
55
- # Deserialize and load the objects from the pickle file
56
- # Deserialize and load the objects from the pickle file
57
  with open('data_import.pkl', 'rb') as f:
58
  data = pickle.load(f)
59
 
60
- # Accessing the loaded objects
61
  st.session_state['cleaned_data']= data['final_df']
62
  st.session_state['category_dict'] = data['bin_dict']
63
 
64
  st.title('Data Validation and Insights')
65
 
66
 
67
- # with open("Pickle_files/main_df",'rb') as f:
68
- # st.session_state['cleaned_data']= pickle.load(f)
69
- # with open("Pickle_files/category_dict",'rb') as c:
70
- # st.session_state['category_dict']=pickle.load(c)
71
-
72
- # st.write(st.session_state['cleaned_data'])
73
-
74
  target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']
75
 
76
-
77
  target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
78
  st.session_state['target_column']=target_column
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
 
81
- fig=line_plot_target(st.session_state['cleaned_data'], target=target_column, title=f'{target_column} Over Time')
82
- st.plotly_chart(fig, use_container_width=True)
83
 
84
 
85
- media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
86
- # st.write(media_channel)
87
 
88
- Non_media_channel=[col for col in st.session_state['cleaned_data'].columns if col not in media_channel]
89
 
90
 
91
- st.markdown('### Annual Data Summary')
92
- st.dataframe(summary(st.session_state['cleaned_data'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
93
 
94
- if st.checkbox('Show raw data'):
95
- st.write(pd.concat([pd.to_datetime(st.session_state['cleaned_data']['date']).dt.strftime('%m/%d/%Y'),st.session_state['cleaned_data'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
96
  col1 = st.columns(1)
97
 
98
  if "selected_feature" not in st.session_state:
@@ -114,29 +97,30 @@ def generate_profile_report(df):
114
  return report_path
115
 
116
 
117
- st.header('Univariate and Bivariate Analysis')
118
- eda_columns=st.columns(2)
119
- with eda_columns[0]:
120
- if st.button('Generate Profile Report'):
121
- with st.spinner('Generating Report'):
122
- report_file = generate_profile_report(st.session_state['cleaned_data'])
123
-
124
- if os.path.exists(report_file):
125
- with open(report_file, 'rb') as f:
126
- st.success('Report Generated')
127
- st.download_button(
128
- label="Download EDA Report",
129
- data=f.read(),
130
- file_name="pandas_profiling_report.html",
131
- mime="text/html"
132
- )
133
- else:
134
- st.warning("Report generation failed. Unable to find the report file.")
 
135
 
136
  with eda_columns[1]:
137
- if st.button('Generate Sweetviz Report'):
138
  with st.spinner('Generating Report'):
139
- report_file = generate_report_with_target(st.session_state['cleaned_data'], target_column)
140
 
141
  if os.path.exists(report_file):
142
  with open(report_file, 'rb') as f:
@@ -152,130 +136,116 @@ with eda_columns[1]:
152
 
153
 
154
 
155
- st.warning('Work in Progress')
156
-
157
- # selected_media = st.selectbox('Select media', np.unique([Categorised_data[col]['VB'] for col in media_channel]))
158
- # # selected_feature=st.multiselect('Select Metric', df.columns[df.columns.str.contains(selected_media,case=False)])
159
- # st.session_state["selected_feature"]=st.selectbox('Select Metric',[col for col in media_channel if Categorised_data[col]['VB'] in selected_media ] )
160
- # spends_features=[col for col in df.columns if 'spends' in col.lower() or 'cost' in col.lower()]
161
- # spends_feature=[col for col in spends_features if col.split('_')[0] in st.session_state["selected_feature"].split('_')[0]]
162
- # #st.write(spends_features)
163
- # #st.write(spends_feature)
164
- # #st.write(selected_feature)
165
-
166
-
167
- # val_variables=[col for col in media_channel if col!='date']
168
- # if len(spends_feature)==0:
169
- # st.warning('No spends varaible available for the selected metric in data')
170
-
171
- # else:
172
- # st.write(f'Selected spends variable {spends_feature[0]} if wrong please name the varaibles properly')
173
- # # Create the dual-axis line plot
174
- # fig_row1 = line_plot(df, x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
175
- # st.plotly_chart(fig_row1, use_container_width=True)
176
- # st.markdown('### Annual Data Summary')
177
- # st.dataframe(summary(df,[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
178
- # if st.button('Validate'):
179
- # st.session_state['Validation'].append(st.session_state["selected_feature"])
180
-
181
- # if st.checkbox('Validate all'):
182
- # st.session_state['Validation'].extend(val_variables)
183
- # st.success('All media variables are validated ✅')
184
- # if len(set(st.session_state['Validation']).intersection(val_variables))!=len(val_variables):
185
- # #st.write(st.session_state['Validation'])
186
- # validation_data=pd.DataFrame({'Variables':val_variables,
187
- # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
188
- # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
189
- # gd=GridOptionsBuilder.from_dataframe(validation_data)
190
- # gd.configure_pagination(enabled=True)
191
- # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
192
- # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
193
- # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
194
- # gridoptions=gd.build()
195
- # #st.text(st.session_state['Validation'])
196
- # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
197
- # #st.table(table)
198
- # selected_rows = table["selected_rows"]
199
- # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
200
- # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
201
- # if not_validated_variables:
202
- # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
203
- # st.warning(not_validated_message)
204
-
205
-
206
-
207
- # st.header('2. Non Media Variables')
208
- # selected_columns_row = [col for col in df.columns if ("imp" not in col.lower()) and ('cli' not in col.lower() ) and ('spend' not in col.lower()) and col!='date']
209
- # selected_columns_row4 = st.selectbox('Select Channel',selected_columns_row )
210
- # if not selected_columns_row4:
211
- # st.warning('Please select at least one.')
212
- # else:
213
- # # Create the dual-axis line plot
214
- # fig_row4 = line_plot(df, x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
215
- # st.plotly_chart(fig_row4, use_container_width=True)
216
- # selected_non_media=selected_columns_row4
217
- # sum_df = df[['date', selected_non_media,target_column]]
218
- # sum_df['Year']=pd.to_datetime(df['date']).dt.year
219
- # #st.dataframe(df)
220
- # #st.dataframe(sum_df.head(2))
221
- # sum_df=sum_df.groupby('Year').agg('sum')
222
- # sum_df.loc['Grand Total']=sum_df.sum()
223
- # sum_df=sum_df.applymap(format_numbers)
224
- # sum_df.fillna('-',inplace=True)
225
- # sum_df=sum_df.replace({"0.0":'-','nan':'-'})
226
- # st.markdown('### Annual Data Summary')
227
- # st.dataframe(sum_df,use_container_width=True)
228
-
229
- # # if st.checkbox('Validate',key='2'):
230
- # # st.session_state['Validation'].append(selected_columns_row4)
231
- # # val_variables=[col for col in media_channel if col!='date']
232
- # # if st.checkbox('Validate all'):
233
- # # st.session_state['Validation'].extend(val_variables)
234
- # # validation_data=pd.DataFrame({'Variables':val_variables,
235
- # # 'Validated':[1 if col in st.session_state['Validation'] else 0 for col in val_variables],
236
- # # 'Bucket':[Categorised_data[col]['VB'] for col in val_variables]})
237
- # # gd=GridOptionsBuilder.from_dataframe(validation_data)
238
- # # gd.configure_pagination(enabled=True)
239
- # # gd.configure_selection(use_checkbox=True,selection_mode='multiple')
240
- # # #gd.configure_selection_toggle_all(None, show_toggle_all=True)
241
- # # #gd.configure_columns_auto_size_mode(GridOptionsBuilder.configure_columns)
242
- # # gridoptions=gd.build()
243
- # # #st.text(st.session_state['Validation'])
244
- # # table = AgGrid(validation_data,gridOptions=gridoptions,update_mode=GridUpdateMode.SELECTION_CHANGED,fit_columns_on_grid_load=True)
245
- # # #st.table(table)
246
- # # selected_rows = table["selected_rows"]
247
- # # st.session_state['Validation'].extend([col['Variables'] for col in selected_rows])
248
- # # not_validated_variables = [col for col in val_variables if col not in st.session_state["Validation"]]
249
- # # if not_validated_variables:
250
- # # not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
251
- # # st.warning(not_validated_message)
252
-
253
- # options = list(df.select_dtypes(np.number).columns)
254
- # st.markdown(' ')
255
- # st.markdown(' ')
256
- # st.markdown('# Exploratory Data Analysis')
257
- # st.markdown(' ')
258
-
259
- # selected_options = []
260
- # num_columns = 4
261
- # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
262
-
263
- # # Create a grid of checkboxes
264
- # st.header('Select Features for Correlation Plot')
265
- # tick=False
266
- # if st.checkbox('Select all'):
267
- # tick=True
268
- # selected_options = []
269
- # for row in range(num_rows):
270
- # cols = st.columns(num_columns)
271
- # for col in cols:
272
- # if options:
273
- # option = options.pop(0)
274
- # selected = col.checkbox(option,value=tick)
275
- # if selected:
276
- # selected_options.append(option)
277
- # # Display selected options
278
- # #st.write('You selected:', selected_options)
279
- # st.pyplot(correlation_plot(df,selected_options,target_column))
280
-
281
 
 
9
  import streamlit as st
10
  import streamlit.components.v1 as components
11
  import sweetviz as sv
12
+ from utilities import set_header,load_local_css
13
  from st_aggrid import GridOptionsBuilder,GridUpdateMode
14
  from st_aggrid import GridOptionsBuilder
15
  from st_aggrid import AgGrid
 
17
  import os
18
  import tempfile
19
  from ydata_profiling import ProfileReport
20
+ import re
 
21
 
22
  st.set_page_config(
23
  page_title="Data Validation",
 
30
 
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
 
 
34
 
 
 
35
  with open('data_import.pkl', 'rb') as f:
36
  data = pickle.load(f)
37
 
 
38
  st.session_state['cleaned_data']= data['final_df']
39
  st.session_state['category_dict'] = data['bin_dict']
40
 
41
  st.title('Data Validation and Insights')
42
 
43
 
 
 
 
 
 
 
 
44
  target_variables=[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Response Metrics']
45
 
 
46
  target_column = st.selectbox('Select the Target Feature/Dependent Variable (will be used in all charts as reference)',list(*target_variables))
47
  st.session_state['target_column']=target_column
48
+ panels=st.session_state['category_dict']['Panel Level 1'][0]
49
+ selected_panels=st.multiselect('Please choose the panels you wish to analyze. If no panels are selected, insights will be derived from the overall data.',st.session_state['cleaned_data'][panels].unique())
50
+ aggregation_dict = {item: 'sum' if key == 'Media' else 'mean' for key, value in st.session_state['category_dict'].items() for item in value if item not in ['date','Panel_1']}
51
+
52
+ with st.expander('**Response Metric Analysis**'):
53
+
54
+ if len(selected_panels)>0:
55
+ st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'][st.session_state['cleaned_data']['Panel_1'].isin(selected_panels)]
56
+
57
+ st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].groupby(by='date').agg(aggregation_dict)
58
+ st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
59
+ else:
60
+ st.session_state['Cleaned_data_panel']=st.session_state['cleaned_data'].groupby(by='date').agg(aggregation_dict)
61
+ st.session_state['Cleaned_data_panel']=st.session_state['Cleaned_data_panel'].reset_index()
62
 
63
 
64
+ fig=line_plot_target(st.session_state['Cleaned_data_panel'], target=target_column, title=f'{target_column} Over Time')
65
+ st.plotly_chart(fig, use_container_width=True)
66
 
67
 
68
+ media_channel=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Media'])
69
+ # st.write(media_channel)
70
 
71
+ Non_media_variables=list(*[st.session_state['category_dict'][key] for key in st.session_state['category_dict'].keys() if key =='Exogenous' or key=='Internal'])
72
 
73
 
74
+ st.markdown('### Annual Data Summary')
75
+ st.dataframe(summary(st.session_state['Cleaned_data_panel'], media_channel+[target_column], spends=None,Target=True), use_container_width=True)
76
 
77
+ if st.checkbox('Show raw data'):
78
+ st.write(pd.concat([pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.strftime('%m/%d/%Y'),st.session_state['Cleaned_data_panel'].select_dtypes(np.number).applymap(format_numbers)],axis=1))
79
  col1 = st.columns(1)
80
 
81
  if "selected_feature" not in st.session_state:
 
97
  return report_path
98
 
99
 
100
+ #st.header()
101
+ with st.expander('Univariate and Bivariate Report'):
102
+ eda_columns=st.columns(2)
103
+ with eda_columns[0]:
104
+ if st.button('Generate Profile Report',help='Univariate report which includes all statistical analysis'):
105
+ with st.spinner('Generating Report'):
106
+ report_file = generate_profile_report(st.session_state['Cleaned_data_panel'])
107
+
108
+ if os.path.exists(report_file):
109
+ with open(report_file, 'rb') as f:
110
+ st.success('Report Generated')
111
+ st.download_button(
112
+ label="Download EDA Report",
113
+ data=f.read(),
114
+ file_name="pandas_profiling_report.html",
115
+ mime="text/html"
116
+ )
117
+ else:
118
+ st.warning("Report generation failed. Unable to find the report file.")
119
 
120
  with eda_columns[1]:
121
+ if st.button('Generate Sweetviz Report',help='Bivariate report for selected response metric'):
122
  with st.spinner('Generating Report'):
123
+ report_file = generate_report_with_target(st.session_state['Cleaned_data_panel'], target_column)
124
 
125
  if os.path.exists(report_file):
126
  with open(report_file, 'rb') as f:
 
136
 
137
 
138
 
139
+ #st.warning('Work in Progress')
140
+ with st.expander('Media Variables Analysis'):
141
+ # Get the selected feature
142
+ st.session_state["selected_feature"]= st.selectbox('Select media', [col for col in media_channel if 'cost' not in col.lower() and 'spend' not in col.lower()])
143
+
144
+ # Filter spends features based on the selected feature
145
+ spends_features = [col for col in st.session_state['Cleaned_data_panel'].columns if any(keyword in col.lower() for keyword in ['cost', 'spend'])]
146
+ spends_feature = [col for col in spends_features if re.split(r'_cost|_spend', col.lower())[0] in st.session_state["selected_feature"]]
147
+
148
+ if 'validation' not in st.session_state:
149
+ st.session_state['validation']=[]
150
+
151
+
152
+ val_variables=[col for col in media_channel if col!='date']
153
+ if len(spends_feature)==0:
154
+ st.warning('No spends varaible available for the selected metric in data')
155
+
156
+ else:
157
+ fig_row1 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[st.session_state["selected_feature"]], y2_cols=[target_column], title=f'Analysis of {st.session_state["selected_feature"]} and {[target_column][0]} Over Time')
158
+ st.plotly_chart(fig_row1, use_container_width=True)
159
+ st.markdown('### Summary')
160
+ st.dataframe(summary(st.session_state['cleaned_data'],[st.session_state["selected_feature"]],spends=spends_feature[0]),use_container_width=True)
161
+
162
+ cols2=st.columns(2)
163
+ with cols2[0]:
164
+ if st.button('Validate'):
165
+ st.session_state['validation'].append(st.session_state["selected_feature"])
166
+ with cols2[1]:
167
+ if st.checkbox('Validate all'):
168
+ st.session_state['validation'].extend(val_variables)
169
+ st.success('All media variables are validated ✅')
170
+
171
+ if len(set(st.session_state['validation']).intersection(val_variables))!=len(val_variables):
172
+ validation_data=pd.DataFrame({'Validate':[True if col in st.session_state['validation'] else False for col in val_variables],
173
+ 'Variables':val_variables
174
+ })
175
+ cols3=st.columns([1,30])
176
+ with cols3[1]:
177
+ validation_df=st.data_editor(validation_data,
178
+ # column_config={
179
+ # 'Validate':st.column_config.CheckboxColumn(wi)
180
+
181
+ # },
182
+ column_config={
183
+ "Validate": st.column_config.CheckboxColumn(
184
+ default=False,
185
+ width=100,
186
+ ),
187
+ 'Variables':st.column_config.TextColumn(
188
+ width=1000
189
+
190
+ )
191
+ },hide_index=True)
192
+
193
+ selected_rows = validation_df[validation_df['Validate']==True]['Variables']
194
+
195
+ #st.write(selected_rows)
196
+
197
+ st.session_state['validation'].extend(selected_rows)
198
+
199
+ not_validated_variables = [col for col in val_variables if col not in st.session_state["validation"]]
200
+ if not_validated_variables:
201
+ not_validated_message = f'The following variables are not validated:\n{" , ".join(not_validated_variables)}'
202
+ st.warning(not_validated_message)
203
+
204
+
205
+
206
+ with st.expander('Non Media Variables Analysis'):
207
+ selected_columns_row4 = st.selectbox('Select Channel',Non_media_variables,index=1)
208
+ # # Create the dual-axis line plot
209
+ fig_row4 = line_plot(st.session_state['Cleaned_data_panel'], x_col='date', y1_cols=[selected_columns_row4], y2_cols=[target_column], title=f'Analysis of {selected_columns_row4} and {target_column} Over Time')
210
+ st.plotly_chart(fig_row4, use_container_width=True)
211
+ selected_non_media=selected_columns_row4
212
+ sum_df = st.session_state['Cleaned_data_panel'][['date', selected_non_media,target_column]]
213
+ sum_df['Year']=pd.to_datetime(st.session_state['Cleaned_data_panel']['date']).dt.year
214
+ #st.dataframe(df)
215
+ #st.dataframe(sum_df.head(2))
216
+ sum_df=sum_df.groupby('Year').agg('sum')
217
+ sum_df.loc['Grand Total']=sum_df.sum()
218
+ sum_df=sum_df.applymap(format_numbers)
219
+ sum_df.fillna('-',inplace=True)
220
+ sum_df=sum_df.replace({"0.0":'-','nan':'-'})
221
+ st.markdown('### Summary')
222
+ st.dataframe(sum_df,use_container_width=True)
223
+
224
+
225
+ with st.expander('Correlation Analysis'):
226
+ options = list(st.session_state['Cleaned_data_panel'].select_dtypes(np.number).columns)
227
+
228
+ # selected_options = []
229
+ # num_columns = 4
230
+ # num_rows = -(-len(options) // num_columns) # Ceiling division to calculate rows
231
+
232
+ # # Create a grid of checkboxes
233
+ # st.header('Select Features for Correlation Plot')
234
+ # tick=False
235
+ # if st.checkbox('Select all'):
236
+ # tick=True
237
+ # selected_options = []
238
+ # for row in range(num_rows):
239
+ # cols = st.columns(num_columns)
240
+ # for col in cols:
241
+ # if options:
242
+ # option = options.pop(0)
243
+ # selected = col.checkbox(option,value=tick)
244
+ # if selected:
245
+ # selected_options.append(option)
246
+ # # Display selected options
247
+
248
+ selected_options=st.multiselect('Select Variables For correlation plot',[var for var in options if var!= target_column],default=options[3])
249
+
250
+ st.pyplot(correlation_plot(st.session_state['Cleaned_data_panel'],selected_options,target_column))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
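
The panel handling in the page above rests on one rule: when specific panels are selected, media columns are summed and all other metrics are averaged, then everything is re-aggregated by date. A small standalone sketch of that aggregation (toy column names; the page builds aggregation_dict from category_dict instead):

import pandas as pd

df = pd.DataFrame({
    "date": ["2024-01-01", "2024-01-01", "2024-01-08", "2024-01-08"],
    "Panel_1": ["P1", "P2", "P1", "P2"],
    "paid_search_clicks": [100, 50, 80, 40],    # Media -> summed across panels
    "unemployment_rate": [4.0, 5.0, 4.2, 5.2],  # Exogenous -> averaged across panels
})

aggregation_dict = {"paid_search_clicks": "sum", "unemployment_rate": "mean"}
selected_panels = ["P1", "P2"]

subset = df[df["Panel_1"].isin(selected_panels)]
aggregated = subset.groupby("date").agg(aggregation_dict).reset_index()
print(aggregated)
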
pages/2_Transformations.py ADDED
@@ -0,0 +1,522 @@
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+
4
+ st.set_page_config(
5
+ page_title="Transformations",
6
+ page_icon=":shark:",
7
+ layout="wide",
8
+ initial_sidebar_state="collapsed",
9
+ )
10
+
11
+ import pickle
12
+ import numpy as np
13
+ import pandas as pd
14
+ from utilities import set_header, load_local_css
15
+ import streamlit_authenticator as stauth
16
+ import yaml
17
+ from yaml import SafeLoader
18
+
19
+ load_local_css("styles.css")
20
+ set_header()
21
+
22
+ # Check for authentication status
23
+ for k, v in st.session_state.items():
24
+ if k not in ["logout", "login", "config"] and not k.startswith(
25
+ "FormSubmitter"
26
+ ):
27
+ st.session_state[k] = v
28
+ with open("config.yaml") as file:
29
+ config = yaml.load(file, Loader=SafeLoader)
30
+ st.session_state["config"] = config
31
+ authenticator = stauth.Authenticate(
32
+ config["credentials"],
33
+ config["cookie"]["name"],
34
+ config["cookie"]["key"],
35
+ config["cookie"]["expiry_days"],
36
+ config["preauthorized"],
37
+ )
38
+ st.session_state["authenticator"] = authenticator
39
+ name, authentication_status, username = authenticator.login("Login", "main")
40
+ auth_status = st.session_state.get("authentication_status")
41
+
42
+ if auth_status == True:
43
+ authenticator.logout("Logout", "main")
44
+ is_state_initialized = st.session_state.get("initialized", False)
45
+
46
+ if not is_state_initialized:
47
+
48
+ if 'session_name' not in st.session_state:
49
+ st.session_state['session_name']=None
50
+
51
+
52
+ # Deserialize and load the objects from the pickle file
53
+ with open("data_import.pkl", "rb") as f:
54
+ data = pickle.load(f)
55
+
56
+ # Accessing the loaded objects
57
+ final_df_loaded = data["final_df"]
58
+ bin_dict_loaded = data["bin_dict"]
59
+
60
+ # Initialize session state
61
+ if "transformed_columns_dict" not in st.session_state:
62
+ st.session_state["transformed_columns_dict"] = {} # Default empty dictionary
63
+
64
+ if "final_df" not in st.session_state:
65
+ st.session_state["final_df"] = final_df_loaded # Default as original dataframe
66
+
67
+ if "summary_string" not in st.session_state:
68
+ st.session_state["summary_string"] = None # Default as None
69
+
70
+ # Extract original columns for specified categories
71
+ original_columns = {
72
+ category: bin_dict_loaded[category]
73
+ for category in ["Media", "Internal", "Exogenous"]
74
+ if category in bin_dict_loaded
75
+ }
76
+
77
+ # Retrieve Panel columns
78
+ panel_1 = bin_dict_loaded.get("Panel Level 1")
79
+ panel_2 = bin_dict_loaded.get("Panel Level 2")
80
+
81
+ # # For testing on non panel level
82
+ # final_df_loaded = final_df_loaded.drop("Panel_1", axis=1)
83
+ # final_df_loaded = final_df_loaded.groupby("date").mean().reset_index()
84
+ # panel_1 = None
85
+
86
+ # Apply transformations on panel level
87
+ st.write("")
88
+ if panel_1:
89
+ panel = panel_1 + panel_2 if panel_2 else panel_1
90
+ else:
91
+ panel = []
92
+
93
+
94
+ # Function to build transformation widgets
95
+ def transformation_widgets(category, transform_params, date_granularity):
96
+ # Transformation Options
97
+ transformation_options = {
98
+ "Media": ["Lag", "Moving Average", "Saturation", "Power", "Adstock"],
99
+ "Internal": ["Lead", "Lag", "Moving Average"],
100
+ "Exogenous": ["Lead", "Lag", "Moving Average"],
101
+ }
102
+
103
+ with st.expander(f"{category} Transformations"):
104
+
105
+ # Let users select which transformations to apply
106
+ transformations_to_apply = st.multiselect(
107
+ "Select transformations to apply",
108
+ options=transformation_options[category],
109
+ default=[],
110
+ key=f"transformation_{category}",
111
+ )
112
+
113
+ # Determine the number of transformations to put in each column
114
+ transformations_per_column = (
115
+ len(transformations_to_apply) // 2 + len(transformations_to_apply) % 2
116
+ )
117
+
118
+ # Create two columns
119
+ col1, col2 = st.columns(2)
120
+
121
+ # Assign transformations to each column
122
+ transformations_col1 = transformations_to_apply[:transformations_per_column]
123
+ transformations_col2 = transformations_to_apply[transformations_per_column:]
124
+
125
+ # Define a helper function to create widgets for each transformation
126
+ def create_transformation_widgets(column, transformations):
127
+ with column:
128
+ for transformation in transformations:
129
+ # Conditionally create widgets for selected transformations
130
+ if transformation == "Lead":
131
+ st.markdown(f"**Lead ({date_granularity})**")
132
+ lead = st.slider(
133
+ "Lead periods",
134
+ 1,
135
+ 10,
136
+ (1, 2),
137
+ 1,
138
+ key=f"lead_{category}",
139
+ label_visibility="collapsed",
140
+ )
141
+ start = lead[0]
142
+ end = lead[1]
143
+ step = 1
144
+ transform_params[category]["Lead"] = np.arange(
145
+ start, end + step, step
146
+ )
147
+
148
+ if transformation == "Lag":
149
+ st.markdown(f"**Lag ({date_granularity})**")
150
+ lag = st.slider(
151
+ "Lag periods",
152
+ 1,
153
+ 10,
154
+ (1, 2),
155
+ 1,
156
+ key=f"lag_{category}",
157
+ label_visibility="collapsed",
158
+ )
159
+ start = lag[0]
160
+ end = lag[1]
161
+ step = 1
162
+ transform_params[category]["Lag"] = np.arange(
163
+ start, end + step, step
164
+ )
165
+
166
+ if transformation == "Moving Average":
167
+ st.markdown(f"**Moving Average ({date_granularity})**")
168
+ window = st.slider(
169
+ "Window size for Moving Average",
170
+ 1,
171
+ 10,
172
+ (1, 2),
173
+ 1,
174
+ key=f"ma_{category}",
175
+ label_visibility="collapsed",
176
+ )
177
+ start = window[0]
178
+ end = window[1]
179
+ step = 1
180
+ transform_params[category]["Moving Average"] = np.arange(
181
+ start, end + step, step
182
+ )
183
+
184
+ if transformation == "Saturation":
185
+ st.markdown("**Saturation (%)**")
186
+ saturation_point = st.slider(
187
+ f"Saturation Percentage",
188
+ 0,
189
+ 100,
190
+ (10, 20),
191
+ 10,
192
+ key=f"sat_{category}",
193
+ label_visibility="collapsed",
194
+ )
195
+ start = saturation_point[0]
196
+ end = saturation_point[1]
197
+ step = 10
198
+ transform_params[category]["Saturation"] = np.arange(
199
+ start, end + step, step
200
+ )
201
+
202
+ if transformation == "Power":
203
+ st.markdown("**Power**")
204
+ power = st.slider(
205
+ f"Power",
206
+ 0,
207
+ 10,
208
+ (2, 4),
209
+ 1,
210
+ key=f"power_{category}",
211
+ label_visibility="collapsed",
212
+ )
213
+ start = power[0]
214
+ end = power[1]
215
+ step = 1
216
+ transform_params[category]["Power"] = np.arange(
217
+ start, end + step, step
218
+ )
219
+
220
+ if transformation == "Adstock":
221
+ st.markdown("**Adstock**")
222
+ rate = st.slider(
223
+ f"Factor ({category})",
224
+ 0.0,
225
+ 1.0,
226
+ (0.5, 0.7),
227
+ 0.05,
228
+ key=f"adstock_{category}",
229
+ label_visibility="collapsed",
230
+ )
231
+ start = rate[0]
232
+ end = rate[1]
233
+ step = 0.05
234
+ adstock_range = [
235
+ round(a, 3) for a in np.arange(start, end + step, step)
236
+ ]
237
+ transform_params[category]["Adstock"] = adstock_range
238
+
239
+ # Create widgets in each column
240
+ create_transformation_widgets(col1, transformations_col1)
241
+ create_transformation_widgets(col2, transformations_col2)
242
+
243
+
244
+ # Function to apply Lag transformation
245
+ def apply_lag(df, lag):
246
+ return df.shift(lag)
247
+
248
+
249
+ # Function to apply Lead transformation
250
+ def apply_lead(df, lead):
251
+ return df.shift(-lead)
252
+
253
+
254
+ # Function to apply Moving Average transformation
255
+ def apply_moving_average(df, window_size):
256
+ return df.rolling(window=window_size).mean()
257
+
258
+
259
+ # Function to apply Saturation transformation
260
+ def apply_saturation(df, saturation_percent_100):
261
+ # Convert saturation percentage from 100-based to fraction
262
+ saturation_percent = saturation_percent_100 / 100.0
263
+
264
+ # Calculate saturation point and steepness
265
+ column_max = df.max()
266
+ column_min = df.min()
267
+ saturation_point = (column_min + column_max) / 2
268
+
269
+ numerator = np.log(
270
+ (1 / (saturation_percent if saturation_percent != 1 else 1 - 1e-9)) - 1
271
+ )
272
+ denominator = np.log(saturation_point / max(column_max, 1e-9))
273
+
274
+ steepness = numerator / max(
275
+ denominator, 1e-9
276
+ ) # Avoid division by zero with a small constant
277
+
278
+ # Apply the saturation transformation
279
+ transformed_series = df.apply(
280
+ lambda x: (1 / (1 + (saturation_point / x) ** steepness)) * x
281
+ )
282
+
283
+ return transformed_series
284
+
285
+
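
For intuition, apply_saturation above damps a series with a logistic (S-shaped) multiplier whose steepness is derived from the chosen saturation percentage. A simpler sketch of the same general idea, not the exact parameterisation used above (the half_saturation value is an invented example):

import pandas as pd

def hill_saturation(series: pd.Series, half_saturation: float, slope: float = 2.0) -> pd.Series:
    # Generic S-curve damping: values well below the half-saturation point are damped
    # heavily, values well above it pass through almost unchanged
    return series / (1.0 + (half_saturation / series) ** slope)

spend = pd.Series([10.0, 50.0, 100.0, 200.0, 400.0])
print(hill_saturation(spend, half_saturation=100.0))
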
286
+ # Function to apply Power transformation
287
+ def apply_power(df, power):
288
+ return df**power
289
+
290
+
291
+ # Function to apply Adstock transformation
292
+ def apply_adstock(df, factor):
293
+ x = 0
294
+ # Use the walrus operator to update x iteratively with the Adstock formula
295
+ adstock_var = [x := x * factor + v for v in df]
296
+ ans = pd.Series(adstock_var, index=df.index)
297
+ return ans
298
+
299
+
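
The adstock list comprehension above is a one-line carry-over recursion. An equivalent explicit version with a tiny worked example (numbers invented) makes the decay visible:

import pandas as pd

def adstock(series: pd.Series, factor: float) -> pd.Series:
    # Each period retains `factor` of the previous adstocked value and adds the current value
    carried, out = 0.0, []
    for v in series:
        carried = carried * factor + v
        out.append(carried)
    return pd.Series(out, index=series.index)

spend = pd.Series([100.0, 0.0, 0.0, 50.0])
print(adstock(spend, factor=0.5))
# -> 100.0, 50.0, 25.0, 62.5 : a single burst of spend keeps contributing in later periods
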
300
+ # Function to generate transformed column names
301
+ @st.cache_resource(show_spinner=False)
302
+ def generate_transformed_columns(original_columns, transform_params):
303
+ transformed_columns, summary = {}, {}
304
+
305
+ for category, columns in original_columns.items():
306
+ for column in columns:
307
+ transformed_columns[column] = []
308
+ summary_details = (
309
+ []
310
+ ) # List to hold transformation details for the current column
311
+
312
+ if category in transform_params:
313
+ for transformation, values in transform_params[category].items():
314
+ # Generate transformed column names for each value
315
+ for value in values:
316
+ transformed_name = f"{column}@{transformation}_{value}"
317
+ transformed_columns[column].append(transformed_name)
318
+
319
+ # Format the values list as a string with commas and "and" before the last item
320
+ if len(values) > 1:
321
+ formatted_values = (
322
+ ", ".join(map(str, values[:-1])) + " and " + str(values[-1])
323
+ )
324
+ else:
325
+ formatted_values = str(values[0])
326
+
327
+ # Add transformation details
328
+ summary_details.append(f"{transformation} ({formatted_values})")
329
+
330
+ # Only add to summary if there are transformation details for the column
331
+ if summary_details:
332
+ formatted_summary = "⮕ ".join(summary_details)
333
+ # Use <strong> tags to make the column name bold
334
+ summary[column] = f"<strong>{column}</strong>: {formatted_summary}"
335
+
336
+ # Generate a comprehensive summary string for all columns
337
+ summary_items = [
338
+ f"{idx + 1}. {details}" for idx, details in enumerate(summary.values())
339
+ ]
340
+
341
+ summary_string = "\n".join(summary_items)
342
+
343
+ return transformed_columns, summary_string
344
+
345
+
346
+ # Function to apply transformations to DataFrame slices based on specified categories and parameters
347
+ @st.cache_resource(show_spinner=False)
348
+ def apply_category_transformations(df, bin_dict, transform_params, panel):
349
+ # Dictionary for function mapping
350
+ transformation_functions = {
351
+ "Lead": apply_lead,
352
+ "Lag": apply_lag,
353
+ "Moving Average": apply_moving_average,
354
+ "Saturation": apply_saturation,
355
+ "Power": apply_power,
356
+ "Adstock": apply_adstock,
357
+ }
358
+
359
+ # Initialize category_df as an empty DataFrame
360
+ category_df = pd.DataFrame()
361
+
362
+ # Iterate through each category specified in transform_params
363
+ for category in ["Media", "Internal", "Exogenous"]:
364
+ if (
365
+ category not in transform_params
366
+ or category not in bin_dict
367
+ or not transform_params[category]
368
+ ):
369
+ continue # Skip categories without transformations
370
+
371
+ # Slice the DataFrame based on the columns specified in bin_dict for the current category
372
+ df_slice = df[bin_dict[category] + panel]
373
+
374
+ # Iterate through each transformation and its parameters for the current category
375
+ for transformation, parameters in transform_params[category].items():
376
+ transformation_function = transformation_functions[transformation]
377
+
378
+ # Check if there is panel data to group by
379
+ if len(panel) > 0:
380
+ # Apply the transformation to each group
381
+ category_df = pd.concat(
382
+ [
383
+ df_slice.groupby(panel)
384
+ .transform(transformation_function, p)
385
+ .add_suffix(f"@{transformation}_{p}")
386
+ for p in parameters
387
+ ],
388
+ axis=1,
389
+ )
390
+
391
+ # Replace all NaN or null values in category_df with 0
392
+ category_df.fillna(0, inplace=True)
393
+
394
+ # Update df_slice
395
+ df_slice = pd.concat(
396
+ [df[panel], category_df],
397
+ axis=1,
398
+ )
399
+
400
+ else:
401
+ for p in parameters:
402
+ # Apply the transformation function to each column
403
+ temp_df = df_slice.apply(
404
+ lambda x: transformation_function(x, p), axis=0
405
+ ).rename(lambda x: f"{x}@{transformation}_{p}", axis="columns")
406
+ # Concatenate the transformed DataFrame slice to the category DataFrame
407
+ category_df = pd.concat([category_df, temp_df], axis=1)
408
+
409
+ # Replace all NaN or null values in category_df with 0
410
+ category_df.fillna(0, inplace=True)
411
+
412
+ # Update df_slice
413
+ df_slice = pd.concat(
414
+ [df[panel], category_df],
415
+ axis=1,
416
+ )
417
+
418
+ # If category_df has been modified, concatenate it with the panel and response metrics from the original DataFrame
419
+ if not category_df.empty:
420
+ final_df = pd.concat([df, category_df], axis=1)
421
+ else:
422
+ # If no transformations were applied, use the original DataFrame
423
+ final_df = df
424
+
425
+ return final_df
426
+
427
+
428
+ # Function to infer the granularity of the date column in a DataFrame
429
+ @st.cache_resource(show_spinner=False)
430
+ def infer_date_granularity(df):
431
+ # Find the most common difference
432
+ common_freq = pd.Series(df["date"].unique()).diff().dt.days.dropna().mode()[0]
433
+
434
+ # Map the most common difference to a granularity
435
+ if common_freq == 1:
436
+ return "daily"
437
+ elif common_freq == 7:
438
+ return "weekly"
439
+ elif 28 <= common_freq <= 31:
440
+ return "monthly"
441
+ else:
442
+ return "irregular"
443
+
444
+
445
+ #########################################################################################################################################################
446
+ # User input for transformations
447
+ #########################################################################################################################################################
448
+
449
+
450
+ # Infer date granularity
451
+ date_granularity = infer_date_granularity(final_df_loaded)
452
+
453
+ # Initialize the main dictionary to store the transformation parameters for each category
454
+ transform_params = {"Media": {}, "Internal": {}, "Exogenous": {}}
455
+
456
+ # User input for transformations
457
+ st.markdown("### Select Transformations to Apply")
458
+ for category in ["Media", "Internal", "Exogenous"]:
459
+ # Skip Internal
460
+ if category == "Internal":
461
+ continue
462
+
463
+ transformation_widgets(category, transform_params, date_granularity)
464
+
465
+
466
+ #########################################################################################################################################################
467
+ # Apply transformations
468
+ #########################################################################################################################################################
469
+
470
+
471
+ # Apply category-based transformations to the DataFrame
472
+ if st.button("Accept and Proceed", use_container_width=True):
473
+ with st.spinner("Applying transformations..."):
474
+ final_df = apply_category_transformations(
475
+ final_df_loaded, bin_dict_loaded, transform_params, panel
476
+ )
477
+
478
+ # Generate a dictionary mapping original column names to lists of transformed column names
479
+ transformed_columns_dict, summary_string = generate_transformed_columns(
480
+ original_columns, transform_params
481
+ )
482
+
483
+ # Store into transformed dataframe and summary session state
484
+ st.session_state["final_df"] = final_df
485
+ st.session_state["summary_string"] = summary_string
486
+
487
+
488
+ #########################################################################################################################################################
489
+ # Display the transformed DataFrame and summary
490
+ #########################################################################################################################################################
491
+
492
+
493
+ # Display the transformed DataFrame in the Streamlit app
494
+ st.markdown("### Transformed DataFrame")
495
+ st.dataframe(st.session_state["final_df"], hide_index=True)
496
+
497
+ # Total rows and columns
498
+ total_rows, total_columns = st.session_state["final_df"].shape
499
+ st.markdown(
500
+ f"<p style='text-align: justify;'>The transformed DataFrame contains <strong>{total_rows}</strong> rows and <strong>{total_columns}</strong> columns.</p>",
501
+ unsafe_allow_html=True,
502
+ )
503
+
504
+ # Display the summary of transformations as markdown
505
+ if st.session_state["summary_string"]:
506
+ with st.expander("Summary of Transformations"):
507
+ st.markdown("### Summary of Transformations")
508
+ st.markdown(st.session_state["summary_string"], unsafe_allow_html=True)
509
+
510
+ @st.cache_resource(show_spinner=False)
511
+ def save_to_pickle(file_path, final_df):
512
+ # Open the file in write-binary mode and dump the objects
513
+ with open(file_path, "wb") as f:
514
+ pickle.dump({"final_df_transformed": final_df}, f)
515
+ # Data is now saved to file
516
+
517
+ if st.button("Accept and Save", use_container_width=True):
518
+
519
+ save_to_pickle(
520
+ "final_df_transformed.pkl", st.session_state["final_df"]
521
+ )
522
+ st.toast("💾 Saved Successfully!")
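
One detail worth calling out from this page: transformed columns are named "<column>@<transformation>_<value>", and when panel columns are present each transformation is computed within a panel via groupby().transform(). A minimal sketch of that pattern with a hypothetical one-period lag (column and panel values are invented):

import pandas as pd

df = pd.DataFrame({
    "Panel_1": ["P1", "P1", "P2", "P2"],
    "tv_spend": [100.0, 200.0, 50.0, 80.0],
})

# Lag of 1 period computed per panel, named in the "<column>@<transformation>_<value>" style
lagged = (
    df.groupby("Panel_1")[["tv_spend"]]
      .transform(lambda s: s.shift(1))
      .add_suffix("@Lag_1")
      .fillna(0)
)
print(pd.concat([df, lagged], axis=1))
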
pages/4_Model_Build.py ADDED
@@ -0,0 +1,826 @@
1
+ '''
2
+ MMO Build Sprint 3
3
+ additions : adding more variables to session state for saved model : random effect, predicted train & test
4
+
5
+ MMO Build Sprint 4
6
+ additions : ability to run models for different response metrics
7
+ '''
8
+
9
+ import streamlit as st
10
+ import pandas as pd
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from Eda_functions import format_numbers
14
+ import numpy as np
15
+ import pickle
16
+ from st_aggrid import AgGrid
17
+ from st_aggrid import GridOptionsBuilder, GridUpdateMode
18
+ from utilities import set_header, load_local_css
19
+ from st_aggrid import GridOptionsBuilder
20
+ import time
21
+ import itertools
22
+ import statsmodels.api as sm
23
+ import numpy as np
24
+ import re
25
+ import itertools
26
+ from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
27
+ from sklearn.preprocessing import MinMaxScaler
28
+ import os
29
+ import matplotlib.pyplot as plt
30
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
31
+
32
+ st.set_option('deprecation.showPyplotGlobalUse', False)
33
+ import statsmodels.api as sm
34
+ import statsmodels.formula.api as smf
35
+
36
+ from datetime import datetime
37
+ import seaborn as sns
38
+ from Data_prep_functions import *
39
+
40
+
41
+
42
+ def get_random_effects(media_data, panel_col, mdf):
43
+ random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
44
+
45
+ for i, market in enumerate(media_data[panel_col].unique()):
46
+ print(i, end='\r')
47
+ intercept = mdf.random_effects[market].values[0]
48
+ random_eff_df.loc[i, 'random_effect'] = intercept
49
+ random_eff_df.loc[i, panel_col] = market
50
+
51
+ return random_eff_df
52
+
53
+
54
+ def mdf_predict(X_df, mdf, random_eff_df):
55
+ X = X_df.copy()
56
+ X['fixed_effect'] = mdf.predict(X)
57
+ X = pd.merge(X, random_eff_df, on=panel_col, how='left')
58
+ X['pred'] = X['fixed_effect'] + X['random_effect']
59
+ # X.to_csv('Test/megred_df.csv',index=False)
60
+ X.drop(columns=['fixed_effect', 'random_effect'], inplace=True)
61
+ return X['pred']
62
+
63
+
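
These two helpers appear to assume mdf is a statsmodels MixedLM fit with a random intercept per panel: mdf.predict gives the fixed-effect part and the per-panel intercept is added back by hand. A self-contained sketch of that decomposition on toy data (all values invented):

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
panels = np.repeat(["P1", "P2", "P3"], 30)
media = rng.uniform(0, 100, size=90)
shift = pd.Series(panels).map({"P1": 5.0, "P2": -3.0, "P3": 0.0}).to_numpy()
df = pd.DataFrame({"panel": panels, "media": media,
                   "y": 2.0 * media + shift + rng.normal(0, 1, size=90)})

# Random-intercept model: one fixed slope for media, one intercept offset per panel
mdf = smf.mixedlm("y ~ media", df, groups=df["panel"]).fit()

fixed = pd.Series(np.asarray(mdf.predict(df)), index=df.index)  # fixed-effect part
random_intercepts = df["panel"].map(lambda g: mdf.random_effects[g].values[0])
pred = fixed + random_intercepts  # same decomposition as get_random_effects + mdf_predict above
print(pred.head())
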
64
+ st.set_page_config(
65
+ page_title="Model Build",
66
+ page_icon=":shark:",
67
+ layout="wide",
68
+ initial_sidebar_state='collapsed'
69
+ )
70
+
71
+ load_local_css('styles.css')
72
+ set_header()
73
+
74
+ st.title('1. Build Your Model')
75
+
76
+ with open("data_import.pkl", "rb") as f:
77
+ data = pickle.load(f)
78
+
79
+ st.session_state['bin_dict'] = data["bin_dict"]
80
+
81
+ #st.write(data["bin_dict"])
82
+
83
+ with open("final_df_transformed.pkl", "rb") as f:
84
+ data = pickle.load(f)
85
+
86
+ # Accessing the loaded objects
87
+ media_data = data["final_df_transformed"]
88
+
89
+ # Sprint4 - available response metrics is a list of all response metrics in the data
90
+ ## these will be put in a drop down
91
+
92
+ st.session_state['media_data']=media_data
93
+
94
+ if 'available_response_metrics' not in st.session_state:
95
+ # st.session_state['available_response_metrics'] = ['Total Approved Accounts - Revenue',
96
+ # 'Total Approved Accounts - Appsflyer',
97
+ # 'Account Requests - Appsflyer',
98
+ # 'App Installs - Appsflyer']
99
+
100
+ st.session_state['available_response_metrics']= st.session_state['bin_dict']["Response Metrics"]
101
+ # Sprint4
102
+ if "is_tuned_model" not in st.session_state:
103
+ st.session_state["is_tuned_model"] = {}
104
+ for resp_metric in st.session_state['available_response_metrics'] :
105
+ resp_metric=resp_metric.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
106
+ st.session_state["is_tuned_model"][resp_metric] = False
107
+
108
+ # Sprint4 - used_response_metrics is a list of resp metrics for which user has created & saved a model
109
+ if 'used_response_metrics' not in st.session_state:
110
+ st.session_state['used_response_metrics'] = []
111
+
112
+ # Sprint4 - saved_model_names
113
+ if 'saved_model_names' not in st.session_state:
114
+ st.session_state['saved_model_names'] = []
115
+
116
+ # if "model_save_flag" not in st.session_state:
117
+ # st.session_state["model_save_flag"]=False
118
+ # def reset_save():
119
+ # st.session_state["model_save_flag"]=False
120
+ # def set_save():
121
+ # st.session_state["model_save_flag"]=True
122
+ # Sprint4 - select a response metric
123
+
124
+
125
+ sel_target_col = st.selectbox("Select the response metric",
126
+ st.session_state['available_response_metrics'])
127
+ # , on_change=reset_save())
128
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
129
+
130
+ new_name_dct={col:col.lower().replace('.','_').lower().replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in media_data.columns}
131
+
132
+ media_data.columns=[col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in media_data.columns]
133
+
134
+ #st.write(st.session_state['bin_dict'])
135
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1']][0]  # set the panel column
136
+ date_col = 'date'
137
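+ # The same rename chain is repeated several times above and below; a small helper
+ # (illustrative only, not used by this page) would keep the cleaning rule in one place:
+ # def clean_name(col):
+ #     return (col.lower().replace('.', '_').replace('@', '_').replace(' ', '_')
+ #             .replace('-', '').replace(':', '').replace('__', '_'))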
+
138
+ #st.write(media_data)
139
+
140
+ is_panel = len(panel_col) > 0
141
+
142
+ if 'is_panel' not in st.session_state:
143
+ st.session_state['is_panel']=False
144
+
145
+
146
+
147
+ # if st.toggle('Apply Transformations on DMA/Panel Level'):
148
+ # media_data = pd.read_csv(r'C:\Users\SrishtiVerma\Mastercard\Sprint2\upf_data_converted_randomized_resp_metrics.csv')
149
+ # media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
150
+ # media_data.columns]
151
+ # dma = st.selectbox('Select the Level of data ',
152
+ # [col for col in media_data.columns if col.lower() in ['dma', 'panel', 'markets']])
153
+ # # is_panel = True
154
+ # # st.session_state['is_panel']=True
155
+ #
156
+ # else:
157
+ # # """ code to aggregate data on date """
158
+ # media_data = pd.read_excel(r'C:\Users\SrishtiVerma\Mastercard\Sprint1\Tactic Level Models\Tactic_level_data_imp_clicks_spends.xlsx')
159
+ # media_data.columns = [i.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for i in
160
+ # media_data.columns]
161
+ # dma = None
162
+ # # is_panel = False
163
+ # # st.session_state['is_panel']=False
164
+
165
+ #media_data = st.session_state["final_df"]
166
+
167
+
168
+
169
+ # st.write(media_data.columns)
170
+
171
+ media_data.sort_values(date_col, inplace=True)
172
+ media_data.reset_index(drop=True, inplace=True)
173
+
174
+ date = media_data[date_col]
175
+ st.session_state['date'] = date
176
+ # revenue=media_data[target_col]
177
+ y = media_data[target_col]
178
+
179
+ if is_panel:
180
+ spends_data = media_data[
181
+ [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()] + [date_col, panel_col]]
182
+ # Sprint3 - spends for resp curves
183
+ else:
184
+ spends_data = media_data[
185
+ [c for c in media_data.columns if "_cost" in c.lower() or "_spend" in c.lower()] + [date_col]]
186
+
187
+ y = media_data[target_col]
188
+ # media_data.drop([target_col],axis=1,inplace=True)
189
+ media_data.drop([date_col], axis=1, inplace=True)
190
+ media_data.reset_index(drop=True, inplace=True)
191
+
192
+ # dma_dict={ dm:media_data[media_data[dma]==dm] for dm in media_data[dma].unique()}
193
+
194
+ # st.markdown('## Select the Range of Transformations')
195
+ columns = st.columns(2)
196
+
197
+ old_shape = media_data.shape
198
+
199
+ if "old_shape" not in st.session_state:
200
+ st.session_state['old_shape'] = old_shape
201
+
202
+ # with columns[0]:
203
+ # slider_value_adstock = st.slider('Select Adstock Range (only applied to media)', 0.0, 1.0, (0.2, 0.4), step=0.1,
204
+ # format="%.2f")
205
+ # with columns[1]:
206
+ # slider_value_lag = st.slider('Select Lag Range (applied to media, seasonal, macroeconomic variables)', 1, 7, (1, 3),
207
+ # step=1)
208
+
209
+
210
+ # with columns[2]:
211
+ # slider_value_power=st.slider('Select Power range (only applied to media )',0,4,(1,2),step=1)
212
+
213
+ # with columns[1]:
214
+ # st.number_input('Select the range of half saturation point ',min_value=1,max_value=5)
215
+ # st.number_input('Select the range of ')
216
+
217
+ # Section 1 - Transformations Functions
218
+ # def lag(data, features, lags, dma=None):
219
+ # if dma:
220
+ #
221
+ # transformed_data = pd.concat(
222
+ # [data.groupby([dma])[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
223
+ # # transformed_data = transformed_data.fillna(method='bfill')
224
+ # transformed_data = transformed_data.bfill() # Sprint4 - fillna getting deprecated
225
+ # return pd.concat([transformed_data, data], axis=1)
226
+ #
227
+ # else:
228
+ #
229
+ # # ''' data should be aggregated on date'''
230
+ #
231
+ # transformed_data = pd.concat([data[features].shift(lag).add_suffix(f'_lag_{lag}') for lag in lags], axis=1)
232
+ # # transformed_data = transformed_data.fillna(method='bfill')
233
+ # transformed_data = transformed_data.bfill()
234
+ #
235
+ # return pd.concat([transformed_data, data], axis=1)
236
+ #
237
+ #
238
+ # # adstock
239
+ # def adstock(df, alphas, cutoff, features, dma=None):
240
+ # if dma:
241
+ # transformed_data = pd.DataFrame()
242
+ # for d in df[dma].unique():
243
+ # dma_sub_df = df[df[dma] == d]
244
+ # n = len(dma_sub_df)
245
+ #
246
+ # weights = np.array(
247
+ # [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for
248
+ # alpha in alphas])
249
+ # X = dma_sub_df[features].to_numpy()
250
+ #
251
+ # res = pd.DataFrame(np.hstack(weights @ X),
252
+ # columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
253
+ #
254
+ # transformed_data = pd.concat([transformed_data, res], axis=0)
255
+ # transformed_data.reset_index(drop=True, inplace=True)
256
+ # return pd.concat([transformed_data, df], axis=1)
257
+ #
258
+ # else:
259
+ #
260
+ # n = len(df)
261
+ #
262
+ # weights = np.array(
263
+ # [[[alpha ** (i - j) if i >= j and j >= i - cutoff else 0. for j in range(n)] for i in range(n)] for alpha in
264
+ # alphas])
265
+ #
266
+ # X = df[features].to_numpy()
267
+ # res = pd.DataFrame(np.hstack(weights @ X),
268
+ # columns=[f'{col}_adstock_{alpha}' for alpha in alphas for col in features])
269
+ # return pd.concat([res, df], axis=1)
270
+
271
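+ # The commented block above builds adstock through explicit weight matrices with a cutoff;
+ # the recursive (no-cutoff) form of the same geometric decay is a useful reference sketch:
+ # def geometric_adstock(x, alpha):
+ #     out, carry = [], 0.0
+ #     for v in x:
+ #         carry = v + alpha * carry
+ #         out.append(carry)
+ #     return pd.Series(out, index=x.index)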
+
272
+ # Section 2 - Begin Transformations
273
+
274
+ if 'media_data' not in st.session_state:
275
+ st.session_state['media_data'] = pd.DataFrame()
276
+
277
+ # Sprint3
278
+ if "orig_media_data" not in st.session_state:
279
+ st.session_state['orig_media_data'] = pd.DataFrame()
280
+
281
+ # Sprint3 additions
282
+ if 'random_effects' not in st.session_state:
283
+ st.session_state['random_effects'] = pd.DataFrame()
284
+ if 'pred_train' not in st.session_state:
285
+ st.session_state['pred_train'] = []
286
+ if 'pred_test' not in st.session_state:
287
+ st.session_state['pred_test'] = []
288
+ # end of Sprint3 additions
289
+
290
+ # variables_to_be_transformed=[col for col in media_data.columns if col.lower() not in ['dma','panel'] ] # change for buckets
291
+ # variables_to_be_transformed = [col for col in media_data.columns if
292
+ # '_clicks' in col.lower() or '_impress' in col.lower()] # srishti - change
293
+ #
294
+ # with columns[0]:
295
+ # if st.button('Apply Transformations'):
296
+ # with st.spinner('Applying Transformations'):
297
+ # transformed_data_lag = lag(media_data, features=variables_to_be_transformed,
298
+ # lags=np.arange(slider_value_lag[0], slider_value_lag[1] + 1, 1), dma=dma)
299
+ #
300
+ # # variables_to_be_transformed=[col for col in list(transformed_data_lag.columns) if col not in ['Date','DMA','Panel']] #change for buckets
301
+ # variables_to_be_transformed = [col for col in media_data.columns if
302
+ # '_clicks' in col.lower() or '_impress' in col.lower()] # srishti - change
303
+ #
304
+ # transformed_data_adstock = adstock(df=transformed_data_lag,
305
+ # alphas=np.arange(slider_value_adstock[0], slider_value_adstock[1], 0.1),
306
+ # cutoff=8, features=variables_to_be_transformed, dma=dma)
307
+ #
308
+ # # st.success('Done')
309
+ # st.success("Transformations complete!")
310
+ #
311
+ # st.write(f'old shape {old_shape}, new shape {transformed_data_adstock.shape}')
312
+ #
313
+ # transformed_data_adstock.columns = [c.replace(".", "_") for c in
314
+ # transformed_data_adstock.columns] # srishti
315
+ # st.session_state['media_data'] = transformed_data_adstock # srishti
316
+ # # Sprint3
317
+ # orig_media_data = media_data.copy()
318
+ # orig_media_data[date_col] = date
319
+ # orig_media_data[target_col] = y
320
+ # st.session_state['orig_media_data'] = orig_media_data # srishti
321
+ #
322
+ # # with st.spinner('Applying Transformations'):
323
+ # # time.sleep(2)
324
+ # # st.success("Transformations complete!")
325
+ #
326
+ # # if st.session_state['media_data'].shape[1]>old_shape[1]:
327
+ # # with columns[0]:
328
+ # # st.write(f'Total no.of variables before transformation: {old_shape[1]}, Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
329
+ # # st.write(f'Total no.of variables after transformation: {st.session_state["media_data"].shape[1]}')
330
+
331
+ # Section 3 - Create combinations
332
+
333
+ # bucket=['paid_search', 'kwai','indicacao','infleux', 'influencer','FB: Level Achieved - Tier 1 Impressions',
334
+ # ' FB: Level Achieved - Tier 2 Impressions','paid_social_others',
335
+ # ' GA App: Will And Cid Pequena Baixo Risco Clicks',
336
+ # 'digital_tactic_others',"programmatic"
337
+ # ]
338
+
339
+ # srishti - bucket names changed
340
+ bucket = ['paid_search', 'kwai', 'indicacao', 'infleux', 'influencer', 'fb_level_achieved_tier_2',
341
+ 'fb_level_achieved_tier_1', 'paid_social_others',
342
+ 'ga_app',
343
+ 'digital_tactic_others', "programmatic"
344
+ ]
345
+
346
+ with columns[0]:
347
+ if st.button('Create Combinations of Variables'):
348
+
349
+ top_3_correlated_features = []
350
+ # # for col in st.session_state['media_data'].columns[:19]:
351
+ # original_cols = [c for c in st.session_state['media_data'].columns if
352
+ # "_clicks" in c.lower() or "_impressions" in c.lower()]
353
+ #original_cols = [c for c in original_cols if "_lag" not in c.lower() and "_adstock" not in c.lower()]
354
+
355
+ original_cols=st.session_state['bin_dict']['Media'] + st.session_state['bin_dict']['Internal']
356
+
357
+ original_cols=[col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in original_cols]
358
+
359
+ #st.write(original_cols)
360
+ # for col in st.session_state['media_data'].columns[:19]:
361
+ for col in original_cols: # srishti - new
362
+ corr_df = pd.concat([st.session_state['media_data'].filter(regex=col),
363
+ y], axis=1).corr()[target_col].iloc[:-1]
364
+ top_3_correlated_features.append(list(corr_df.sort_values(ascending=False).head(2).index))
365
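+ # (Despite the list's name, only the top 2 transformed variants per original column
+ # are kept here, via head(2).)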
+ flattened_list = [item for sublist in top_3_correlated_features for item in sublist]
366
+ # all_features_set={var:[col for col in flattened_list if var in col] for var in bucket}
367
+ all_features_set = {var: [col for col in flattened_list if var in col] for var in bucket if
368
+ len([col for col in flattened_list if var in col]) > 0} # srishti
369
+
370
+ channels_all = [values for values in all_features_set.values()]
371
+ st.session_state['combinations'] = list(itertools.product(*channels_all))
372
+ # if 'combinations' not in st.session_state:
373
+ # st.session_state['combinations']=combinations_all
374
+
375
+ st.session_state['final_selection'] = st.session_state['combinations']
376
+ st.success('Done')
377
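+ # Illustrative note: with one candidate list per channel bucket, itertools.product yields
+ # every cross-bucket feature set, so the count is the product of the list lengths, e.g.
+ # list(itertools.product(['a1', 'a2'], ['b1']))  ->  [('a1', 'b1'), ('a2', 'b1')]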
+
378
+ # revenue.reset_index(drop=True,inplace=True)
379
+ y.reset_index(drop=True, inplace=True)
380
+ if 'Model_results' not in st.session_state:
381
+ st.session_state['Model_results'] = {'Model_object': [],
382
+ 'Model_iteration': [],
383
+ 'Feature_set': [],
384
+ 'MAPE': [],
385
+ 'R2': [],
386
+ 'ADJR2': [],
387
+ 'pos_count': []
388
+ }
389
+
390
+
391
+ def reset_model_result_dct():
392
+ st.session_state['Model_results'] = {'Model_object': [],
393
+ 'Model_iteration': [],
394
+ 'Feature_set': [],
395
+ 'MAPE': [],
396
+ 'R2': [],
397
+ 'ADJR2': [],
398
+ 'pos_count': []
399
+ }
400
+
401
+ # if st.button('Build Model'):
402
+
403
+
404
+ if 'iterations' not in st.session_state:
405
+ st.session_state['iterations'] = 0
406
+
407
+ if 'final_selection' not in st.session_state:
408
+ st.session_state['final_selection'] = False
409
+
410
+ save_path = r"Model/"
411
+ with columns[1]:
412
+ if st.session_state['final_selection']:
413
+ st.write(f'Total combinations created {format_numbers(len(st.session_state["final_selection"]))}')
414
+
415
+ if st.checkbox('Build all iterations'):
416
+ iterations = len(st.session_state['final_selection'])
417
+ else:
418
+ iterations = st.number_input('Select the number of iterations to perform', min_value=0, step=100,
419
+ value=st.session_state['iterations'], on_change=reset_model_result_dct)
420
+ # st.write("iterations=", iterations)
421
+
422
+
423
+ if st.button('Build Model', on_click=reset_model_result_dct):
424
+ st.session_state['iterations'] = iterations
425
+
426
+ # Section 4 - Model
427
+ # st.session_state['media_data'] = st.session_state['media_data'].fillna(method='ffill')
428
+ st.session_state['media_data'] = st.session_state['media_data'].ffill()
429
+ st.markdown(
430
+ 'Data Split -- Training Period: May 9th, 2023 - October 5th, 2023, Testing Period: October 6th, 2023 - November 7th, 2023')
431
+ progress_bar = st.progress(0) # Initialize the progress bar
432
+ # time_remaining_text = st.empty() # Create an empty space for time remaining text
433
+ start_time = time.time() # Record the start time
434
+ progress_text = st.empty()
435
+
436
+ # time_elapsed_text = st.empty()
437
+ # for i, selected_features in enumerate(st.session_state["final_selection"][40000:40000 + int(iterations)]):
438
+ # st.write(st.session_state["final_selection"])
439
+ # for i, selected_features in enumerate(st.session_state["final_selection"]):
440
+
441
+ if is_panel == True:
442
+ for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]): # srishti
443
+ df = st.session_state['media_data']
444
+
445
+ fet = [var for var in selected_features if len(var) > 0]
446
+ inp_vars_str = " + ".join(fet) # new
447
+
448
+ X = df[fet]
449
+ y = df[target_col]
450
+ ss = MinMaxScaler()
451
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
452
+
453
+ X[target_col] = y # Sprint2
454
+ X[panel_col] = df[panel_col] # Sprint2
455
+
456
+ X_train = X.iloc[:8000]
457
+ X_test = X.iloc[8000:]
458
+ y_train = y.iloc[:8000]
459
+ y_test = y.iloc[8000:]
460
+
461
+ print(X_train.shape)
462
+ # model = sm.OLS(y_train, X_train).fit()
463
+ md_str = target_col + " ~ " + inp_vars_str
464
+ # md = smf.mixedlm("total_approved_accounts_revenue ~ {}".format(inp_vars_str),
465
+ # data=X_train[[target_col] + fet],
466
+ # groups=X_train[panel_col])
467
+ md = smf.mixedlm(md_str,
468
+ data=X_train[[target_col] + fet],
469
+ groups=X_train[panel_col])
470
+ mdf = md.fit()
471
+ predicted_values = mdf.fittedvalues
472
+
473
+ coefficients = mdf.fe_params.to_dict()
474
+ model_positive = [col for col in coefficients.keys() if coefficients[col] > 0]
475
+
476
+ pvalues = [var for var in list(mdf.pvalues) if var <= 0.06]
477
+
478
+ if (len(model_positive) / len(selected_features)) > 0 and (
479
+ len(pvalues) / len(selected_features)) >= 0: # srishti - changed just for testing, revert later
480
+ # predicted_values = model.predict(X_train)
481
+ mape = mean_absolute_percentage_error(y_train, predicted_values)
482
+ r2 = r2_score(y_train, predicted_values)
483
+ adjr2 = 1 - (1 - r2) * (len(y_train) - 1) / (len(y_train) - len(selected_features) - 1)
484
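+ # i.e. adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), with n training rows
+ # and p selected features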
+
485
+ filename = os.path.join(save_path, f"model_{i}.pkl")
486
+ with open(filename, "wb") as f:
487
+ pickle.dump(mdf, f)
488
+ # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
489
+ # model = pickle.load(file)
490
+
491
+ st.session_state['Model_results']['Model_object'].append(filename)
492
+ st.session_state['Model_results']['Model_iteration'].append(i)
493
+ st.session_state['Model_results']['Feature_set'].append(fet)
494
+ st.session_state['Model_results']['MAPE'].append(mape)
495
+ st.session_state['Model_results']['R2'].append(r2)
496
+ st.session_state['Model_results']['pos_count'].append(len(model_positive))
497
+ st.session_state['Model_results']['ADJR2'].append(adjr2)
498
+
499
+ current_time = time.time()
500
+ time_taken = current_time - start_time
501
+ time_elapsed_minutes = time_taken / 60
502
+ completed_iterations_text = f"{i + 1}/{iterations}"
503
+ progress_bar.progress((i + 1) / int(iterations))
504
+ progress_text.text(
505
+ f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
506
+ st.write(
507
+ f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
508
+
509
+ else:
510
+
511
+ for i, selected_features in enumerate(st.session_state["final_selection"][0:int(iterations)]): # srishti
512
+ df = st.session_state['media_data']
513
+
514
+ fet = [var for var in selected_features if len(var) > 0]
515
+ inp_vars_str = " + ".join(fet)
516
+
517
+ X = df[fet]
518
+ y = df[target_col]
519
+ ss = MinMaxScaler()
520
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
521
+ X = sm.add_constant(X)
522
+ X_train = X.iloc[:130]
523
+ X_test = X.iloc[130:]
524
+ y_train = y.iloc[:130]
525
+ y_test = y.iloc[130:]
526
+
527
+ model = sm.OLS(y_train, X_train).fit()
528
+
529
+
530
+ coefficients = model.params.to_list()
531
+ model_positive = [coef for coef in coefficients if coef > 0]
532
+ predicted_values = model.predict(X_train)
533
+ pvalues = [var for var in list(model.pvalues) if var <= 0.06]
534
+
535
+ # if (len(model_possitive) / len(selected_features)) > 0.9 and (len(pvalues) / len(selected_features)) >= 0.8:
536
+ if (len(model_positive) / len(selected_features)) > 0 and (len(pvalues) / len(
537
+ selected_features)) >= 0.5: # srishti - changed just for testing, revert later VALID MODEL CRITERIA
538
+ # predicted_values = model.predict(X_train)
539
+ mape = mean_absolute_percentage_error(y_train, predicted_values)
540
+ adjr2 = model.rsquared_adj
541
+ r2 = model.rsquared
542
+
543
+ filename = os.path.join(save_path, f"model_{i}.pkl")
544
+ with open(filename, "wb") as f:
545
+ pickle.dump(model, f)
546
+ # with open(r"C:\Users\ManojP\Documents\MMM\simopt\Model\model.pkl", 'rb') as file:
547
+ # model = pickle.load(file)
548
+
549
+ st.session_state['Model_results']['Model_object'].append(filename)
550
+ st.session_state['Model_results']['Model_iteration'].append(i)
551
+ st.session_state['Model_results']['Feature_set'].append(fet)
552
+ st.session_state['Model_results']['MAPE'].append(mape)
553
+ st.session_state['Model_results']['R2'].append(r2)
554
+ st.session_state['Model_results']['ADJR2'].append(adjr2)
555
+ st.session_state['Model_results']['pos_count'].append(len(model_positive))
556
+
557
+ current_time = time.time()
558
+ time_taken = current_time - start_time
559
+ time_elapsed_minutes = time_taken / 60
560
+ completed_iterations_text = f"{i + 1}/{iterations}"
561
+ progress_bar.progress((i + 1) / int(iterations))
562
+ progress_text.text(
563
+ f'Completed iterations: {completed_iterations_text}, Time Elapsed (min): {time_elapsed_minutes:.2f}')
564
+ st.write(
565
+ f'Out of {st.session_state["iterations"]} iterations: {len(st.session_state["Model_results"]["Model_object"])} valid models')
566
+
567
+ pd.DataFrame(st.session_state['Model_results']).to_csv('model_output.csv')
568
+
569
+
570
+ def to_percentage(value):
571
+ return f'{value * 100:.1f}%'
572
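+ # e.g. to_percentage(0.1234) -> '12.3%'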
+
573
+ ## Section 5 - Select Model
574
+ st.title('2. Select Models')
575
+ if 'tick' not in st.session_state:
576
+ st.session_state['tick'] = False
577
+ if st.checkbox('Show results of top 10 models (based on MAPE and Adj. R2)', value=st.session_state['tick']):
578
+ st.session_state['tick'] = True
579
+ st.write('Select one model iteration to generate performance metrics for it:')
580
+ data = pd.DataFrame(st.session_state['Model_results'])
581
+ data = data[data['pos_count']==data['pos_count'].max()].reset_index(drop=True) # Sprint4 -- Srishti -- only show models with the lowest num of neg coeffs
582
+ data.sort_values(by=['ADJR2'], ascending=False, inplace=True)
583
+ data.drop_duplicates(subset='Model_iteration', inplace=True)
584
+ top_10 = data.head(10)
585
+ top_10['Rank'] = np.arange(1, len(top_10) + 1, 1)
586
+ top_10[['MAPE', 'R2', 'ADJR2']] = np.round(top_10[['MAPE', 'R2', 'ADJR2']], 4).applymap(to_percentage)
587
+ top_10_table = top_10[['Rank', 'Model_iteration', 'MAPE', 'ADJR2', 'R2']]
588
+ # top_10_table.columns=[['Rank','Model Iteration Index','MAPE','Adjusted R2','R2']]
589
+ gd = GridOptionsBuilder.from_dataframe(top_10_table)
590
+ gd.configure_pagination(enabled=True)
591
+
592
+ gd.configure_selection(
593
+ use_checkbox=True,
594
+ selection_mode="single",
595
+ pre_select_all_rows=False,
596
+ pre_selected_rows=[1],
597
+ )
598
+
599
+ gridoptions = gd.build()
600
+
601
+ table = AgGrid(top_10, gridOptions=gridoptions, update_mode=GridUpdateMode.SELECTION_CHANGED)
602
+
603
+ selected_rows = table.selected_rows
604
+ # if st.session_state["selected_rows"] != selected_rows:
605
+ # st.session_state["build_rc_cb"] = False
606
+ st.session_state["selected_rows"] = selected_rows
607
+ if 'Model' not in st.session_state:
608
+ st.session_state['Model'] = {}
609
+
610
+ # Section 6 - Display Results
611
+
612
+ if len(selected_rows) > 0:
613
+ st.header('2.1 Results Summary')
614
+
615
+ model_object = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Model_object']
616
+ features_set = data[data['Model_iteration'] == selected_rows[0]['Model_iteration']]['Feature_set']
617
+
618
+ with open(str(model_object.values[0]), 'rb') as file:
619
+ # print(file)
620
+ model = pickle.load(file)
621
+ st.write(model.summary())
622
+ st.header('2.2 Actual vs. Predicted Plot')
623
+
624
+ if is_panel :
625
+ df = st.session_state['media_data']
626
+ X = df[features_set.values[0]]
627
+ y = df[target_col]
628
+
629
+ ss = MinMaxScaler()
630
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
631
+
632
+ # Sprint2 changes
633
+ X[target_col] = y # new
634
+ X[panel_col] = df[panel_col]
635
+ X[date_col] = date
636
+
637
+ X_train = X.iloc[:8000]
638
+ X_test = X.iloc[8000:].reset_index(drop=True)
639
+ y_train = y.iloc[:8000]
640
+ y_test = y.iloc[8000:].reset_index(drop=True)
641
+
642
+ test_spends = spends_data[8000:] # Sprint3 - test spends for resp curves
643
+ random_eff_df = get_random_effects(media_data, panel_col, model)
644
+ train_pred = model.fittedvalues
645
+ test_pred = mdf_predict(X_test, model, random_eff_df)
646
+ print("__" * 20, test_pred.isna().sum())
647
+
648
+ else :
649
+ df = st.session_state['media_data']
650
+ X = df[features_set.values[0]]
651
+ y = df[target_col]
652
+
653
+ ss = MinMaxScaler()
654
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
655
+ X = sm.add_constant(X)
656
+
657
+ X[date_col] = date
658
+
659
+ X_train = X.iloc[:130]
660
+ X_test = X.iloc[130:].reset_index(drop=True)
661
+ y_train = y.iloc[:130]
662
+ y_test = y.iloc[130:].reset_index(drop=True)
663
+
664
+ test_spends = spends_data[130:] # Sprint3 - test spends for resp curves
665
+ train_pred = model.predict(X_train[features_set.values[0]+['const']])
666
+ test_pred = model.predict(X_test[features_set.values[0]+['const']])
667
+
668
+
669
+ # save train/test actuals and predictions to the Test folder - srishti
670
+ x_test_to_save = X_test.copy()
671
+ x_test_to_save['Actuals'] = y_test
672
+ x_test_to_save['Predictions'] = test_pred
673
+
674
+ x_train_to_save = X_train.copy()
675
+ x_train_to_save['Actuals'] = y_train
676
+ x_train_to_save['Predictions'] = train_pred
677
+
678
+ x_train_to_save.to_csv('Test/x_train_to_save.csv', index=False)
679
+ x_test_to_save.to_csv('Test/x_test_to_save.csv', index=False)
680
+
681
+ st.session_state['X'] = X_train
682
+ st.session_state['features_set'] = features_set.values[0]
683
+ print("**" * 20, "selected model features : ", features_set.values[0])
684
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train, train_pred,
685
+ model, target_column=sel_target_col,
686
+ is_panel=is_panel) # Sprint2
687
+
688
+ st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
689
+
690
+ st.markdown('## 2.3 Residual Analysis')
691
+ columns = st.columns(2)
692
+ with columns[0]:
693
+ fig = plot_residual_predicted(y_train, train_pred, X_train) # Sprint2
694
+ st.plotly_chart(fig)
695
+
696
+ with columns[1]:
697
+ st.empty()
698
+ fig = qqplot(y_train, train_pred) # Sprint2
699
+ st.plotly_chart(fig)
700
+
701
+ with columns[0]:
702
+ fig = residual_distribution(y_train, train_pred) # Sprint2
703
+ st.pyplot(fig)
704
+
705
+ vif_data = pd.DataFrame()
706
+ # X=X.drop('const',axis=1)
707
+ X_train_orig = X_train.copy() # Sprint2 -- creating a copy of xtrain. Later deleting panel, target & date from xtrain
708
+ del_col_list = list(set([target_col, panel_col, date_col]).intersection(list(X_train.columns)))
709
+ X_train.drop(columns=del_col_list, inplace=True) # Sprint2
710
+
711
+ vif_data["Variable"] = X_train.columns
712
+ vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
713
+ vif_data.sort_values(by=['VIF'], ascending=False, inplace=True)
714
+ vif_data = np.round(vif_data)
715
+ vif_data['VIF'] = vif_data['VIF'].astype(float)
716
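+ # For reference: VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing feature j on
+ # the remaining features; the colour bands below treat VIF < 3 as low, 3-10 as moderate
+ # and > 10 as high multicollinearity.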
+ st.header('2.4 Variance Inflation Factor (VIF)')
717
+ # st.dataframe(vif_data)
718
+ color_mapping = {
719
+ 'darkgreen': (vif_data['VIF'] < 3),
720
+ 'orange': (vif_data['VIF'] >= 3) & (vif_data['VIF'] <= 10),
721
+ 'darkred': (vif_data['VIF'] > 10)
722
+ }
723
+
724
+ # Create a horizontal bar plot
725
+ fig, ax = plt.subplots()
726
+ fig.set_figwidth(10) # Adjust the width of the figure as needed
727
+
728
+ # Sort the bars by descending VIF values
729
+ vif_data = vif_data.sort_values(by='VIF', ascending=False)
730
+
731
+ # Iterate through the color mapping and plot bars with corresponding colors
732
+ for color, condition in color_mapping.items():
733
+ subset = vif_data[condition]
734
+ bars = ax.barh(subset["Variable"], subset["VIF"], color=color, label=color)
735
+
736
+ # Add text annotations on top of the bars
737
+ for bar in bars:
738
+ width = bar.get_width()
739
+ ax.annotate(f'{width:}', xy=(width, bar.get_y() + bar.get_height() / 2), xytext=(5, 0),
740
+ textcoords='offset points', va='center')
741
+
742
+ # Customize the plot
743
+ ax.set_xlabel('VIF Values')
744
+ # ax.set_title('2.4 Variance Inflation Factor (VIF)')
745
+ # ax.legend(loc='upper right')
746
+
747
+ # Display the plot in Streamlit
748
+ st.pyplot(fig)
749
+
750
+ with st.expander('Results Summary Test data'):
751
+ # ss = MinMaxScaler()
752
+ # X_test = pd.DataFrame(ss.fit_transform(X_test), columns=X_test.columns)
753
+ st.header('2.2 Actual vs. Predicted Plot')
754
+
755
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_test[date_col], y_test,
756
+ test_pred, model,
757
+ target_column=sel_target_col,
758
+ is_panel=is_panel) # Sprint2
759
+
760
+ st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
761
+
762
+ st.markdown('## 2.3 Residual Analysis')
763
+ columns = st.columns(2)
764
+ with columns[0]:
765
+ fig = plot_residual_predicted(y_test, test_pred, X_test) # Sprint2
766
+ st.plotly_chart(fig)
767
+
768
+ with columns[1]:
769
+ st.empty()
770
+ fig = qqplot(y_test, test_pred) # Sprint2
771
+ st.plotly_chart(fig)
772
+
773
+ with columns[0]:
774
+ fig = residual_distribution(y_test, test_pred) # Sprint2
775
+ st.pyplot(fig)
776
+
777
+ value = False
778
+ save_button_model = st.checkbox('Save this model to tune', key='build_rc_cb') # , on_click=set_save())
779
+
780
+ if save_button_model:
781
+ mod_name = st.text_input('Enter model name')
782
+ if len(mod_name) > 0:
783
+ mod_name = mod_name + "__" + target_col # Sprint4 - adding target col to model name
784
+ if is_panel :
785
+ pred_train= model.fittedvalues
786
+ pred_test= mdf_predict(X_test, model, random_eff_df)
787
+ else :
788
+ st.session_state['features_set'] = st.session_state['features_set'] + ['const']
789
+ pred_train= model.predict(X_train_orig[st.session_state['features_set']])
790
+ pred_test= model.predict(X_test[st.session_state['features_set']])
791
+
792
+ st.session_state['Model'][mod_name] = {"Model_object": model,
793
+ 'feature_set': st.session_state['features_set'],
794
+ 'X_train': X_train_orig,
795
+ 'X_test': X_test,
796
+ 'y_train': y_train,
797
+ 'y_test': y_test,
798
+ 'pred_train':pred_train,
799
+ 'pred_test': pred_test
800
+ }
801
+ st.session_state['X_train'] = X_train_orig
802
+ # st.session_state['X_test'] = X_test
803
+ # st.session_state['y_train'] = y_train
804
+ # st.session_state['y_test'] = y_test
805
+ st.session_state['X_test_spends'] = test_spends
806
+ # st.session_state['base_model'] = model
807
+ # st.session_state['base_model_feature_set'] = st.session_state['features_set']
808
+ st.session_state['saved_model_names'].append(mod_name)
809
+ # Sprint3 additions
810
+ if is_panel :
811
+ random_eff_df = get_random_effects(media_data, panel_col, model)
812
+ st.session_state['random_effects'] = random_eff_df
813
+
814
+ # st.session_state['pred_train'] = model.fittedvalues
815
+ # st.session_state['pred_test'] = mdf_predict(X_test, model, random_eff_df)
816
+ # # End of Sprint3 additions
817
+
818
+ with open("best_models.pkl", "wb") as f:
819
+ pickle.dump(st.session_state['Model'], f)
820
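+ # Reload sketch (illustrative): other pages can read this bundle back with
+ # with open("best_models.pkl", "rb") as f:
+ #     saved = pickle.load(f)
+ # and then use saved[mod_name]["Model_object"], saved[mod_name]["feature_set"], etc.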
+ st.success(mod_name + ' model saved! Proceed to the next page to tune the model')
821
+ urm = st.session_state['used_response_metrics']
822
+ urm.append(sel_target_col)
823
+ st.session_state['used_response_metrics'] = list(set(urm))
824
+ mod_name = ""
825
+ # Sprint4 - add the formatted name of the target col to used resp metrics
826
+ value = False
pages/4_Saved_Model_Results.py CHANGED
@@ -7,16 +7,14 @@ import statsmodels.api as sm
7
  from sklearn.metrics import mean_absolute_percentage_error
8
  import sys
9
  import os
10
- from utilities import (set_header,
11
- load_local_css,
12
- load_authenticator)
13
  import seaborn as sns
14
  import matplotlib.pyplot as plt
15
  import sweetviz as sv
16
  import tempfile
17
  from sklearn.preprocessing import MinMaxScaler
18
  from st_aggrid import AgGrid
19
- from st_aggrid import GridOptionsBuilder,GridUpdateMode
20
  from st_aggrid import GridOptionsBuilder
21
  import sys
22
  import re
@@ -24,390 +22,586 @@ import re
24
  sys.setrecursionlimit(10**6)
25
 
26
  original_stdout = sys.stdout
27
- sys.stdout = open('temp_stdout.txt', 'w')
28
  sys.stdout.close()
29
  sys.stdout = original_stdout
30
 
31
- st.set_page_config(layout='wide')
32
- load_local_css('styles.css')
33
  set_header()
34
 
35
  for k, v in st.session_state.items():
36
- if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
37
  st.session_state[k] = v
38
 
39
- authenticator = st.session_state.get('authenticator')
40
  if authenticator is None:
41
  authenticator = load_authenticator()
42
 
43
- name, authentication_status, username = authenticator.login('Login', 'main')
44
- auth_status = st.session_state.get('authentication_status')
45
 
46
  if auth_status == True:
47
- is_state_initiaized = st.session_state.get('initialized',False)
48
  if not is_state_initiaized:
49
- a=1
50
-
51
 
52
  def plot_residual_predicted(actual, predicted, df_):
53
- df_['Residuals'] = actual - pd.Series(predicted)
54
- df_['StdResidual'] = (df_['Residuals'] - df_['Residuals'].mean()) / df_['Residuals'].std()
55
-
56
- # Create a Plotly scatter plot
57
- fig = px.scatter(df_, x=predicted, y='StdResidual', opacity=0.5,color_discrete_sequence=["#11B6BD"])
58
-
59
- # Add horizontal lines
60
- fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
61
- fig.add_hline(y=2, line_color="red")
62
- fig.add_hline(y=-2, line_color="red")
63
-
64
- fig.update_xaxes(title='Predicted')
65
- fig.update_yaxes(title='Standardized Residuals (Actual - Predicted)')
66
-
67
- # Set the same width and height for both figures
68
- fig.update_layout(title='Residuals over Predicted Values', autosize=False, width=600, height=400)
69
-
70
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def residual_distribution(actual, predicted):
73
- Residuals = actual - pd.Series(predicted)
74
-
75
- # Create a Seaborn distribution plot
76
- sns.set(style="whitegrid")
77
- plt.figure(figsize=(6, 4))
78
- sns.histplot(Residuals, kde=True, color="#11B6BD")
79
-
80
- plt.title(' Distribution of Residuals')
81
- plt.xlabel('Residuals')
82
- plt.ylabel('Probability Density')
83
-
84
- return plt
85
-
86
-
87
  def qqplot(actual, predicted):
88
- Residuals = actual - pd.Series(predicted)
89
- Residuals = pd.Series(Residuals)
90
- Resud_std = (Residuals - Residuals.mean()) / Residuals.std()
91
-
92
- # Create a QQ plot using Plotly with custom colors
93
- fig = go.Figure()
94
- fig.add_trace(go.Scatter(x=sm.ProbPlot(Resud_std).theoretical_quantiles,
95
- y=sm.ProbPlot(Resud_std).sample_quantiles,
96
- mode='markers',
97
- marker=dict(size=5, color="#11B6BD"),
98
- name='QQ Plot'))
99
-
100
- # Add the 45-degree reference line
101
- diagonal_line = go.Scatter(
102
- x=[-2, 2], # Adjust the x values as needed to fit the range of your data
103
- y=[-2, 2], # Adjust the y values accordingly
104
- mode='lines',
105
- line=dict(color='red'), # Customize the line color and style
106
- name=' '
107
  )
108
- fig.add_trace(diagonal_line)
109
-
110
- # Customize the layout
111
- fig.update_layout(title='QQ Plot of Residuals',title_x=0.5, autosize=False, width=600, height=400,
112
- xaxis_title='Theoretical Quantiles', yaxis_title='Sample Quantiles')
113
-
114
- return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
 
116
 
117
  def plot_actual_vs_predicted(date, y, predicted_values, model):
118
 
119
  fig = go.Figure()
120
 
121
- fig.add_trace(go.Scatter(x=date, y=y, mode='lines', name='Actual', line=dict(color='blue')))
122
- fig.add_trace(go.Scatter(x=date, y=predicted_values, mode='lines', name='Predicted', line=dict(color='orange')))
123
-
 
 
 
 
 
 
 
 
 
 
 
 
124
  # Calculate MAPE
125
- mape = mean_absolute_percentage_error(y, predicted_values)*100
126
-
127
  # Calculate R-squared
128
  rss = np.sum((y - predicted_values) ** 2)
129
  tss = np.sum((y - np.mean(y)) ** 2)
130
  r_squared = 1 - (rss / tss)
131
-
132
  # Get the number of predictors
133
  num_predictors = model.df_model
134
-
135
  # Get the number of samples
136
  num_samples = len(y)
137
-
138
  # Calculate Adjusted R-squared
139
- adj_r_squared = 1 - ((1 - r_squared) * ((num_samples - 1) / (num_samples - num_predictors - 1)))
140
- metrics_table = pd.DataFrame({
141
- 'Metric': ['MAPE', 'R-squared', 'AdjR-squared'],
142
- 'Value': [mape, r_squared, adj_r_squared]})
 
 
 
 
 
143
  fig.update_layout(
144
- xaxis=dict(title='Date'),
145
- yaxis=dict(title='Value'),
146
- title=f'MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}',
147
- xaxis_tickangle=-30
148
  )
149
 
150
- return metrics_table,fig
 
151
  def contributions(X, model):
152
  X1 = X.copy()
153
  for j, col in enumerate(X1.columns):
154
  X1[col] = X1[col] * model.params.values[j]
155
 
156
- return np.round((X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2)
 
 
157
 
158
- transformed_data=pd.read_csv('transformed_data.csv')
159
 
160
  # hard coded for now, need to get features set from model
161
 
162
- feature_set_dct={'app_installs_-_appsflyer':['paid_search_clicks',
163
- 'fb:_level_achieved_-_tier_1_impressions_lag2',
164
- 'fb:_level_achieved_-_tier_2_clicks_lag2',
165
- 'paid_social_others_impressions_adst.1',
166
- 'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2',
167
- 'digital_tactic_others_clicks',
168
- 'kwai_clicks_adst.3',
169
- 'programmaticclicks',
170
- 'indicacao_clicks_adst.1',
171
- 'infleux_clicks_adst.4',
172
- 'influencer_clicks'],
173
-
174
- 'account_requests_-_appsflyer':['paid_search_impressions',
175
- 'fb:_level_achieved_-_tier_1_clicks_adst.1',
176
- 'fb:_level_achieved_-_tier_2_clicks_adst.1',
177
- 'paid_social_others_clicks_lag2',
178
- 'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1',
179
- 'digital_tactic_others_clicks_adst.1',
180
- 'kwai_clicks_adst.2',
181
- 'programmaticimpressions_lag4_adst.1',
182
- 'indicacao_clicks',
183
- 'infleux_clicks_adst.2',
184
- 'influencer_clicks'],
185
-
186
- 'total_approved_accounts_-_appsflyer':['paid_search_clicks',
187
- 'fb:_level_achieved_-_tier_1_impressions_lag2_adst.1',
188
- 'fb:_level_achieved_-_tier_2_impressions_lag2',
189
- 'paid_social_others_clicks_lag2_adst.2',
190
- 'ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4',
191
- 'digital_tactic_others_clicks',
192
- 'kwai_impressions_adst.2',
193
- 'programmaticclicks_adst.5',
194
- 'indicacao_clicks_adst.1',
195
- 'infleux_clicks_adst.3',
196
- 'influencer_clicks'],
197
-
198
- 'total_approved_accounts_-_revenue':['paid_search_impressions_adst.5',
199
- 'kwai_impressions_lag2_adst.3',
200
- 'indicacao_clicks_adst.3',
201
- 'infleux_clicks_adst.3',
202
- 'programmaticclicks_adst.4',
203
- 'influencer_clicks_adst.3',
204
- 'fb:_level_achieved_-_tier_1_impressions_adst.2',
205
- 'fb:_level_achieved_-_tier_2_impressions_lag3_adst.5',
206
- 'paid_social_others_impressions_adst.3',
207
- 'ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5',
208
- 'digital_tactic_others_clicks_adst.2']
209
-
210
- }
211
-
212
- #""" the above part should be modified so that we are fetching features set from the saved model"""
213
-
214
-
215
-
216
- def contributions(X, model,target):
 
 
 
217
  X1 = X.copy()
218
  for j, col in enumerate(X1.columns):
219
  X1[col] = X1[col] * model.params.values[j]
220
-
221
- contributions= np.round((X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2)
222
- contributions=pd.DataFrame(contributions,columns=target).reset_index().rename(columns={'index':'Channel'})
223
- contributions['Channel']=[ re.split(r'_imp|_cli', col)[0] for col in contributions['Channel']]
224
-
 
 
 
 
 
 
 
 
225
  return contributions
226
-
227
 
228
- def model_fit(features_set,target):
229
  X = transformed_data[features_set]
230
- y= transformed_data[target]
231
  ss = MinMaxScaler()
232
  X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
233
  X = sm.add_constant(X)
234
- X_train=X.iloc[:150]
235
- X_test=X.iloc[150:]
236
- y_train=y.iloc[:150]
237
- y_test=y.iloc[150:]
238
  model = sm.OLS(y_train, X_train).fit()
239
  predicted_values_train = model.predict(X_train)
240
  r2 = model.rsquared
241
  adjr2 = model.rsquared_adj
242
  train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
243
- test_mape=mean_absolute_percentage_error(y_test, model.predict(X_test))
244
- summary=model.summary()
245
- train_contributions=contributions(X_train,model,[target])
246
- return pd.DataFrame({'Model':target,'R2':np.round(r2,2),'ADJr2':np.round(adjr2,2),'Train Mape':np.round(train_mape,2),
247
- 'Test Mape':np.round(test_mape,2),'Summary':summary,'Model_object':model
248
- },index=[0]), train_contributions
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
- metrics_table=pd.DataFrame()
251
 
252
- if 'contribution_df' not in st.session_state:
253
- st.session_state["contribution_df"]=pd.DataFrame()
254
 
255
- for target,feature_set in feature_set_dct.items():
256
- metrics_table= pd.concat([metrics_table,model_fit(features_set=feature_set,target=target)[0]])
257
- if st.session_state["contribution_df"].empty:
258
- st.session_state["contribution_df"]= model_fit(features_set=feature_set,target=target)[1]
259
- else:
260
- st.session_state["contribution_df"]=pd.merge(st.session_state["contribution_df"],model_fit(features_set=feature_set,target=target)[1])
 
 
 
 
 
 
 
261
 
262
  # st.write(st.session_state["contribution_df"])
263
-
264
-
265
- metrics_table.reset_index(drop=True,inplace=True)
266
-
267
 
 
268
 
269
-
270
-
271
-
272
-
273
-
274
- eda_columns=st.columns(2)
275
  with eda_columns[1]:
276
- eda=st.button('Generate EDA Report',help="Click to generate a bivariate report for the selected response metric from the table below.")
277
-
278
-
 
279
 
280
  # st.markdown('Model Metrics')
281
-
282
- st.title('Contribution Overview')
283
 
284
- contribution_selections=st.multiselect('Select the models to compare contributions',[col for col in st.session_state['contribution_df'].columns if col.lower() != 'channel' ],default=[col for col in st.session_state['contribution_df'].columns if col.lower() != 'channel' ][-1])
285
- trace_data=[]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  for selection in contribution_selections:
288
 
289
- trace=go.Bar(x=st.session_state['contribution_df']['Channel'], y=st.session_state['contribution_df'][selection],name=selection,text=np.round(st.session_state['contribution_df'][selection],0).astype(int).astype(str)+'%',textposition='outside')
 
 
 
 
 
 
 
 
 
290
  trace_data.append(trace)
291
 
292
  layout = go.Layout(
293
- title='Metrics Contribution by Channel',
294
- xaxis=dict(title='Channel Name'),
295
- yaxis=dict(title='Metrics Contribution'),
296
- barmode='group'
297
- )
298
  fig = go.Figure(data=trace_data, layout=layout)
299
- st.plotly_chart(fig,use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
- st.title('Analysis of Models Result')
302
- #st.markdown()
303
- gd_table=metrics_table.iloc[:,:-2]
304
- gd=GridOptionsBuilder.from_dataframe(gd_table)
305
- #gd.configure_pagination(enabled=True)
306
- gd.configure_selection(use_checkbox=True)
307
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
- gridoptions=gd.build()
310
- table = AgGrid(gd_table,gridOptions=gridoptions,fit_columns_on_grid_load=True,height=200)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  # table=metrics_table.iloc[:,:-2]
312
  # table.insert(0, "Select", False)
313
  # selection_table=st.data_editor(table,column_config={"Select": st.column_config.CheckboxColumn(required=True)})
314
-
315
-
316
 
317
- if len(table.selected_rows)==0:
318
- st.warning("Click on the checkbox to view comprehensive results of the selected model.")
 
 
319
  st.stop()
320
- else:
321
- target_column=table.selected_rows[0]['Model']
322
- feature_set=feature_set_dct[target_column]
323
 
324
  with eda_columns[1]:
325
  if eda:
 
326
  def generate_report_with_target(channel_data, target_feature):
327
- report = sv.analyze([channel_data, "Dataset"], target_feat=target_feature,verbose=False)
 
 
328
  temp_dir = tempfile.mkdtemp()
329
  report_path = os.path.join(temp_dir, "report.html")
330
- report.show_html(filepath=report_path, open_browser=False) # Generate the report as an HTML file
 
 
331
  return report_path
332
-
333
- report_data=transformed_data[feature_set]
334
- report_data[target_column]=transformed_data[target_column]
335
  report_file = generate_report_with_target(report_data, target_column)
336
-
337
  if os.path.exists(report_file):
338
- with open(report_file, 'rb') as f:
339
  st.download_button(
340
  label="Download EDA Report",
341
  data=f.read(),
342
  file_name="report.html",
343
- mime="text/html"
344
  )
345
  else:
346
  st.warning("Report generation failed. Unable to find the report file.")
347
 
348
-
349
-
350
- model=metrics_table[metrics_table['Model']==target_column]['Model_object'].iloc[0]
351
- st.header('Model Summary')
352
  st.write(model.summary())
353
- X=transformed_data[feature_set]
354
- ss=MinMaxScaler()
355
- X=pd.DataFrame(ss.fit_transform(X),columns=X.columns)
356
- X=sm.add_constant(X)
357
- y=transformed_data[target_column]
358
- X_train=X.iloc[:150]
359
- X_test=X.iloc[150:]
360
- y_train=y.iloc[:150]
361
- y_test=y.iloc[150:]
362
- X.index=transformed_data['date']
363
- y.index=transformed_data['date']
364
-
365
- metrics_table_train,fig_train= plot_actual_vs_predicted(X_train.index, y_train, model.predict(X_train), model)
366
- metrics_table_test,fig_test= plot_actual_vs_predicted(X_test.index, y_test, model.predict(X_test), model)
367
-
368
- metrics_table_train=metrics_table_train.set_index('Metric').transpose()
369
- metrics_table_train.index=['Train']
370
- metrics_table_test=metrics_table_test.set_index('Metric').transpose()
371
- metrics_table_test.index=['test']
372
- metrics_table=np.round(pd.concat([metrics_table_train,metrics_table_test]),2)
373
-
374
- st.markdown('Result Overview')
375
- st.dataframe(np.round(metrics_table,2),use_container_width=True)
376
-
377
- st.subheader('Actual vs Predicted Plot Train')
378
-
379
- st.plotly_chart(fig_train,use_container_width=True)
380
- st.subheader('Actual vs Predicted Plot Test')
381
- st.plotly_chart(fig_test,use_container_width=True)
382
-
383
- st.markdown('## Residual Analysis')
384
- columns=st.columns(2)
385
-
386
-
387
- Xtrain1=X_train.copy()
 
 
 
388
  with columns[0]:
389
- fig=plot_residual_predicted(y_train,model.predict(Xtrain1),Xtrain1)
390
  st.plotly_chart(fig)
391
 
392
  with columns[1]:
393
  st.empty()
394
- fig = qqplot(y_train,model.predict(X_train))
395
  st.plotly_chart(fig)
396
 
397
  with columns[0]:
398
- fig=residual_distribution(y_train,model.predict(X_train))
399
  st.pyplot(fig)
400
 
401
 
402
-
403
  elif auth_status == False:
404
- st.error('Username/Password is incorrect')
405
  try:
406
- username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
 
 
407
  if username_forgot_pw:
408
- st.success('New password sent securely')
409
  # Random password to be transferred to the user securely
410
  elif username_forgot_pw == False:
411
- st.error('Username not found')
412
  except Exception as e:
413
  st.error(e)
 
7
  from sklearn.metrics import mean_absolute_percentage_error
8
  import sys
9
  import os
10
+ from utilities import set_header, load_local_css, load_authenticator
 
 
11
  import seaborn as sns
12
  import matplotlib.pyplot as plt
13
  import sweetviz as sv
14
  import tempfile
15
  from sklearn.preprocessing import MinMaxScaler
16
  from st_aggrid import AgGrid
17
+ from st_aggrid import GridOptionsBuilder, GridUpdateMode
18
  from st_aggrid import GridOptionsBuilder
19
  import sys
20
  import re
 
22
  sys.setrecursionlimit(10**6)
23
 
24
  original_stdout = sys.stdout
25
+ sys.stdout = open("temp_stdout.txt", "w")
26
  sys.stdout.close()
27
  sys.stdout = original_stdout
28
 
29
+ st.set_page_config(layout="wide")
30
+ load_local_css("styles.css")
31
  set_header()
32
 
33
  for k, v in st.session_state.items():
34
+ if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
35
  st.session_state[k] = v
36
 
37
+ authenticator = st.session_state.get("authenticator")
38
  if authenticator is None:
39
  authenticator = load_authenticator()
40
 
41
+ name, authentication_status, username = authenticator.login("Login", "main")
42
+ auth_status = st.session_state.get("authentication_status")
43
 
44
  if auth_status == True:
45
+ is_state_initiaized = st.session_state.get("initialized", False)
46
  if not is_state_initiaized:
47
+ a = 1
 
48
 
49
  def plot_residual_predicted(actual, predicted, df_):
50
+ df_["Residuals"] = actual - pd.Series(predicted)
51
+ df_["StdResidual"] = (df_["Residuals"] - df_["Residuals"].mean()) / df_[
52
+ "Residuals"
53
+ ].std()
54
+
55
+ # Create a Plotly scatter plot
56
+ fig = px.scatter(
57
+ df_,
58
+ x=predicted,
59
+ y="StdResidual",
60
+ opacity=0.5,
61
+ color_discrete_sequence=["#11B6BD"],
62
+ )
63
+
64
+ # Add horizontal lines
65
+ fig.add_hline(y=0, line_dash="dash", line_color="darkorange")
66
+ fig.add_hline(y=2, line_color="red")
67
+ fig.add_hline(y=-2, line_color="red")
68
+
69
+ fig.update_xaxes(title="Predicted")
70
+ fig.update_yaxes(title="Standardized Residuals (Actual - Predicted)")
71
+
72
+ # Set the same width and height for both figures
73
+ fig.update_layout(
74
+ title="Residuals over Predicted Values",
75
+ autosize=False,
76
+ width=600,
77
+ height=400,
78
+ )
79
+
80
+ return fig
81
 
82
  def residual_distribution(actual, predicted):
83
+ Residuals = actual - pd.Series(predicted)
84
+
85
+ # Create a Seaborn distribution plot
86
+ sns.set(style="whitegrid")
87
+ plt.figure(figsize=(6, 4))
88
+ sns.histplot(Residuals, kde=True, color="#11B6BD")
89
+
90
+ plt.title(" Distribution of Residuals")
91
+ plt.xlabel("Residuals")
92
+ plt.ylabel("Probability Density")
93
+
94
+ return plt
95
+
 
96
  def qqplot(actual, predicted):
97
+ Residuals = actual - pd.Series(predicted)
98
+ Residuals = pd.Series(Residuals)
99
+ Resud_std = (Residuals - Residuals.mean()) / Residuals.std()
100
+
101
+ # Create a QQ plot using Plotly with custom colors
102
+ fig = go.Figure()
103
+ fig.add_trace(
104
+ go.Scatter(
105
+ x=sm.ProbPlot(Resud_std).theoretical_quantiles,
106
+ y=sm.ProbPlot(Resud_std).sample_quantiles,
107
+ mode="markers",
108
+ marker=dict(size=5, color="#11B6BD"),
109
+ name="QQ Plot",
 
 
 
 
 
 
110
  )
111
+ )
112
+
113
+ # Add the 45-degree reference line
114
+ diagonal_line = go.Scatter(
115
+ x=[-2, 2], # Adjust the x values as needed to fit the range of your data
116
+ y=[-2, 2], # Adjust the y values accordingly
117
+ mode="lines",
118
+ line=dict(color="red"), # Customize the line color and style
119
+ name=" ",
120
+ )
121
+ fig.add_trace(diagonal_line)
122
+
123
+ # Customize the layout
124
+ fig.update_layout(
125
+ title="QQ Plot of Residuals",
126
+ title_x=0.5,
127
+ autosize=False,
128
+ width=600,
129
+ height=400,
130
+ xaxis_title="Theoretical Quantiles",
131
+ yaxis_title="Sample Quantiles",
132
+ )
133
 
134
+ return fig
135
 
136
  def plot_actual_vs_predicted(date, y, predicted_values, model):
137
 
138
  fig = go.Figure()
139
 
140
+ fig.add_trace(
141
+ go.Scatter(
142
+ x=date, y=y, mode="lines", name="Actual", line=dict(color="blue")
143
+ )
144
+ )
145
+ fig.add_trace(
146
+ go.Scatter(
147
+ x=date,
148
+ y=predicted_values,
149
+ mode="lines",
150
+ name="Predicted",
151
+ line=dict(color="orange"),
152
+ )
153
+ )
154
+
155
  # Calculate MAPE
156
+ mape = mean_absolute_percentage_error(y, predicted_values) * 100
157
+
158
  # Calculate R-squared
159
  rss = np.sum((y - predicted_values) ** 2)
160
  tss = np.sum((y - np.mean(y)) ** 2)
161
  r_squared = 1 - (rss / tss)
162
+
163
  # Get the number of predictors
164
  num_predictors = model.df_model
165
+
166
  # Get the number of samples
167
  num_samples = len(y)
168
+
169
  # Calculate Adjusted R-squared
170
+ adj_r_squared = 1 - (
171
+ (1 - r_squared) * ((num_samples - 1) / (num_samples - num_predictors - 1))
172
+ )
173
+ metrics_table = pd.DataFrame(
174
+ {
175
+ "Metric": ["MAPE", "R-squared", "AdjR-squared"],
176
+ "Value": [mape, r_squared, adj_r_squared],
177
+ }
178
+ )
179
  fig.update_layout(
180
+ xaxis=dict(title="Date"),
181
+ yaxis=dict(title="Value"),
182
+ title=f"MAPE : {mape:.2f}%, AdjR2: {adj_r_squared:.2f}",
183
+ xaxis_tickangle=-30,
184
  )
185
 
186
+ return metrics_table, fig
187
+
188
  def contributions(X, model):
189
  X1 = X.copy()
190
  for j, col in enumerate(X1.columns):
191
  X1[col] = X1[col] * model.params.values[j]
192
 
193
+ return np.round(
194
+ (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
195
+ )
196
 
197
+ transformed_data = pd.read_csv("transformed_data.csv")
198
 
199
  # hard coded for now, need to get features set from model
200
 
201
+ feature_set_dct = {
202
+ "app_installs_-_appsflyer": [
203
+ "paid_search_clicks",
204
+ "fb:_level_achieved_-_tier_1_impressions_lag2",
205
+ "fb:_level_achieved_-_tier_2_clicks_lag2",
206
+ "paid_social_others_impressions_adst.1",
207
+ "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag2",
208
+ "digital_tactic_others_clicks",
209
+ "kwai_clicks_adst.3",
210
+ "programmaticclicks",
211
+ "indicacao_clicks_adst.1",
212
+ "infleux_clicks_adst.4",
213
+ "influencer_clicks",
214
+ ],
215
+ "account_requests_-_appsflyer": [
216
+ "paid_search_impressions",
217
+ "fb:_level_achieved_-_tier_1_clicks_adst.1",
218
+ "fb:_level_achieved_-_tier_2_clicks_adst.1",
219
+ "paid_social_others_clicks_lag2",
220
+ "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag5_adst.1",
221
+ "digital_tactic_others_clicks_adst.1",
222
+ "kwai_clicks_adst.2",
223
+ "programmaticimpressions_lag4_adst.1",
224
+ "indicacao_clicks",
225
+ "infleux_clicks_adst.2",
226
+ "influencer_clicks",
227
+ ],
228
+ "total_approved_accounts_-_appsflyer": [
229
+ "paid_search_clicks",
230
+ "fb:_level_achieved_-_tier_1_impressions_lag2_adst.1",
231
+ "fb:_level_achieved_-_tier_2_impressions_lag2",
232
+ "paid_social_others_clicks_lag2_adst.2",
233
+ "ga_app:_will_and_cid_pequena_baixo_risco_impressions_lag4",
234
+ "digital_tactic_others_clicks",
235
+ "kwai_impressions_adst.2",
236
+ "programmaticclicks_adst.5",
237
+ "indicacao_clicks_adst.1",
238
+ "infleux_clicks_adst.3",
239
+ "influencer_clicks",
240
+ ],
241
+ "total_approved_accounts_-_revenue": [
242
+ "paid_search_impressions_adst.5",
243
+ "kwai_impressions_lag2_adst.3",
244
+ "indicacao_clicks_adst.3",
245
+ "infleux_clicks_adst.3",
246
+ "programmaticclicks_adst.4",
247
+ "influencer_clicks_adst.3",
248
+ "fb:_level_achieved_-_tier_1_impressions_adst.2",
249
+ "fb:_level_achieved_-_tier_2_impressions_lag3_adst.5",
250
+ "paid_social_others_impressions_adst.3",
251
+ "ga_app:_will_and_cid_pequena_baixo_risco_clicks_lag3_adst.5",
252
+ "digital_tactic_others_clicks_adst.2",
253
+ ],
254
+ }
255
+
256
+ # """ the above part should be modified so that we are fetching features set from the saved model"""
257
+
258
+ def contributions(X, model, target):
259
  X1 = X.copy()
260
  for j, col in enumerate(X1.columns):
261
  X1[col] = X1[col] * model.params.values[j]
262
+
263
+ contributions = np.round(
264
+ (X1.sum() / sum(X1.sum()) * 100).sort_values(ascending=False), 2
265
+ )
266
+ contributions = (
267
+ pd.DataFrame(contributions, columns=target)
268
+ .reset_index()
269
+ .rename(columns={"index": "Channel"})
270
+ )
271
+ contributions["Channel"] = [
272
+ re.split(r"_imp|_cli", col)[0] for col in contributions["Channel"]
273
+ ]
274
+
275
  return contributions
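+ # For reference: each channel's value above is 100 * sum(beta_j * x_j) divided by the same
+ # quantity summed over all channels, i.e. its share of the total modelled (scaled) response.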
 
276
 
277
+ def model_fit(features_set, target):
278
  X = transformed_data[features_set]
279
+ y = transformed_data[target]
280
  ss = MinMaxScaler()
281
  X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
282
  X = sm.add_constant(X)
283
+ X_train = X.iloc[:150]
284
+ X_test = X.iloc[150:]
285
+ y_train = y.iloc[:150]
286
+ y_test = y.iloc[150:]
287
  model = sm.OLS(y_train, X_train).fit()
288
  predicted_values_train = model.predict(X_train)
289
  r2 = model.rsquared
290
  adjr2 = model.rsquared_adj
291
  train_mape = mean_absolute_percentage_error(y_train, predicted_values_train)
292
+ test_mape = mean_absolute_percentage_error(y_test, model.predict(X_test))
293
+ summary = model.summary()
294
+ train_contributions = contributions(X_train, model, [target])
295
+ return (
296
+ pd.DataFrame(
297
+ {
298
+ "Model": target,
299
+ "R2": np.round(r2, 2),
300
+ "ADJr2": np.round(adjr2, 2),
301
+ "Train Mape": np.round(train_mape, 2),
302
+ "Test Mape": np.round(test_mape, 2),
303
+ "Summary": summary,
304
+ "Model_object": model,
305
+ },
306
+ index=[0],
307
+ ),
308
+ train_contributions,
309
+ )
310
 
311
+ metrics_table = pd.DataFrame()
312
 
313
+ if "contribution_df" not in st.session_state:
314
+ st.session_state["contribution_df"] = pd.DataFrame()
315
 
316
+ for target, feature_set in feature_set_dct.items():
317
+ metrics_table = pd.concat(
318
+ [metrics_table, model_fit(features_set=feature_set, target=target)[0]]
319
+ )
320
+ if st.session_state["contribution_df"].empty:
321
+ st.session_state["contribution_df"] = model_fit(
322
+ features_set=feature_set, target=target
323
+ )[1]
324
+ else:
325
+ st.session_state["contribution_df"] = pd.merge(
326
+ st.session_state["contribution_df"],
327
+ model_fit(features_set=feature_set, target=target)[1],
328
+ )
329
 
330
  # st.write(st.session_state["contribution_df"])
 
 
 
 
331
 
332
+ metrics_table.reset_index(drop=True, inplace=True)
333
 
334
+ eda_columns = st.columns(2)
 
 
 
 
 
335
  with eda_columns[1]:
336
+ eda = st.button(
337
+ "Generate EDA Report",
338
+ help="Click to generate a bivariate report for the selected response metric from the table below.",
339
+ )
340
 
341
  # st.markdown('Model Metrics')
 
 
342
 
343
+ st.title("Contribution Overview")
344
+
345
+ contribution_selections = st.multiselect(
346
+ "Select the models to compare contributions",
347
+ [
348
+ col
349
+ for col in st.session_state["contribution_df"].columns
350
+ if col.lower() != "channel"
351
+ ],
352
+ default=[
353
+ col
354
+ for col in st.session_state["contribution_df"].columns
355
+ if col.lower() != "channel"
356
+ ][-1],
357
+ )
358
+ trace_data = []
359
 
360
  for selection in contribution_selections:
361
 
362
+ trace = go.Bar(
363
+ x=st.session_state["contribution_df"]["Channel"],
364
+ y=st.session_state["contribution_df"][selection],
365
+ name=selection,
366
+ text=np.round(st.session_state["contribution_df"][selection], 0)
367
+ .astype(int)
368
+ .astype(str)
369
+ + "%",
370
+ textposition="outside",
371
+ )
372
  trace_data.append(trace)
373
 
374
  layout = go.Layout(
375
+ title="Metrics Contribution by Channel",
376
+ xaxis=dict(title="Channel Name"),
377
+ yaxis=dict(title="Metrics Contribution"),
378
+ barmode="group",
379
+ )
380
  fig = go.Figure(data=trace_data, layout=layout)
381
+ st.plotly_chart(fig, use_container_width=True)
382
+
383
+ ############################################ Waterfall Chart ############################################
384
+ # import plotly.graph_objects as go
385
+
386
+ # # Initialize a Plotly figure
387
+ # fig = go.Figure()
388
+
389
+ # for selection in contribution_selections:
390
+ # # Ensure y_values are numeric
391
+ # y_values = st.session_state["contribution_df"][selection].values.astype(float)
392
+
393
+ # # Generating text labels for each bar, ensuring operations are compatible with string formats
394
+ # text_values = [f"{val}%" for val in np.round(y_values, 0).astype(int)]
395
+
396
+ # fig.add_trace(
397
+ # go.Waterfall(
398
+ # name=selection,
399
+ # orientation="v",
400
+ # measure=["relative"]
401
+ # * len(y_values), # Adjust if you have absolute values at certain points
402
+ # x=st.session_state["contribution_df"]["Channel"].tolist(),
403
+ # text=text_values,
404
+ # textposition="outside",
405
+ # y=y_values,
406
+ # increasing={"marker": {"color": "green"}},
407
+ # decreasing={"marker": {"color": "red"}},
408
+ # totals={"marker": {"color": "blue"}},
409
+ # )
410
+ # )
411
+
412
+ # fig.update_layout(
413
+ # title="Metrics Contribution by Channel",
414
+ # xaxis={"title": "Channel Name"},
415
+ # yaxis={"title": "Metrics Contribution"},
416
+ # height=600,
417
+ # )
418
+
419
+ # # Displaying the waterfall chart in Streamlit
420
+ # st.plotly_chart(fig, use_container_width=True)
421
+
422
+ import plotly.graph_objects as go
423
+
424
+ # Initialize a Plotly figure
425
+ fig = go.Figure()
426
 
427
+ for selection in contribution_selections:
428
+ # Ensure contributions are numeric
429
+ contributions = (
430
+ st.session_state["contribution_df"][selection].values.astype(float).tolist()
431
+ )
432
+ channel_names = st.session_state["contribution_df"]["Channel"].tolist()
433
 
434
+ display_name, display_contribution, base_contribution = [], [], 0
435
+ for channel_name, contribution in zip(channel_names, contributions):
436
+ if channel_name != "const":
437
+ display_name.append(channel_name)
438
+ display_contribution.append(contribution)
439
+ else:
440
+ base_contribution = contribution
441
+
442
+ display_name = ["Base Sales"] + display_name
443
+ display_contribution = [base_contribution] + display_contribution
444
+
445
+ # Generating text labels for each bar, ensuring operations are compatible with string formats
446
+ text_values = [
447
+ f"{val}%" for val in np.round(display_contribution, 0).astype(int)
448
+ ]
449
+
450
+ fig.add_trace(
451
+ go.Waterfall(
452
+ orientation="v",
453
+ measure=["relative"]
454
+ * len(
455
+ display_contribution
456
+ ), # Adjust if you have absolute values at certain points
457
+ x=display_name,
458
+ text=text_values,
459
+ textposition="outside",
460
+ y=display_contribution,
461
+ increasing={"marker": {"color": "green"}},
462
+ decreasing={"marker": {"color": "red"}},
463
+ totals={"marker": {"color": "blue"}},
464
+ )
465
+ )
466
 
467
+ fig.update_layout(
468
+ title="Metrics Contribution by Channel",
469
+ xaxis={"title": "Channel Name"},
470
+ yaxis={"title": "Metrics Contribution"},
471
+ height=600,
472
+ )
473
+
474
+ # Displaying the waterfall chart in Streamlit
475
+ st.plotly_chart(fig, use_container_width=True)
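+ # Note on the waterfall above: every measure is "relative", so Plotly stacks the
+ # bars cumulatively from "Base Sales" (the const term) onwards -- e.g. with a
+ # hypothetical Base Sales of 40% and two channels at 35% and 25%, the running
+ # total climbs 40 -> 75 -> 100.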
476
+
477
+ ############################################ Waterfall Chart ############################################
478
+
479
+ st.title("Analysis of Models Result")
480
+ # st.markdown()
481
+ gd_table = metrics_table.iloc[:, :-2]
482
+
483
+ gd = GridOptionsBuilder.from_dataframe(gd_table)
484
+ # gd.configure_pagination(enabled=True)
485
+ gd.configure_selection(
486
+ use_checkbox=True,
487
+ selection_mode="single",
488
+ pre_select_all_rows=False,
489
+ pre_selected_rows=[1],
490
+ )
491
+
492
+ gridoptions = gd.build()
493
+ table = AgGrid(
494
+ gd_table, gridOptions=gridoptions, fit_columns_on_grid_load=True, height=200
495
+ )
496
  # table=metrics_table.iloc[:,:-2]
497
  # table.insert(0, "Select", False)
498
  # selection_table=st.data_editor(table,column_config={"Select": st.column_config.CheckboxColumn(required=True)})
 
 
499
 
500
+ if len(table.selected_rows) == 0:
501
+ st.warning(
502
+ "Click on the checkbox to view comprehensive results of the selected model."
503
+ )
504
  st.stop()
505
+ else:
506
+ target_column = table.selected_rows[0]["Model"]
507
+ feature_set = feature_set_dct[target_column]
508
 
509
  with eda_columns[1]:
510
  if eda:
511
+
512
  def generate_report_with_target(channel_data, target_feature):
513
+ report = sv.analyze(
514
+ [channel_data, "Dataset"], target_feat=target_feature, verbose=False
515
+ )
516
  temp_dir = tempfile.mkdtemp()
517
  report_path = os.path.join(temp_dir, "report.html")
518
+ report.show_html(
519
+ filepath=report_path, open_browser=False
520
+ ) # Generate the report as an HTML file
521
  return report_path
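+ # Sweetviz note: because target_feat is passed, the generated report pairs each
+ # feature in the selected feature set against the chosen response metric, which
+ # is what the "Generate EDA Report" button describes as a bivariate report.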
522
+
523
+ report_data = transformed_data[feature_set]
524
+ report_data[target_column] = transformed_data[target_column]
525
  report_file = generate_report_with_target(report_data, target_column)
526
+
527
  if os.path.exists(report_file):
528
+ with open(report_file, "rb") as f:
529
  st.download_button(
530
  label="Download EDA Report",
531
  data=f.read(),
532
  file_name="report.html",
533
+ mime="text/html",
534
  )
535
  else:
536
  st.warning("Report generation failed. Unable to find the report file.")
537
 
538
+ model = metrics_table[metrics_table["Model"] == target_column]["Model_object"].iloc[
539
+ 0
540
+ ]
541
+ st.header("Model Summary")
542
  st.write(model.summary())
543
+ X = transformed_data[feature_set]
544
+ ss = MinMaxScaler()
545
+ X = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
546
+ X = sm.add_constant(X)
547
+ y = transformed_data[target_column]
548
+ X_train = X.iloc[:150]
549
+ X_test = X.iloc[150:]
550
+ y_train = y.iloc[:150]
551
+ y_test = y.iloc[150:]
552
+ X.index = transformed_data["date"]
553
+ y.index = transformed_data["date"]
554
+
555
+ metrics_table_train, fig_train = plot_actual_vs_predicted(
556
+ X_train.index, y_train, model.predict(X_train), model
557
+ )
558
+ metrics_table_test, fig_test = plot_actual_vs_predicted(
559
+ X_test.index, y_test, model.predict(X_test), model
560
+ )
561
+
562
+ metrics_table_train = metrics_table_train.set_index("Metric").transpose()
563
+ metrics_table_train.index = ["Train"]
564
+ metrics_table_test = metrics_table_test.set_index("Metric").transpose()
565
+ metrics_table_test.index = ["Test"]
566
+ metrics_table = np.round(pd.concat([metrics_table_train, metrics_table_test]), 2)
567
+
568
+ st.markdown("Result Overview")
569
+ st.dataframe(np.round(metrics_table, 2), use_container_width=True)
570
+
571
+ st.subheader("Actual vs Predicted Plot Train")
572
+
573
+ st.plotly_chart(fig_train, use_container_width=True)
574
+ st.subheader("Actual vs Predicted Plot Test")
575
+ st.plotly_chart(fig_test, use_container_width=True)
576
+
577
+ st.markdown("## Residual Analysis")
578
+ columns = st.columns(2)
579
+
580
+ Xtrain1 = X_train.copy()
581
  with columns[0]:
582
+ fig = plot_residual_predicted(y_train, model.predict(Xtrain1), Xtrain1)
583
  st.plotly_chart(fig)
584
 
585
  with columns[1]:
586
  st.empty()
587
+ fig = qqplot(y_train, model.predict(X_train))
588
  st.plotly_chart(fig)
589
 
590
  with columns[0]:
591
+ fig = residual_distribution(y_train, model.predict(X_train))
592
  st.pyplot(fig)
593
 
594
 
 
595
  elif auth_status == False:
596
+ st.error("Username/Password is incorrect")
597
  try:
598
+ username_forgot_pw, email_forgot_password, random_password = (
599
+ authenticator.forgot_password("Forgot password")
600
+ )
601
  if username_forgot_pw:
602
+ st.success("New password sent securely")
603
  # Random password to be transferred to the user securely
604
  elif username_forgot_pw == False:
605
+ st.error("Username not found")
606
  except Exception as e:
607
  st.error(e)
pages/5_Model_Tuning_with_panel.py ADDED
@@ -0,0 +1,527 @@
1
+ '''
2
+ MMO Build Sprint 3
3
+ date :
4
+ changes : capability to tune MixedLM as well as simple LR in the same page
5
+ '''
6
+
7
+ import streamlit as st
8
+ import pandas as pd
9
+ from Eda_functions import format_numbers
10
+ import pickle
11
+ from utilities import set_header, load_local_css
12
+ import statsmodels.api as sm
13
+ import re
14
+ from sklearn.preprocessing import MinMaxScaler
15
+ import matplotlib.pyplot as plt
16
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
17
+
18
+ st.set_option('deprecation.showPyplotGlobalUse', False)
19
+ import statsmodels.formula.api as smf
20
+ from Data_prep_functions import *
21
+
22
+ # for i in ["model_tuned", "X_train_tuned", "X_test_tuned", "tuned_model_features", "tuned_model", "tuned_model_dict"] :
23
+
24
+ st.set_page_config(
25
+ page_title="Model Tuning",
26
+ page_icon=":shark:",
27
+ layout="wide",
28
+ initial_sidebar_state='collapsed'
29
+ )
30
+ load_local_css('styles.css')
31
+ set_header()
32
+
33
+ # Sprint3
34
+ # is_panel = st.session_state['is_panel']
35
+ # panel_col = 'markets' # set the panel column
36
+ date_col = 'date'
37
+
38
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1'] ] [0]# set the panel column
39
+ is_panel = True if len(panel_col)>0 else False
40
+
41
+
42
+ # flag indicating that no tuned model exists yet
43
+
44
+ # Sprint4 - model tuned dict
45
+ if 'Model_Tuned' not in st.session_state:
46
+ st.session_state['Model_Tuned'] = {}
47
+
48
+ st.title('1. Model Tuning')
49
+ # st.write(st.session_state['base_model_feature_set'])
50
+
51
+ if "X_train" not in st.session_state:
52
+ st.error(
53
+ "Oops! It seems there are no saved models available. Please build and save a model from the previous page to proceed.")
54
+ st.stop()
55
+ # X_train=st.session_state['X_train']
56
+ # X_test=st.session_state['X_test']
57
+ # y_train=st.session_state['y_train']
58
+ # y_test=st.session_state['y_test']
59
+ # df=st.session_state['media_data']
60
+
61
+
62
+ # st.write(X_train.columns)
63
+ # st.write(X_test.columns)
64
+ if "is_tuned_model" not in st.session_state:
65
+ st.session_state["is_tuned_model"] = {}
66
+ # Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
67
+ if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics'] != []:
68
+ sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
69
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
70
+
71
+ else:
72
+ sel_target_col = 'Total Approved Accounts - Revenue'
73
+ target_col = 'total_approved_accounts_revenue'
74
+
75
+ # Sprint4 - Look through all saved models, only show saved models of the sel resp metric (target_col)
76
+ saved_models = st.session_state['saved_model_names']
77
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
78
+ sel_model = st.selectbox("Select the model to tune", required_saved_models)
79
+
80
+ with open("best_models.pkl", 'rb') as file:
81
+ model_dict = pickle.load(file)
82
+
83
+ sel_model_dict = model_dict[sel_model + "__" + target_col] # Sprint4 - get the model obj of the selected model
84
+ # st.write(sel_model_dict)
85
+
86
+ X_train = sel_model_dict['X_train']
87
+ X_test = sel_model_dict['X_test']
88
+ y_train = sel_model_dict['y_train']
89
+ y_test = sel_model_dict['y_test']
90
+ df = st.session_state['media_data']
91
+
92
+ if 'selected_model' not in st.session_state:
93
+ st.session_state['selected_model'] = 0
94
+
95
+ # st.write(model_dict[st.session_state["selected_model"]]['X_train'].columns)
96
+
97
+ st.markdown('### 1.1 Event Flags')
98
+ st.markdown('Helps in quantifying the impact of specific occurrences of events')
99
+ with st.expander('Apply Event Flags'):
100
+ # st.session_state["selected_model"]=st.selectbox('Select Model to apply flags',model_dict.keys())
101
+ model = sel_model_dict['Model_object']
102
+ date = st.session_state['date']
103
+ date = pd.to_datetime(date)
104
+ X_train = sel_model_dict['X_train']
105
+
106
+ # features_set= model_dict[st.session_state["selected_model"]]['feature_set']
107
+ features_set = sel_model_dict["feature_set"]
108
+
109
+ col = st.columns(3)
110
+ min_date = min(date)
111
+ max_date = max(date)
112
+ with col[0]:
113
+ start_date = st.date_input('Select Start Date', min_date, min_value=min_date, max_value=max_date)
114
+ with col[1]:
115
+ end_date = st.date_input('Select End Date', max_date, min_value=min_date, max_value=max_date)
116
+ with col[2]:
117
+ repeat = st.selectbox('Repeat Annually', ['Yes', 'No'], index=1)
118
+ if repeat == 'Yes':
119
+ repeat = True
120
+ else:
121
+ repeat = False
122
+
123
+ if 'Flags' not in st.session_state:
124
+ st.session_state['Flags'] = {}
125
+ # print("**"*50)
126
+ # print(y_train)
127
+ # print("**"*50)
128
+ # print(model.fittedvalues)
129
+ if is_panel: # Sprint3
130
+ met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train,
131
+ model.fittedvalues, model,
132
+ target_column=sel_target_col,
133
+ flag=(start_date, end_date),
134
+ repeat_all_years=repeat, is_panel=True)
135
+ st.plotly_chart(fig_flag, use_container_width=True)
136
+
137
+ # create flag on test
138
+ met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test,
139
+ sel_model_dict['pred_test'], model,
140
+ target_column=sel_target_col,
141
+ flag=(start_date, end_date),
142
+ repeat_all_years=repeat, is_panel=True)
143
+
144
+ else:
145
+ pred_train=model.predict(X_train[features_set])
146
+ met, line_values, fig_flag = plot_actual_vs_predicted(X_train[date_col], y_train, pred_train, model,
147
+ flag=(start_date, end_date), repeat_all_years=repeat,is_panel=False)
148
+ st.plotly_chart(fig_flag, use_container_width=True)
149
+
150
+ pred_test=model.predict(X_test[features_set])
151
+ met, test_line_values, fig_flag = plot_actual_vs_predicted(X_test[date_col], y_test, pred_test, model,
152
+ flag=(start_date, end_date), repeat_all_years=repeat,is_panel=False)
153
+ flag_name = 'f1_flag'
154
+ flag_name = st.text_input('Enter Flag Name')
155
+ # Sprint4 - add selected target col to flag name
156
+ if st.button('Update flag'):
157
+ st.session_state['Flags'][flag_name + '__'+ target_col] = {}
158
+ st.session_state['Flags'][flag_name + '__'+ target_col]['train'] = line_values
159
+ st.session_state['Flags'][flag_name + '__'+ target_col]['test'] = test_line_values
160
+ # st.write(st.session_state['Flags'][flag_name])
161
+ st.success(f'{flag_name + "__" + target_col} stored')
162
+
163
+ # Sprint4 - only show flag created for the particular target col
164
+ st.write(st.session_state['Flags'].keys() )
165
+ target_model_flags = [f.split("__")[0] for f in st.session_state['Flags'].keys() if f.split("__")[1] == target_col]
166
+ options = list(target_model_flags)
167
+ selected_options = []
168
+ num_columns = 4
169
+ num_rows = -(-len(options) // num_columns)
170
+
171
+ tick = False
172
+ if st.checkbox('Select all'):
173
+ tick = True
174
+ selected_options = []
175
+ for row in range(num_rows):
176
+ cols = st.columns(num_columns)
177
+ for col in cols:
178
+ if options:
179
+ option = options.pop(0)
180
+ selected = col.checkbox(option, value=tick)
181
+ if selected:
182
+ selected_options.append(option)
183
+
184
+ st.markdown('### 1.2 Select Parameters to Apply')
185
+ parameters = st.columns(3)
186
+ with parameters[0]:
187
+ Trend = st.checkbox("**Trend**")
188
+ st.markdown('Helps account for long-term trends or seasonality that could influence advertising effectiveness')
189
+ with parameters[1]:
190
+ week_number = st.checkbox('**Week_number**')
191
+ st.markdown('Assists in detecting and incorporating weekly patterns or seasonality')
192
+ with parameters[2]:
193
+ sine_cosine = st.checkbox('**Sine and Cosine Waves**')
194
+ st.markdown('Helps in capturing cyclical patterns or seasonality in the data')
195
+ #
196
+ # def get_tuned_model():
197
+ # st.session_state['build_tuned_model']=True
198
+
199
+ if st.button('Build model with Selected Parameters and Flags', key='build_tuned_model'):
200
+ new_features = features_set
201
+ st.header('2.1 Results Summary')
202
+ # date=list(df.index)
203
+ # df = df.reset_index(drop=True)
204
+ # st.write(df.head(2))
205
+ # X_train=df[features_set]
206
+ ss = MinMaxScaler()
207
+ if is_panel == True:
208
+ X_train_tuned = X_train[features_set]
209
+ # X_train_tuned = pd.DataFrame(ss.fit_transform(X), columns=X.columns)
210
+ X_train_tuned[target_col] = X_train[target_col]
211
+ X_train_tuned[date_col] = X_train[date_col]
212
+ X_train_tuned[panel_col] = X_train[panel_col]
213
+
214
+ X_test_tuned = X_test[features_set]
215
+ # X_test_tuned = pd.DataFrame(ss.transform(X), columns=X.columns)
216
+ X_test_tuned[target_col] = X_test[target_col]
217
+ X_test_tuned[date_col] = X_test[date_col]
218
+ X_test_tuned[panel_col] = X_test[panel_col]
219
+
220
+ else:
221
+ X_train_tuned = X_train[features_set]
222
+ # X_train_tuned = pd.DataFrame(ss.fit_transform(X_train_tuned), columns=X_train_tuned.columns)
223
+
224
+ X_test_tuned = X_test[features_set]
225
+ # X_test_tuned = pd.DataFrame(ss.transform(X_test_tuned), columns=X_test_tuned.columns)
226
+
227
+ for flag in selected_options:
228
+ # Sprint4 - added target_col to the flag name
229
+ X_train_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['train']
230
+ X_test_tuned[flag] = st.session_state['Flags'][flag + "__" + target_col]['test']
231
+
232
+ # test
233
+ # X_train_tuned.to_csv("Test/X_train_tuned_flag.csv",index=False)
234
+ # X_test_tuned.to_csv("Test/X_test_tuned_flag.csv",index=False)
235
+
236
+ # print("()()"*20,flag, len(st.session_state['Flags'][flag]))
237
+ if Trend:
238
+ # Sprint3 - group by panel and calculate the trend of each panel separately. Add trend to the new feature set
239
+ if is_panel:
240
+ newdata = pd.DataFrame()
241
+ panel_wise_end_point_train = {}
242
+ for panel, groupdf in X_train_tuned.groupby(panel_col):
243
+ groupdf.sort_values(date_col, inplace=True)
244
+ groupdf['Trend'] = np.arange(1, len(groupdf) + 1, 1)
245
+ newdata = pd.concat([newdata, groupdf])
246
+ panel_wise_end_point_train[panel] = len(groupdf)
247
+ X_train_tuned = newdata.copy()
248
+
249
+ test_newdata = pd.DataFrame()
250
+ for panel, test_groupdf in X_test_tuned.groupby(panel_col):
251
+ test_groupdf.sort_values(date_col, inplace=True)
252
+ start = panel_wise_end_point_train[panel] + 1
253
+ end = start + len(test_groupdf) # should be + 1? - Sprint4
254
+ # print("??"*20, panel, len(test_groupdf), len(np.arange(start, end, 1)), start)
255
+ test_groupdf['Trend'] = np.arange(start, end, 1)
256
+ test_newdata = pd.concat([test_newdata, test_groupdf])
257
+ X_test_tuned = test_newdata.copy()
258
+
259
+ new_features = new_features + ['Trend']
260
+
261
+ else:
262
+ X_train_tuned['Trend'] = np.arange(1, len(X_train_tuned) + 1, 1)
263
+ X_test_tuned['Trend'] = np.arange(len(X_train_tuned) + 1, len(X_train_tuned) + len(X_test_tuned) + 1, 1)
264
+ new_features = new_features + ['Trend']
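+ # Trend sketch (hypothetical 3-row slice): a training slice with 3 rows gets
+ # Trend = [1, 2, 3] and its test rows continue from 4 onwards (per panel in the
+ # grouped case), so the linear time index runs unbroken across the train/test split.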
265
+
266
+
267
+ if week_number:
268
+ # Sprint3 - create weeknumber from date column in xtrain tuned. add week num to new feature set
269
+ if is_panel:
270
+ X_train_tuned[date_col] = pd.to_datetime(X_train_tuned[date_col])
271
+ X_train_tuned['Week_number'] = X_train_tuned[date_col].dt.day_of_week
272
+ if X_train_tuned['Week_number'].nunique() == 1:
273
+ st.write("All dates in the data are of the same week day. Hence Week number can't be used.")
274
+ else:
275
+ X_test_tuned[date_col] = pd.to_datetime(X_test_tuned[date_col])
276
+ X_test_tuned['Week_number'] = X_test_tuned[date_col].dt.day_of_week
277
+ new_features = new_features + ['Week_number']
278
+
279
+ else:
280
+ date = pd.to_datetime(date.values)
281
+ X_train_tuned['Week_number'] = pd.to_datetime(X_train[date_col]).dt.day_of_week
282
+ X_test_tuned['Week_number'] = pd.to_datetime(X_test[date_col]).dt.day_of_week
283
+ new_features = new_features + ['Week_number']
284
+
285
+ if sine_cosine:
286
+ # Sprint3 - create panel wise sine cosine waves in xtrain tuned. add to new feature set
287
+ if is_panel:
288
+ new_features = new_features + ['sine_wave', 'cosine_wave']
289
+ newdata = pd.DataFrame()
290
+ newdata_test = pd.DataFrame()
291
+ groups = X_train_tuned.groupby(panel_col)
292
+ frequency = 2 * np.pi / 365 # Adjust the frequency as needed
293
+
294
+ train_panel_wise_end_point = {}
295
+ for panel, groupdf in groups:
296
+ num_samples = len(groupdf)
297
+ train_panel_wise_end_point[panel] = num_samples
298
+ days_since_start = np.arange(num_samples)
299
+ sine_wave = np.sin(frequency * days_since_start)
300
+ cosine_wave = np.cos(frequency * days_since_start)
301
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
302
+ assert len(sine_cosine_df) == len(groupdf)
303
+ # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
304
+ groupdf['sine_wave'] = sine_wave
305
+ groupdf['cosine_wave'] = cosine_wave
306
+ newdata = pd.concat([newdata, groupdf])
307
+
308
+ X_train_tuned = newdata.copy()
309
+
310
+ test_groups = X_test_tuned.groupby(panel_col)
311
+ for panel, test_groupdf in test_groups:
312
+ num_samples = len(test_groupdf)
313
+ start = train_panel_wise_end_point[panel]
314
+ days_since_start = np.arange(start, start + num_samples, 1)
315
+ # print("##", panel, num_samples, start, len(np.arange(start, start+num_samples, 1)))
316
+ sine_wave = np.sin(frequency * days_since_start)
317
+ cosine_wave = np.cos(frequency * days_since_start)
318
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
319
+ assert len(sine_cosine_df) == len(test_groupdf)
320
+ # groupdf = pd.concat([groupdf, sine_cosine_df], axis=1)
321
+ test_groupdf['sine_wave'] = sine_wave
322
+ test_groupdf['cosine_wave'] = cosine_wave
323
+ newdata_test = pd.concat([newdata_test, test_groupdf])
324
+
325
+ X_test_tuned = newdata_test.copy()
326
+
327
+
328
+ else:
329
+ new_features = new_features + ['sine_wave', 'cosine_wave']
330
+
331
+ num_samples = len(X_train_tuned)
332
+ frequency = 2 * np.pi / 365 # Adjust the frequency as needed
333
+ days_since_start = np.arange(num_samples)
334
+ sine_wave = np.sin(frequency * days_since_start)
335
+ cosine_wave = np.cos(frequency * days_since_start)
336
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
337
+ # Concatenate the sine and cosine waves with the scaled X DataFrame
338
+ X_train_tuned = pd.concat([X_train_tuned, sine_cosine_df], axis=1)
339
+
340
+ test_num_samples = len(X_test_tuned)
341
+ start = num_samples
342
+ days_since_start = np.arange(start, start + test_num_samples, 1)
343
+ sine_wave = np.sin(frequency * days_since_start)
344
+ cosine_wave = np.cos(frequency * days_since_start)
345
+ sine_cosine_df = pd.DataFrame({'sine_wave': sine_wave, 'cosine_wave': cosine_wave})
346
+ # Concatenate the sine and cosine waves with the scaled X DataFrame
347
+ X_test_tuned = pd.concat([X_test_tuned, sine_cosine_df], axis=1)
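+ # Seasonality sketch: frequency = 2*pi/365 encodes one full cycle per 365 rows, so
+ # days_since_start = 182 gives sine ~ 0 and cosine ~ -1 (roughly half-way through
+ # the cycle); for weekly data the frequency would need adjusting, as noted above.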
348
+
349
+ # model
350
+ if selected_options:
351
+ new_features = new_features + selected_options
352
+ if is_panel:
353
+ inp_vars_str = " + ".join(new_features)
354
+ new_features=list(set(new_features))
355
+ # X_train_tuned.to_csv("Test/X_train_tuned.csv",index=False)
356
+ # st.write(X_train_tuned[['total_approved_accounts_revenue'] + new_features].dtypes)
357
+ # st.write(X_train_tuned[['total_approved_accounts_revenue', panel_col] + new_features].isna().sum())
358
+ md_str = target_col + " ~ " + inp_vars_str
359
+ md_tuned = smf.mixedlm(md_str,
360
+ data=X_train_tuned[[target_col] + new_features],
361
+ groups=X_train_tuned[panel_col])
362
+ model_tuned = md_tuned.fit()
363
+
364
+ # plot act v pred for original model and tuned model
365
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train[date_col], y_train,
366
+ model.fittedvalues, model,
367
+ target_column=sel_target_col,
368
+ is_panel=True)
369
+ metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(X_train_tuned[date_col],
370
+ X_train_tuned[target_col],
371
+ model_tuned.fittedvalues,
372
+ model_tuned,
373
+ target_column=sel_target_col,
374
+ is_panel=True)
375
+
376
+ else:
377
+ new_features=list(set(new_features))
378
+ # st.write(new_features)
379
+ model_tuned = sm.OLS(y_train, X_train_tuned[new_features]).fit()
380
+ # st.write(X_train_tuned.columns)
381
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date[:130], y_train,
382
+ model.predict(X_train[features_set]), model,
383
+ target_column=sel_target_col)
384
+ metrics_table_tuned, line, actual_vs_predicted_plot_tuned = plot_actual_vs_predicted(date[:130], y_train,
385
+ model_tuned.predict(
386
+ X_train_tuned),
387
+ model_tuned,
388
+ target_column=sel_target_col)
389
+
390
+ # st.write(metrics_table_tuned)
391
+ mape = np.round(metrics_table.iloc[0, 1], 2)
392
+ r2 = np.round(metrics_table.iloc[1, 1], 2)
393
+ adjr2 = np.round(metrics_table.iloc[2, 1], 2)
394
+
395
+ mape_tuned = np.round(metrics_table_tuned.iloc[0, 1], 2)
396
+ r2_tuned = np.round(metrics_table_tuned.iloc[1, 1], 2)
397
+ adjr2_tuned = np.round(metrics_table_tuned.iloc[2, 1], 2)
398
+
399
+ parameters_ = st.columns(3)
400
+ with parameters_[0]:
401
+ st.metric('R2', r2_tuned, np.round(r2_tuned - r2, 2))
402
+ with parameters_[1]:
403
+ st.metric('Adjusted R2', adjr2_tuned, np.round(adjr2_tuned - adjr2, 2))
404
+ with parameters_[2]:
405
+ st.metric('MAPE', mape_tuned, np.round(mape_tuned - mape, 2), 'inverse')
406
+ st.write(model_tuned.summary())
407
+
408
+ X_train_tuned[date_col] = X_train[date_col]
409
+ X_test_tuned[date_col] = X_test[date_col]
410
+ X_train_tuned[target_col] = y_train
411
+ X_test_tuned[target_col] = y_test
412
+
413
+ st.header('2.2 Actual vs. Predicted Plot')
414
+ # if is_panel:
415
+ # metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(date, y_train, model.predict(X_train),
416
+ # model, target_column='Revenue',is_panel=True)
417
+ # else:
418
+ # metrics_table,line,actual_vs_predicted_plot=plot_actual_vs_predicted(date, y_train, model.predict(X_train), model,target_column='Revenue')
419
+ if is_panel :
420
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train_tuned[date_col],
421
+ X_train_tuned[target_col],
422
+ model_tuned.fittedvalues, model_tuned,
423
+ target_column=sel_target_col,
424
+ is_panel=True)
425
+ else :
426
+ metrics_table, line, actual_vs_predicted_plot = plot_actual_vs_predicted(X_train_tuned[date_col],
427
+ X_train_tuned[target_col],
428
+ model_tuned.predict(X_train_tuned[new_features]),
429
+ model_tuned,
430
+ target_column=sel_target_col,
431
+ is_panel=False)
432
+ # plot_actual_vs_predicted(X_train[date_col], y_train,
433
+ # model.fittedvalues, model,
434
+ # target_column='Revenue',
435
+ # is_panel=is_panel)
436
+
437
+ st.plotly_chart(actual_vs_predicted_plot, use_container_width=True)
438
+
439
+ st.markdown('## 2.3 Residual Analysis')
440
+ if is_panel :
441
+ columns = st.columns(2)
442
+ with columns[0]:
443
+ fig = plot_residual_predicted(y_train, model_tuned.fittedvalues, X_train_tuned)
444
+ st.plotly_chart(fig)
445
+
446
+ with columns[1]:
447
+ st.empty()
448
+ fig = qqplot(y_train, model_tuned.fittedvalues)
449
+ st.plotly_chart(fig)
450
+
451
+ with columns[0]:
452
+ fig = residual_distribution(y_train, model_tuned.fittedvalues)
453
+ st.pyplot(fig)
454
+ else:
455
+ columns = st.columns(2)
456
+ with columns[0]:
457
+ fig = plot_residual_predicted(y_train, model_tuned.predict(X_train_tuned[new_features]), X_train)
458
+ st.plotly_chart(fig)
459
+
460
+ with columns[1]:
461
+ st.empty()
462
+ fig = qqplot(y_train, model_tuned.predict(X_train_tuned[new_features]))
463
+ st.plotly_chart(fig)
464
+
465
+ with columns[0]:
466
+ fig = residual_distribution(y_train, model_tuned.predict(X_train_tuned[new_features]))
467
+ st.pyplot(fig)
468
+
469
+ st.session_state['is_tuned_model'][target_col] = True
470
+ # Sprint4 - saved tuned model in a dict
471
+ st.session_state['Model_Tuned'][sel_model + "__" + target_col] = {
472
+ "Model_object": model_tuned,
473
+ 'feature_set': new_features,
474
+ 'X_train_tuned': X_train_tuned,
475
+ 'X_test_tuned': X_test_tuned
476
+ }
477
+
478
+ # Pending
479
+ # if st.session_state['build_tuned_model']==True:
480
+ if st.session_state['Model_Tuned'] is not None :
481
+ if st.checkbox('Use this model to build response curves', key='save_model'):
482
+ # save_model = st.button('Use this model to build response curves', key='saved_tuned_model')
483
+ # if save_model:
484
+ st.session_state["is_tuned_model"][target_col]=True
485
+ with open("tuned_model.pkl", "wb") as f:
486
+ # pickle.dump(st.session_state['tuned_model'], f)
487
+ pickle.dump(st.session_state['Model_Tuned'], f) # Sprint4
488
+
489
+ # X_test_tuned.to_csv("Test/X_test_tuned_final.csv", index=False)
490
+ # X_train_tuned.to_csv("Test/X_train_tuned.csv", index=False)
491
+ st.success(sel_model + "__" + target_col + ' tuned model saved!')
492
+
493
+
494
+ # if is_panel:
495
+ # # st.session_state["tuned_model_features"] = new_features
496
+ # with open("tuned_model.pkl", "wb") as f:
497
+ # # pickle.dump(st.session_state['tuned_model'], f)
498
+ # pickle.dump(st.session_state['Model_Tuned'], f) # Sprint4
499
+ # st.success(sel_model + "__" + target_col + ' Tuned saved!')
500
+
501
+ # raw_data=df[features_set]
502
+ # columns_raw=[re.split(r"(_lag|_adst)",col)[0] for col in raw_data.columns]
503
+ # raw_data.columns=columns_raw
504
+ # columns_media=[col for col in columns_raw if Categorised_data[col]['BB']=='Media']
505
+ # raw_data=raw_data[columns_media]
506
+
507
+ # raw_data['Date']=list(df.index)
508
+
509
+ # spends_var=[col for col in df.columns if "spends" in col.lower() and 'adst' not in col.lower() and 'lag' not in col.lower()]
510
+ # spends_df=df[spends_var]
511
+ # spends_df['Week']=list(df.index)
512
+
513
+
514
+ # j=0
515
+ # X1=X.copy()
516
+ # col=X1.columns
517
+ # for i in model.params.values:
518
+ # X1[col[j]]=X1.iloc[:,j]*i
519
+ # j+=1
520
+ # contribution_df=X1
521
+ # contribution_df['Date']=list(df.index)
522
+ # excel_file='Overview_data.xlsx'
523
+
524
+ # with pd.ExcelWriter(excel_file,engine='xlsxwriter') as writer:
525
+ # raw_data.to_excel(writer,sheet_name='RAW DATA MMM',index=False)
526
+ # spends_df.to_excel(writer,sheet_name='SPEND INPUT',index=False)
527
+ # contribution_df.to_excel(writer,sheet_name='CONTRIBUTION MMM')
pages/6_Model_Result_Overview.py ADDED
@@ -0,0 +1,348 @@
1
+ '''
2
+ MMO Build Sprint 3
3
+ additions : contributions calculated using tuned Mixed LM model
4
+ pending : contribution calculations using - 1. untuned Mixed LM model, 2. tuned OLS model, 3. untuned OLS model
5
+
6
+ MMO Build Sprint 4
7
+ additions : response metrics selection
8
+ pending : contribution calculations using - 1. untuned Mixed LM model, 2. tuned OLS model, 3. untuned OLS model
9
+ '''
10
+
11
+ import streamlit as st
12
+ import pandas as pd
13
+ from sklearn.preprocessing import MinMaxScaler
14
+ import pickle
15
+
16
+
17
+
18
+ from utilities_with_panel import (set_header,
19
+ overview_test_data_prep_panel,
20
+ overview_test_data_prep_nonpanel,
21
+ initialize_data,
22
+ load_local_css,
23
+ create_channel_summary,
24
+ create_contribution_pie,
25
+ create_contribuion_stacked_plot,
26
+ create_channel_spends_sales_plot,
27
+ format_numbers,
28
+ channel_name_formating)
29
+
30
+ import plotly.graph_objects as go
31
+ import streamlit_authenticator as stauth
32
+ import yaml
33
+ from yaml import SafeLoader
34
+ import time
35
+
36
+ st.set_page_config(layout='wide')
37
+ load_local_css('styles.css')
38
+ set_header()
39
+
40
+
41
+ def get_random_effects(media_data, panel_col, mdf):
42
+ random_eff_df = pd.DataFrame(columns=[panel_col, "random_effect"])
43
+
44
+ for i, market in enumerate(media_data[panel_col].unique()):
45
+ print(i, end='\r')
46
+ intercept = mdf.random_effects[market].values[0]
47
+ random_eff_df.loc[i, 'random_effect'] = intercept
48
+ random_eff_df.loc[i, panel_col] = market
49
+
50
+ return random_eff_df
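+ # Sketch of what this relies on: for a fitted MixedLM, mdf.random_effects is a dict
+ # keyed by group (here each market/panel) whose values hold that group's estimated
+ # random intercept, so the loop simply flattens it into one row per panel.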
51
+
52
+
53
+ def process_train_and_test(train, test, features, panel_col, target_col):
54
+ X1 = train[features]
55
+
56
+ ss = MinMaxScaler()
57
+ X1 = pd.DataFrame(ss.fit_transform(X1), columns=X1.columns)
58
+
59
+ X1[panel_col] = train[panel_col]
60
+ X1[target_col] = train[target_col]
61
+
62
+ if test is not None:
63
+ X2 = test[features]
64
+ X2 = pd.DataFrame(ss.transform(X2), columns=X2.columns)
65
+ X2[panel_col] = test[panel_col]
66
+ X2[target_col] = test[target_col]
67
+ return X1, X2
68
+ return X1
69
+
70
+ def mdf_predict(X_df, mdf, random_eff_df) :
71
+ X=X_df.copy()
72
+ X=pd.merge(X, random_eff_df[[panel_col,'random_effect']], on=panel_col, how='left')
73
+ X['pred_fixed_effect'] = mdf.predict(X)
74
+
75
+ X['pred'] = X['pred_fixed_effect'] + X['random_effect']
76
+ X.to_csv('Test/merged_df_contri.csv',index=False)
77
+ X.drop(columns=['pred_fixed_effect', 'random_effect'], inplace=True)
78
+
79
+ return X
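+ # Why the random effect is added back: MixedLMResults.predict() returns only the
+ # fixed-effects part of the prediction, so the per-panel intercept from
+ # random_eff_df is merged in and added to obtain the full prediction in 'pred'.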
80
+
81
+
82
+ target='Revenue'
83
+
84
+ # is_panel=False
85
+ # is_panel = st.session_state['is_panel']
86
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1'] ] [0]# set the panel column
87
+ date_col = 'date'
88
+
89
+ #st.write(media_data)
90
+
91
+ is_panel = True if len(panel_col)>0 else False
92
+
93
+ # panel_col='markets'
94
+ date_col = 'date'
95
+
96
+ # Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
97
+ if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics']!=[]:
98
+ sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
99
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
100
+ else :
101
+ sel_target_col = 'Total Approved Accounts - Revenue'
102
+ target_col = 'total_approved_accounts_revenue'
103
+
104
+ # Sprint4 - Look through all saved tuned models, only show saved models of the sel resp metric (target_col)
105
+ # saved_models = st.session_state['saved_model_names']
106
+ # Sprint4 - get the model obj of the selected model
107
+ # st.write(sel_model_dict)
108
+
109
+ # Sprint3 - Contribution
110
+ if is_panel:
111
+ # read tuned mixedLM model
112
+ # if st.session_state["tuned_model"] is not None :
113
+
114
+ if st.session_state["is_tuned_model"][target_col]==True: #Sprint4
115
+ with open("tuned_model.pkl", 'rb') as file:
116
+ model_dict = pickle.load(file)
117
+ saved_models = list(model_dict.keys())
118
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
119
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
120
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
121
+
122
+ # model=st.session_state["tuned_model"]
123
+ # X_train=st.session_state["X_train_tuned"]
124
+ # X_test=st.session_state["X_test_tuned"]
125
+ # best_feature_set=st.session_state["tuned_model_features"]
126
+ model=sel_model_dict["Model_object"]
127
+ X_train=sel_model_dict["X_train_tuned"]
128
+ X_test=sel_model_dict["X_test_tuned"]
129
+ best_feature_set=sel_model_dict["feature_set"]
130
+
131
+ # st.write("features", best_feature_set)
132
+ # st.write(X_test.columns)
133
+
134
+ else : # if non tuned model to be used # Pending
135
+ with open("best_models.pkl", 'rb') as file:
136
+ model_dict = pickle.load(file)
137
+ saved_models = list(model_dict.keys())
138
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
139
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
140
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
141
+ model=st.session_state["base_model"]
142
+ X_train = st.session_state['X_train']
143
+ X_test = st.session_state['X_test']
144
+ # y_train = st.session_state['y_train']
145
+ # y_test = st.session_state['y_test']
146
+ best_feature_set = st.session_state['base_model_feature_set']
147
+ # st.write(best_feature_set)
148
+ # st.write(X_test.columns)
149
+
150
+ # Calculate contributions
151
+
152
+ with open("data_import.pkl", "rb") as f:
153
+ data = pickle.load(f)
154
+
155
+ # Accessing the loaded objects
156
+ st.session_state['orig_media_data'] = data["final_df"]
157
+
158
+ st.session_state['orig_media_data'].columns=[col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['orig_media_data'].columns]
159
+
160
+ media_data = st.session_state["media_data"]
161
+
162
+
163
+ # st.session_state['orig_media_data']=st.session_state["media_data"]
164
+
165
+ #st.write(media_data)
166
+
167
+ contri_df = pd.DataFrame()
168
+
169
+ y = []
170
+ y_pred = []
171
+
172
+ random_eff_df = get_random_effects(media_data, panel_col, model)
173
+ random_eff_df['fixed_effect'] = model.fe_params['Intercept']
174
+ random_eff_df['panel_effect'] = random_eff_df['random_effect'] + random_eff_df['fixed_effect']
175
+ # random_eff_df.to_csv("Test/random_eff_df_contri.csv", index=False)
176
+
177
+ coef_df = pd.DataFrame(model.fe_params)
178
+ coef_df.columns = ['coef']
179
+
180
+ # coef_df.reset_index().to_csv("Test/coef_df_contri1.csv",index=False)
181
+ # print(model.fe_params)
182
+
183
+ x_train_contribution = X_train.copy()
184
+ x_test_contribution = X_test.copy()
185
+
186
+ # preprocessing not needed since X_train is already preprocessed
187
+ # X1, X2 = process_train_and_test(x_train_contribution, x_test_contribution, best_feature_set, panel_col, target_col)
188
+ # x_train_contribution[best_feature_set] = X1[best_feature_set]
189
+ # x_test_contribution[best_feature_set] = X2[best_feature_set]
190
+
191
+ x_train_contribution = mdf_predict(x_train_contribution, model, random_eff_df)
192
+ x_test_contribution = mdf_predict(x_test_contribution, model, random_eff_df)
193
+
194
+ x_train_contribution = pd.merge(x_train_contribution, random_eff_df[[panel_col, 'panel_effect']], on=panel_col,
195
+ how='left')
196
+ x_test_contribution = pd.merge(x_test_contribution, random_eff_df[[panel_col, 'panel_effect']], on=panel_col,
197
+ how='left')
198
+
199
+ inp_coef = coef_df['coef'][1:].tolist() # 0th index is intercept
200
+
201
+ for i in range(len(inp_coef)):
202
+ x_train_contribution[str(best_feature_set[i]) + "_contr"] = inp_coef[i] * x_train_contribution[best_feature_set[i]]
203
+ x_test_contribution[str(best_feature_set[i]) + "_contr"] = inp_coef[i] * x_test_contribution[best_feature_set[i]]
204
+
205
+ x_train_contribution['sum_contributions'] = x_train_contribution.filter(regex="contr").sum(axis=1)
206
+ x_train_contribution['sum_contributions'] = x_train_contribution['sum_contributions'] + x_train_contribution['panel_effect']
207
+
208
+ x_test_contribution['sum_contributions'] = x_test_contribution.filter(regex="contr").sum(axis=1)
209
+ x_test_contribution['sum_contributions'] = x_test_contribution['sum_contributions'] + x_test_contribution['panel_effect']
210
+
211
+ # # test
212
+ x_train_contribution.to_csv("Test/x_train_contribution.csv",index=False)
213
+ x_test_contribution.to_csv("Test/x_test_contribution.csv",index=False)
214
+ #
215
+ # st.session_state['orig_media_data'].to_csv("Test/transformed_data.csv",index=False)
216
+ # st.session_state['X_test_spends'].to_csv("Test/test_spends.csv",index=False)
217
+ # # st.write(st.session_state['orig_media_data'].columns)
218
+
219
+ st.write(date_col,panel_col)
220
+ # st.write(x_test_contribution)
221
+
222
+ overview_test_data_prep_panel(x_test_contribution, st.session_state['orig_media_data'], st.session_state['X_test_spends'],
223
+ date_col, panel_col, target_col)
224
+
225
+ else : # NON PANEL
226
+ if st.session_state["is_tuned_model"][target_col]==True: #Sprint4
227
+ with open("tuned_model.pkl", 'rb') as file:
228
+ model_dict = pickle.load(file)
229
+ saved_models = list(model_dict.keys())
230
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
231
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
232
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
233
+
234
+ model=sel_model_dict["Model_object"]
235
+ X_train=sel_model_dict["X_train_tuned"]
236
+ X_test=sel_model_dict["X_test_tuned"]
237
+ best_feature_set=sel_model_dict["feature_set"]
238
+
239
+ else : #Sprint4
240
+ with open("best_models.pkl", 'rb') as file:
241
+ model_dict = pickle.load(file)
242
+ saved_models = list(model_dict.keys())
243
+ required_saved_models = [m.split("__")[0] for m in saved_models if m.split("__")[1] == target_col]
244
+ sel_model = st.selectbox("Select the model to review", required_saved_models)
245
+ sel_model_dict = model_dict[sel_model + "__" + target_col]
246
+
247
+ model=sel_model_dict["Model_object"]
248
+ X_train=sel_model_dict["X_train"]
249
+ X_test=sel_model_dict["X_test"]
250
+ best_feature_set=sel_model_dict["feature_set"]
251
+
252
+ x_train_contribution = X_train.copy()
253
+ x_test_contribution = X_test.copy()
254
+
255
+ x_train_contribution['pred'] = model.predict(x_train_contribution[best_feature_set])
256
+ x_test_contribution['pred'] = model.predict(x_test_contribution[best_feature_set])
257
+
258
+ for num,i in enumerate(model.params.values):
259
+ col=best_feature_set[num]
260
+ x_train_contribution[col + "_contr"] = X_train[col] * i
261
+ x_test_contribution[col + "_contr"] = X_test[col] * i
262
+
263
+ x_test_contribution.to_csv("Test/x_test_contribution_non_panel.csv",index=False)
264
+ overview_test_data_prep_nonpanel(x_test_contribution, st.session_state['orig_media_data'].copy(), st.session_state['X_test_spends'].copy(), date_col, target_col)
265
+ # for k, v in st.session_sta
266
+ # te.items():
267
+
268
+ # if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
269
+ # st.session_state[k] = v
270
+
271
+ # authenticator = st.session_state.get('authenticator')
272
+
273
+ # if authenticator is None:
274
+ # authenticator = load_authenticator()
275
+
276
+ # name, authentication_status, username = authenticator.login('Login', 'main')
277
+ # auth_status = st.session_state['authentication_status']
278
+
279
+ # if auth_status:
280
+ # authenticator.logout('Logout', 'main')
281
+
282
+ # is_state_initiaized = st.session_state.get('initialized',False)
283
+ # if not is_state_initiaized:
284
+
285
+ initialize_data(target_col)
286
+ scenario = st.session_state['scenario']
287
+ raw_df = st.session_state['raw_df']
288
+ st.header('Overview of previous spends')
289
+
290
+ # st.write(scenario.actual_total_spends)
291
+ # st.write(scenario.actual_total_sales)
292
+ columns = st.columns((1,1,3))
293
+
294
+ with columns[0]:
295
+ st.metric(label='Spends', value=format_numbers(float(scenario.actual_total_spends)))
296
+ ###print(f"##################### {scenario.actual_total_sales} ##################")
297
+ with columns[1]:
298
+ st.metric(label=target, value=format_numbers(float(scenario.actual_total_sales),include_indicator=False))
299
+
300
+
301
+ actual_summary_df = create_channel_summary(scenario)
302
+ actual_summary_df['Channel'] = actual_summary_df['Channel'].apply(channel_name_formating)
303
+
304
+ columns = st.columns((2,1))
305
+ with columns[0]:
306
+ with st.expander('Channel wise overview'):
307
+ st.markdown(actual_summary_df.style.set_table_styles(
308
+ [{
309
+ 'selector': 'th',
310
+ 'props': [('background-color', '#11B6BD')]
311
+ },
312
+ {
313
+ 'selector' : 'tr:nth-child(even)',
314
+ 'props' : [('background-color', '#11B6BD')]
315
+ }]).to_html(), unsafe_allow_html=True)
316
+
317
+ st.markdown("<hr>",unsafe_allow_html=True)
318
+ ##############################
319
+
320
+ st.plotly_chart(create_contribution_pie(scenario),use_container_width=True)
321
+ st.markdown("<hr>",unsafe_allow_html=True)
322
+
323
+
324
+ ################################3
325
+ st.plotly_chart(create_contribuion_stacked_plot(scenario),use_container_width=True)
326
+ st.markdown("<hr>",unsafe_allow_html=True)
327
+ #######################################
328
+
329
+ selected_channel_name = st.selectbox('Channel', st.session_state['channels_list'] + ['non media'], format_func=channel_name_formating)
330
+ selected_channel = scenario.channels.get(selected_channel_name,None)
331
+
332
+ st.plotly_chart(create_channel_spends_sales_plot(selected_channel), use_container_width=True)
333
+
334
+ st.markdown("<hr>",unsafe_allow_html=True)
335
+
336
+ # elif auth_status == False:
337
+ # st.error('Username/Password is incorrect')
338
+
339
+ # if auth_status != True:
340
+ # try:
341
+ # username_forgot_pw, email_forgot_password, random_password = authenticator.forgot_password('Forgot password')
342
+ # if username_forgot_pw:
343
+ # st.success('New password sent securely')
344
+ # # Random password to be transferred to user securely
345
+ # elif username_forgot_pw == False:
346
+ # st.error('Username not found')
347
+ # except Exception as e:
348
+ # st.error(e)
pages/7_Build_Response_Curves.py ADDED
@@ -0,0 +1,185 @@
1
+ import streamlit as st
2
+ import plotly.express as px
3
+ import numpy as np
4
+ import plotly.graph_objects as go
5
+ from utilities_with_panel import channel_name_formating, load_authenticator, initialize_data
6
+ from sklearn.metrics import r2_score
7
+ from collections import OrderedDict
8
+ from classes import class_from_dict,class_to_dict
9
+ import pickle
10
+ import json
11
+ from utilities import (
12
+ load_local_css,
13
+ set_header,
14
+ channel_name_formating,
15
+ )
16
+
17
+ for k, v in st.session_state.items():
18
+ if k not in ['logout', 'login','config'] and not k.startswith('FormSubmitter'):
19
+ st.session_state[k] = v
20
+
21
+ def s_curve(x,K,b,a,x0):
22
+ return K / (1 + b*np.exp(-a*(x-x0)))
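+ # Informal reading of the parameters: K is the saturation ceiling, x0 shifts the
+ # inflection point, a sets the steepness and b rescales the early part of the
+ # curve; e.g. s_curve(x0, K, b, a, x0) == K / (1 + b), so b = 1 puts the curve at
+ # K/2 when spends equal x0.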
23
+
24
+ def save_scenario(scenario_name):
25
+ """
26
+ Save the current scenario with the mentioned name in the session state
27
+
28
+ Parameters
29
+ ----------
30
+ scenario_name
31
+ Name of the scenario to be saved
32
+ """
33
+ if 'saved_scenarios' not in st.session_state:
34
+ st.session_state['saved_scenarios'] = OrderedDict()
35
+
36
+ #st.session_state['saved_scenarios'][scenario_name] = st.session_state['scenario'].save()
37
+ st.session_state['saved_scenarios'][scenario_name] = class_to_dict(st.session_state['scenario'])
38
+ st.session_state['scenario_input'] = ""
39
+ print(type(st.session_state['saved_scenarios']))
40
+ with open('../saved_scenarios.pkl', 'wb') as f:
41
+ pickle.dump(st.session_state['saved_scenarios'],f)
42
+
43
+
44
+ def reset_curve_parameters():
45
+ del st.session_state['K']
46
+ del st.session_state['b']
47
+ del st.session_state['a']
48
+ del st.session_state['x0']
49
+
50
+ def update_response_curve():
51
+ # st.session_state['rcs'][selected_channel_name]['K'] = st.session_state['K']
52
+ # st.session_state['rcs'][selected_channel_name]['b'] = st.session_state['b']
53
+ # st.session_state['rcs'][selected_channel_name]['a'] = st.session_state['a']
54
+ # st.session_state['rcs'][selected_channel_name]['x0'] = st.session_state['x0']
55
+ # rcs = st.session_state['rcs']
56
+ _channel_class = st.session_state['scenario'].channels[selected_channel_name]
57
+ _channel_class.update_response_curves({
58
+ 'K' : st.session_state['K'],
59
+ 'b' : st.session_state['b'],
60
+ 'a' : st.session_state['a'],
61
+ 'x0' : st.session_state['x0']})
62
+
63
+
64
+ # authenticator = st.session_state.get('authenticator')
65
+ # if authenticator is None:
66
+ # authenticator = load_authenticator()
67
+
68
+ # name, authentication_status, username = authenticator.login('Login', 'main')
69
+ # auth_status = st.session_state.get('authentication_status')
70
+
71
+ # if auth_status == True:
72
+ # is_state_initiaized = st.session_state.get('initialized',False)
73
+ # if not is_state_initiaized:
74
+ # print("Scenario page state reloaded")
75
+
76
+ # Sprint4 - if used_response_metrics is not blank, then select one of the used_response_metrics, else target is revenue by default
77
+ st.set_page_config(layout='wide')
78
+ load_local_css('styles.css')
79
+ set_header()
80
+
81
+ if "used_response_metrics" in st.session_state and st.session_state['used_response_metrics']!=[]:
82
+ sel_target_col = st.selectbox("Select the response metric", st.session_state['used_response_metrics'])
83
+ target_col = sel_target_col.lower().replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_")
84
+ else :
85
+ sel_target_col = 'Total Approved Accounts - Revenue'
86
+ target_col = 'total_approved_accounts_revenue'
87
+
88
+ initialize_data(target_col)
89
+
90
+ st.subheader("Build response curves")
91
+
92
+ channels_list = st.session_state['channels_list']
93
+ selected_channel_name = st.selectbox('Channel', st.session_state['channels_list'] + ['Others'], format_func=channel_name_formating,on_change=reset_curve_parameters)
94
+
95
+ rcs = {}
96
+ for channel_name in channels_list:
97
+ rcs[channel_name] = st.session_state['scenario'].channels[channel_name].response_curve_params
98
+ # rcs = st.session_state['rcs']
99
+
100
+
101
+ if 'K' not in st.session_state:
102
+ st.session_state['K'] = rcs[selected_channel_name]['K']
103
+ if 'b' not in st.session_state:
104
+ st.session_state['b'] = rcs[selected_channel_name]['b']
105
+ if 'a' not in st.session_state:
106
+ st.session_state['a'] = rcs[selected_channel_name]['a']
107
+ if 'x0' not in st.session_state:
108
+ st.session_state['x0'] = rcs[selected_channel_name]['x0']
109
+
110
+ x = st.session_state['actual_input_df'][selected_channel_name].values
111
+ y = st.session_state['actual_contribution_df'][selected_channel_name].values
112
+
113
+ power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
114
+
115
+ # fig = px.scatter(x, s_curve(x/10**power,
116
+ # st.session_state['K'],
117
+ # st.session_state['b'],
118
+ # st.session_state['a'],
119
+ # st.session_state['x0']))
120
+
121
+ fig = px.scatter(x=x, y=y)
122
+ fig.add_trace(go.Scatter(x=sorted(x), y=s_curve(sorted(x)/10**power,st.session_state['K'],
123
+ st.session_state['b'],
124
+ st.session_state['a'],
125
+ st.session_state['x0']),
126
+ line=dict(color='red')))
127
+
128
+ fig.update_layout(title_text="Response Curve",showlegend=False)
129
+ fig.update_annotations(font_size=10)
130
+ fig.update_xaxes(title='Spends')
131
+ fig.update_yaxes(title=sel_target_col)
132
+
133
+ st.plotly_chart(fig,use_container_width=True)
134
+
135
+ r2 = r2_score(y, s_curve(x / 10**power,
136
+ st.session_state['K'],
137
+ st.session_state['b'],
138
+ st.session_state['a'],
139
+ st.session_state['x0']))
140
+
141
+ st.metric('R2',round(r2,2))
142
+ columns = st.columns(4)
143
+
144
+ with columns[0]:
145
+ st.number_input('K',key='K',format="%0.5f")
146
+ with columns[1]:
147
+ st.number_input('b',key='b',format="%0.5f")
148
+ with columns[2]:
149
+ st.number_input('a',key='a',step=0.0001,format="%0.5f")
150
+ with columns[3]:
151
+ st.number_input('x0',key='x0',format="%0.5f")
152
+
153
+
154
+ st.button('Update parameters',on_click=update_response_curve)
155
+ st.button('Reset parameters',on_click=reset_curve_parameters)
156
+ scenario_name = st.text_input('Scenario name', key='scenario_input',placeholder='Scenario name',label_visibility='collapsed')
157
+ st.button('Save', on_click=lambda : save_scenario(scenario_name),disabled=len(st.session_state['scenario_input']) == 0)
158
+
159
+ file_name = st.text_input('rcs download file name', key='file_name_input',placeholder='file name',label_visibility='collapsed')
160
+ st.download_button(
161
+ label="Download response curves",
162
+ data=json.dumps(rcs),
163
+ file_name=f"{file_name}.json",
164
+ mime="application/json",
165
+ disabled= len(file_name) == 0,
166
+ )
167
+
168
+
169
+ def s_curve_derivative(x, K, b, a, x0):
170
+ # Derivative of the S-curve function
171
+ return a * b * K * np.exp(-a * (x - x0)) / ((1 + b * np.exp(-a * (x - x0))) ** 2)
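+ # Quick check: at x = x0 the derivative reduces to a*b*K / (1 + b)**2, so with b = 1
+ # the slope at the inflection point is a*K/4.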
172
+
173
+ # Parameters of the S-curve
174
+ K = st.session_state['K']
175
+ b = st.session_state['b']
176
+ a = st.session_state['a']
177
+ x0 = st.session_state['x0']
178
+
179
+ # Optimized spend value obtained from the tool
180
+ optimized_spend = st.number_input('value of x') # Replace this with your optimized spend value
181
+
182
+ # Calculate the slope at the optimized spend value
183
+ slope_at_optimized_spend = s_curve_derivative(optimized_spend, K, b, a, x0)
184
+
185
+ st.write("Slope ", slope_at_optimized_spend)
pages/8_Scenario_Planner.py CHANGED
@@ -23,34 +23,28 @@ import re
23
  import pandas as pd
24
  import plotly.express as px
25
 
26
- target = "Revenue"
27
  st.set_page_config(layout="wide")
28
  load_local_css("styles.css")
29
  set_header()
30
 
31
  for k, v in st.session_state.items():
32
- if k not in ["logout", "login", "config"] and not k.startswith(
33
- "FormSubmitter"
34
- ):
35
  st.session_state[k] = v
36
  # ======================================================== #
37
  # ======================= Functions ====================== #
38
  # ======================================================== #
39
 
40
 
41
- def optimize(key):
42
  """
43
  Optimize the spends for the sales
44
  """
45
 
46
  channel_list = [
47
- key
48
- for key, value in st.session_state["optimization_channels"].items()
49
- if value
50
  ]
51
- # print('channel_list')
52
- # print(channel_list)
53
- # print('@@@@@@@@')
54
  if len(channel_list) > 0:
55
  scenario = st.session_state["scenario"]
56
  if key.lower() == "media spends":
@@ -59,7 +53,8 @@ def optimize(key):
59
  result = st.session_state["scenario"].optimize(
60
  st.session_state["total_spends_change"], channel_list
61
  )
62
- elif key.lower() == "revenue":
 
63
  with status_placeholder:
64
  with st.spinner("Optimizing"):
65
 
@@ -69,14 +64,11 @@ def optimize(key):
69
  for channel_name, modified_spends in result:
70
 
71
  st.session_state[channel_name] = numerize(
72
- modified_spends
73
- * scenario.channels[channel_name].conversion_rate,
74
  1,
75
  )
76
  prev_spends = (
77
- st.session_state["scenario"]
78
- .channels[channel_name]
79
- .actual_total_spends
80
  )
81
  st.session_state[f"{channel_name}_change"] = round(
82
  100 * (modified_spends - prev_spends) / prev_spends, 2
@@ -105,15 +97,46 @@ def save_scenario(scenario_name):
105
  pickle.dump(st.session_state["saved_scenarios"], f)
106
 
107
 
 
 
 
108
  def update_sales_abs():
 
 
 
109
  actual_sales = _scenario.actual_total_sales
110
- if validate_input(st.session_state["total_sales_change_abs"]):
 
 
 
111
  modified_sales = extract_number_for_string(
112
  st.session_state["total_sales_change_abs"]
113
  )
114
  st.session_state["total_sales_change"] = round(
115
  ((modified_sales / actual_sales) - 1) * 100
116
  )
 
117
 
118
 
119
  def update_sales():
@@ -122,32 +145,95 @@ def update_sales():
122
  * _scenario.actual_total_sales,
123
  1,
124
  )
 
 
 
 
125
 
126
 
127
  def update_all_spends_abs():
 
 
 
128
  actual_spends = _scenario.actual_total_spends
129
- if validate_input(st.session_state["total_spends_change_abs"]):
 
 
 
130
  modified_spends = extract_number_for_string(
131
  st.session_state["total_spends_change_abs"]
132
  )
133
- print(modified_spends)
134
- print(actual_spends)
135
-
136
  st.session_state["total_spends_change"] = (
137
  (modified_spends / actual_spends) - 1
138
  ) * 100
 
 
 
139
 
140
  update_all_spends()
141
 
142
 
 
 
143
  def update_all_spends():
144
  """
145
  Updates spends for all the channels with the given overall spends change
146
  """
147
  percent_change = st.session_state["total_spends_change"]
148
- st.session_state["total_spends_change_abs"] = numerize(
149
- (1 + percent_change / 100) * _scenario.actual_total_spends, 1
150
- )
151
  for channel_name in st.session_state["channels_list"]:
152
  channel = st.session_state["scenario"].channels[channel_name]
153
  current_spends = channel.actual_total_spends
@@ -199,16 +285,10 @@ def update_data(channel_name):
199
  """
200
 
201
  if validate_input(st.session_state[channel_name]):
202
- modified_spends = extract_number_for_string(
203
- st.session_state[channel_name]
204
- )
205
  prev_spends = (
206
- st.session_state["scenario"]
207
- .channels[channel_name]
208
- .actual_total_spends
209
- * st.session_state["scenario"]
210
- .channels[channel_name]
211
- .conversion_rate
212
  )
213
  st.session_state[f"{channel_name}_change"] = round(
214
  100 * (modified_spends - prev_spends) / prev_spends, 2
@@ -216,9 +296,7 @@ def update_data(channel_name):
216
  st.session_state["scenario"].update(
217
  channel_name,
218
  modified_spends
219
- / st.session_state["scenario"]
220
- .channels[channel_name]
221
- .conversion_rate,
222
  )
223
  # st.session_state['scenario'].update(channel_name, modified_spends)
224
  # else:
@@ -249,31 +327,55 @@ def select_all_channels_for_optimization():
249
  st.session_state[f"{channel_name}_selected"] = st.session_state[
250
  "optimze_all_channels"
251
  ]
252
- st.session_state["optimization_channels"][channel_name] = (
253
- st.session_state["optimze_all_channels"]
254
- )
255
 
256
 
257
  def update_penalty():
258
  """
259
  Updates the penalty flag for sales calculation
260
  """
261
- st.session_state["scenario"].update_penalty(
262
- st.session_state["apply_penalty"]
263
- )
264
 
265
 
266
- def reset_scenario():
267
  # #print(st.session_state['default_scenario_dict'])
268
  # st.session_state['scenario'] = class_from_dict(st.session_state['default_scenario_dict'])
269
  # for channel in st.session_state['scenario'].channels.values():
270
  # st.session_state[channel.name] = float(channel.actual_total_spends * channel.conversion_rate)
271
- initialize_data()
 
 
 
272
  for channel_name in st.session_state["channels_list"]:
273
  st.session_state[f"{channel_name}_selected"] = False
274
  st.session_state[f"{channel_name}_change"] = 0
275
  st.session_state["optimze_all_channels"] = False
276
 
 
 
 
277
 
278
  def format_number(num):
279
  if num >= 1_000_000:
@@ -305,9 +407,7 @@ def summary_plot(data, x, y, title, text_column):
305
  hovertemplate="%{x:.2s}",
306
  )
307
 
308
- fig.update_layout(
309
- xaxis_title=x, yaxis_title="Channel Name", showlegend=False
310
- )
311
  return fig
312
 
313
 
@@ -342,27 +442,21 @@ def calculate_rgba(
342
  relative_position = (current_channel_spends - start_value) / (
343
  left_value - start_value
344
  )
345
- alpha = 0.8 - (
346
- 0.6 * relative_position
347
- ) # Alpha decreases from start to end
348
 
349
  elif left_value < current_channel_spends <= right_value:
350
  color = "green"
351
  relative_position = (current_channel_spends - left_value) / (
352
  right_value - left_value
353
  )
354
- alpha = 0.8 - (
355
- 0.6 * relative_position
356
- ) # Alpha decreases from start to end
357
 
358
  elif right_value < current_channel_spends <= end_value:
359
  color = "red"
360
  relative_position = (current_channel_spends - right_value) / (
361
  end_value - right_value
362
  )
363
- alpha = 0.2 + (
364
- 0.6 * relative_position
365
- ) # Alpha increases from start to end
366
 
367
  else:
368
  # Default case, if the spends are outside the defined ranges
@@ -432,9 +526,7 @@ def plot_response_curves():
432
 
433
  for index in range(len(x_plot)):
434
  marginal_roi.append(
435
- a
436
- * y[index]
437
- * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
438
  )
439
 
440
  x = (
@@ -466,9 +558,7 @@ def plot_response_curves():
466
  st.session_state["scenario"].channels[col].modified_total_spends
467
  * st.session_state["scenario"].channels[col].conversion_rate
468
  )
469
- y_optimal = (
470
- st.session_state["scenario"].channels[col].modified_total_sales
471
- )
472
 
473
  # if col == "Paid_social_others":
474
  # debug_temp(x_optimal * x_actual / x_actual.sum(), power, K, b, a, x0)
@@ -576,7 +666,7 @@ def plot_response_curves():
576
  fig.update_layout(
577
  # height=1000,
578
  # width=1000,
579
- title_text="Response Curves (X: Spends Vs Y: Revenue)",
580
  showlegend=False,
581
  shapes=shapes,
582
  )
@@ -718,12 +808,144 @@ authenticator = stauth.Authenticate(
718
  st.session_state["authenticator"] = authenticator
719
  name, authentication_status, username = authenticator.login("Login", "main")
720
  auth_status = st.session_state.get("authentication_status")
 
 
 
721
  if auth_status == True:
722
  authenticator.logout("Logout", "main")
 
 
 
 
 
723
  is_state_initiaized = st.session_state.get("initialized", False)
724
- if not is_state_initiaized:
725
- initialize_data()
 
 
 
726
 
 
727
  channels_list = st.session_state["channels_list"]
728
 
729
  # ======================================================== #
@@ -731,12 +953,16 @@ if auth_status == True:
731
  # ======================================================== #
732
 
733
  # print(list(st.session_state.keys()))
734
-
735
- st.header("Simulation")
736
  main_header = st.columns((2, 2))
737
  sub_header = st.columns((1, 1, 1, 1))
738
  _scenario = st.session_state["scenario"]
739
 
 
 
 
 
 
 
740
  if "total_spends_change_abs" not in st.session_state:
741
  st.session_state["total_spends_change_abs"] = numerize(
742
  _scenario.actual_total_spends, 1
@@ -747,6 +973,16 @@ if auth_status == True:
747
  _scenario.actual_total_sales, 1
748
  )
749
 
 
 
 
750
  with main_header[0]:
751
  st.subheader("Actual")
752
 
@@ -754,9 +990,7 @@ if auth_status == True:
754
  st.subheader("Simulated")
755
 
756
  with sub_header[0]:
757
- st.metric(
758
- label="Spends", value=format_numbers(_scenario.actual_total_spends)
759
- )
760
 
761
  with sub_header[1]:
762
  st.metric(
@@ -782,33 +1016,49 @@ if auth_status == True:
782
  delta=numerize(_scenario.delta_sales, 1),
783
  )
784
 
785
- with st.expander("Channel Spends Simulator"):
786
  _columns1 = st.columns((2, 2, 1, 1))
787
  with _columns1[0]:
788
-
789
  optimization_selection = st.selectbox(
790
- "Optimize", options=["Media Spends", "Revenue"], key="optimization_key"
791
  )
 
792
  with _columns1[1]:
793
  st.markdown("#")
 
 
 
794
  st.checkbox(
795
  label="Optimize all Channels",
796
- key=f"optimze_all_channels",
797
  value=False,
798
  on_change=select_all_channels_for_optimization,
799
  )
800
 
801
  with _columns1[2]:
802
  st.markdown("#")
803
- st.button(
804
- "Optimize",
805
- on_click=optimize,
806
- args=(st.session_state["optimization_key"],),
807
- )
 
 
 
808
 
809
  with _columns1[3]:
810
  st.markdown("#")
811
- st.button("Reset", on_click=reset_scenario)
 
 
 
 
 
812
 
813
  _columns2 = st.columns((2, 2, 2))
814
  if st.session_state["optimization_key"] == "Media Spends":
@@ -819,37 +1069,90 @@ if auth_status == True:
819
  # label_visibility="collapsed",
820
  on_change=update_all_spends_abs,
821
  )
822
- with _columns2[1]:
823
 
 
824
  st.number_input(
825
- "Percent",
826
- key=f"total_spends_change",
 
 
827
  step=1,
828
- on_change=update_all_spends,
 
 
 
 
829
  )
830
- elif st.session_state["optimization_key"] == "Revenue":
831
- with _columns2[0]:
832
 
 
 
833
  sales_input = st.text_input(
834
  "Absolute",
835
  key="total_sales_change_abs",
836
  on_change=update_sales_abs,
837
  )
 
838
  with _columns2[1]:
839
  st.number_input(
840
- "Percent change",
841
- key=f"total_sales_change",
 
 
842
  step=1,
843
  on_change=update_sales,
844
  )
 
 
 
 
845
 
846
- with _columns2[2]:
847
- st.markdown("#")
848
- status_placeholder = st.empty()
849
-
850
- st.markdown(
851
- """<hr class="spends-heading-seperator">""", unsafe_allow_html=True
 
 
 
 
 
 
852
  )
 
 
853
  _columns = st.columns((2.5, 2, 1.5, 1.5, 1))
854
  with _columns[0]:
855
  generate_spending_header("Channel")
@@ -862,9 +1165,7 @@ if auth_status == True:
862
  with _columns[4]:
863
  generate_spending_header("Optimize")
864
 
865
- st.markdown(
866
- """<hr class="spends-heading-seperator">""", unsafe_allow_html=True
867
- )
868
 
869
  if "acutual_predicted" not in st.session_state:
870
  st.session_state["acutual_predicted"] = {
@@ -874,9 +1175,7 @@ if auth_status == True:
874
  "Delta": [],
875
  }
876
  for i, channel_name in enumerate(channels_list):
877
- _channel_class = st.session_state["scenario"].channels[
878
- channel_name
879
- ]
880
  _columns = st.columns((2.5, 1.5, 1.5, 1.5, 1))
881
  with _columns[0]:
882
  st.write(channel_name_formating(channel_name))
@@ -885,12 +1184,8 @@ if auth_status == True:
885
  with _columns[1]:
886
  channel_bounds = _channel_class.bounds
887
  channel_spends = float(_channel_class.actual_total_spends)
888
- min_value = float(
889
- (1 + channel_bounds[0] / 100) * channel_spends
890
- )
891
- max_value = float(
892
- (1 + channel_bounds[1] / 100) * channel_spends
893
- )
894
  ##print(st.session_state[channel_name])
895
  spend_input = st.text_input(
896
  channel_name,
@@ -901,9 +1196,11 @@ if auth_status == True:
901
  if not validate_input(spend_input):
902
  st.error("Invalid input")
903
 
 
 
904
  st.number_input(
905
- "Percent change",
906
- key=f"{channel_name}_change",
907
  step=1,
908
  on_change=partial(update_data_by_percent, channel_name),
909
  )
@@ -915,12 +1212,10 @@ if auth_status == True:
915
  * _channel_class.conversion_rate
916
  )
917
  actual_channel_spends = float(
918
- _channel_class.actual_total_spends
919
- * _channel_class.conversion_rate
920
  )
921
  spends_delta = float(
922
- _channel_class.delta_spends
923
- * _channel_class.conversion_rate
924
  )
925
  st.session_state["acutual_predicted"]["Channel_name"].append(
926
  channel_name
@@ -928,12 +1223,10 @@ if auth_status == True:
928
  st.session_state["acutual_predicted"]["Actual_spend"].append(
929
  actual_channel_spends
930
  )
931
- st.session_state["acutual_predicted"][
932
- "Optimized_spend"
933
- ].append(current_channel_spends)
934
- st.session_state["acutual_predicted"]["Delta"].append(
935
- spends_delta
936
  )
 
937
  ## REMOVE
938
  st.metric(
939
  "Spends",
@@ -944,29 +1237,32 @@ if auth_status == True:
944
 
945
  with _columns[3]:
946
  # sales
947
- current_channel_sales = float(
948
- _channel_class.modified_total_sales
949
- )
950
  actual_channel_sales = float(_channel_class.actual_total_sales)
951
  sales_delta = float(_channel_class.delta_sales)
952
  st.metric(
953
  target,
954
- format_numbers(
955
- current_channel_sales, include_indicator=False
956
- ),
957
  delta=numerize(sales_delta, 1),
958
  label_visibility="collapsed",
959
  )
960
 
961
  with _columns[4]:
962
 
 
 
 
 
 
 
 
 
 
963
  st.checkbox(
964
  label="select for optimization",
965
  key=f"{channel_name}_selected",
966
  value=False,
967
- on_change=partial(
968
- select_channel_for_optimization, channel_name
969
- ),
970
  label_visibility="collapsed",
971
  )
972
 
@@ -978,20 +1274,29 @@ if auth_status == True:
978
  # Bins
979
  col = channels_list[i]
980
  x_actual = st.session_state["scenario"].channels[col].actual_spends
981
- x_modified = (
982
- st.session_state["scenario"].channels[col].modified_spends
983
- )
984
 
985
  x_total = x_modified.sum()
986
  power = np.ceil(np.log(x_actual.max()) / np.log(10)) - 3
987
 
988
- K = st.session_state["rcs"][col]["K"]
989
- b = st.session_state["rcs"][col]["b"]
990
- a = st.session_state["rcs"][col]["a"]
991
- x0 = st.session_state["rcs"][col]["x0"]
 
 
 
 
 
 
 
 
992
 
993
  x_plot = np.linspace(0, 5 * x_actual.sum(), 200)
994
 
 
 
 
995
  x, y, marginal_roi = [], [], []
996
  for x_p in x_plot:
997
  x.append(x_p * x_actual / x_actual.sum())
@@ -1001,9 +1306,7 @@ if auth_status == True:
1001
 
1002
  for index in range(len(x_plot)):
1003
  marginal_roi.append(
1004
- a
1005
- * y[index]
1006
- * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
1007
  )
1008
 
1009
  x = (
@@ -1018,12 +1321,18 @@ if auth_status == True:
1018
 
1019
  roi = y / np.maximum(x, np.finfo(float).eps)
1020
 
1021
- start_value, end_value, left_value, right_value = (
1022
- find_segment_value(
1023
- x,
1024
- roi,
1025
- marginal_roi,
1026
- )
 
 
 
 
 
 
1027
  )
1028
 
1029
  rgba = calculate_rgba(
@@ -1034,16 +1343,6 @@ if auth_status == True:
1034
  current_channel_spends,
1035
  )
1036
 
1037
- # Protecting division by zero by adding a small epsilon to denominators
1038
- roi_current = current_channel_sales / np.maximum(
1039
- current_channel_spends, np.finfo(float).eps
1040
- )
1041
- marginal_roi_current = (
1042
- st.session_state["scenario"]
1043
- .channels[col]
1044
- .get_marginal_roi("modified")
1045
- )
1046
-
1047
  with bin_placeholder:
1048
  st.markdown(
1049
  f"""
@@ -1061,7 +1360,7 @@ if auth_status == True:
1061
  unsafe_allow_html=True,
1062
  )
1063
 
1064
- with st.expander("See Response Curves"):
1065
  fig = plot_response_curves()
1066
  st.plotly_chart(fig, use_container_width=True)
1067
 
@@ -1081,19 +1380,11 @@ if auth_status == True:
1081
  )
1082
 
1083
  summary_df = pd.DataFrame(st.session_state["acutual_predicted"])
1084
- summary_df.drop_duplicates(
1085
- subset="Channel_name", keep="last", inplace=True
1086
- )
1087
 
1088
  summary_df_sorted = summary_df.sort_values(by="Delta", ascending=False)
1089
  summary_df_sorted["Delta_percent"] = np.round(
1090
- (
1091
- (
1092
- summary_df_sorted["Optimized_spend"]
1093
- / summary_df_sorted["Actual_spend"]
1094
- )
1095
- - 1
1096
- )
1097
  * 100,
1098
  2,
1099
  )
@@ -1121,9 +1412,9 @@ if auth_status != True:
1121
  authenticator.forgot_password("Forgot password")
1122
  )
1123
  if username_forgot_pw:
1124
- st.session_state["config"]["credentials"]["usernames"][
1125
- username_forgot_pw
1126
- ]["password"] = stauth.Hasher([random_password]).generate()[0]
1127
  send_email(email_forgot_password, random_password)
1128
  st.success("New password sent securely")
1129
  # Random password to be transferred to user securely
 
23
  import pandas as pd
24
  import plotly.express as px
25
 
26
+
27
  st.set_page_config(layout="wide")
28
  load_local_css("styles.css")
29
  set_header()
30
 
31
  for k, v in st.session_state.items():
32
+ if k not in ["logout", "login", "config"] and not k.startswith("FormSubmitter"):
 
 
33
  st.session_state[k] = v
34
  # ======================================================== #
35
  # ======================= Functions ====================== #
36
  # ======================================================== #
37
 
38
 
39
+ def optimize(key, status_placeholder):
40
  """
41
  Optimize the spends for the sales
42
  """
43
 
44
  channel_list = [
45
+ key for key, value in st.session_state["optimization_channels"].items() if value
 
 
46
  ]
47
+
 
 
48
  if len(channel_list) > 0:
49
  scenario = st.session_state["scenario"]
50
  if key.lower() == "media spends":
 
53
  result = st.session_state["scenario"].optimize(
54
  st.session_state["total_spends_change"], channel_list
55
  )
56
+ # elif key.lower() == "revenue":
57
+ else:
58
  with status_placeholder:
59
  with st.spinner("Optimizing"):
60
 
 
64
  for channel_name, modified_spends in result:
65
 
66
  st.session_state[channel_name] = numerize(
67
+ modified_spends * scenario.channels[channel_name].conversion_rate,
 
68
  1,
69
  )
70
  prev_spends = (
71
+ st.session_state["scenario"].channels[channel_name].actual_total_spends
 
 
72
  )
73
  st.session_state[f"{channel_name}_change"] = round(
74
  100 * (modified_spends - prev_spends) / prev_spends, 2
 
97
  pickle.dump(st.session_state["saved_scenarios"], f)
98
 
99
 
100
+ if "allow_spends_update" not in st.session_state:
101
+ st.session_state["allow_spends_update"] = True
102
+
103
+ if "allow_sales_update" not in st.session_state:
104
+ st.session_state["allow_sales_update"] = True
105
+
106
+
107
+ def update_sales_abs_slider():
108
+ actual_sales = _scenario.actual_total_sales
109
+ if validate_input(st.session_state["total_sales_change_abs_slider"]):
110
+ modified_sales = extract_number_for_string(
111
+ st.session_state["total_sales_change_abs_slider"]
112
+ )
113
+ st.session_state["total_sales_change"] = round(
114
+ ((modified_sales / actual_sales) - 1) * 100
115
+ )
116
+ st.session_state["total_sales_change_abs"] = numerize(modified_sales, 1)
117
+
118
+
119
  def update_sales_abs():
120
+ if (
121
+ st.session_state["total_sales_change_abs"]
122
+ in st.session_state["total_sales_change_abs_slider_options"]
123
+ ):
124
+ st.session_state["allow_sales_update"] = True
125
+ else:
126
+ st.session_state["allow_sales_update"] = False
127
+
128
  actual_sales = _scenario.actual_total_sales
129
+ if (
130
+ validate_input(st.session_state["total_sales_change_abs"])
131
+ and st.session_state["allow_sales_update"]
132
+ ):
133
  modified_sales = extract_number_for_string(
134
  st.session_state["total_sales_change_abs"]
135
  )
136
  st.session_state["total_sales_change"] = round(
137
  ((modified_sales / actual_sales) - 1) * 100
138
  )
139
+ st.session_state["total_sales_change_abs_slider"] = numerize(modified_sales, 1)
140
 
141
 
142
  def update_sales():
 
145
  * _scenario.actual_total_sales,
146
  1,
147
  )
148
+ st.session_state["total_sales_change_abs_slider"] = numerize(
149
+ (1 + st.session_state["total_sales_change"] / 100)
150
+ * _scenario.actual_total_sales,
151
+ 1,
152
+ )
153
+
154
+
155
+ def update_all_spends_abs_slider():
156
+ actual_spends = _scenario.actual_total_spends
157
+ if validate_input(st.session_state["total_spends_change_abs_slider"]):
158
+ modified_spends = extract_number_for_string(
159
+ st.session_state["total_spends_change_abs_slider"]
160
+ )
161
+ st.session_state["total_spends_change"] = round(
162
+ ((modified_spends / actual_spends) - 1) * 100
163
+ )
164
+ st.session_state["total_spends_change_abs"] = numerize(modified_spends, 1)
165
+
166
+ update_all_spends()
167
+
168
+
169
+ # def update_all_spends_abs_slider():
170
+ # actual_spends = _scenario.actual_total_spends
171
+ # if validate_input(st.session_state["total_spends_change_abs_slider"]):
172
+ # print("#" * 100)
173
+ # print(st.session_state["total_spends_change_abs_slider"])
174
+ # print("#" * 100)
175
+
176
+ # modified_spends = extract_number_for_string(
177
+ # st.session_state["total_spends_change_abs_slider"]
178
+ # )
179
+ # st.session_state["total_spends_change"] = (
180
+ # (modified_spends / actual_spends) - 1
181
+ # ) * 100
182
+ # st.session_state["total_spends_change_abs"] = st.session_state[
183
+ # "total_spends_change_abs_slider"
184
+ # ]
185
+
186
+ # update_all_spends()
187
 
188
 
189
  def update_all_spends_abs():
190
+ if (
191
+ st.session_state["total_spends_change_abs"]
192
+ in st.session_state["total_spends_change_abs_slider_options"]
193
+ ):
194
+ st.session_state["allow_spends_update"] = True
195
+ else:
196
+ st.session_state["allow_spends_update"] = False
197
+
198
  actual_spends = _scenario.actual_total_spends
199
+ if (
200
+ validate_input(st.session_state["total_spends_change_abs"])
201
+ and st.session_state["allow_spends_update"]
202
+ ):
203
  modified_spends = extract_number_for_string(
204
  st.session_state["total_spends_change_abs"]
205
  )
 
 
 
206
  st.session_state["total_spends_change"] = (
207
  (modified_spends / actual_spends) - 1
208
  ) * 100
209
+ st.session_state["total_spends_change_abs_slider"] = st.session_state[
210
+ "total_spends_change_abs"
211
+ ]
212
 
213
  update_all_spends()
214
 
215
 
216
+ def update_spends():
217
+ st.session_state["total_spends_change_abs"] = numerize(
218
+ (1 + st.session_state["total_spends_change"] / 100)
219
+ * _scenario.actual_total_spends,
220
+ 1,
221
+ )
222
+ st.session_state["total_spends_change_abs_slider"] = numerize(
223
+ (1 + st.session_state["total_spends_change"] / 100)
224
+ * _scenario.actual_total_spends,
225
+ 1,
226
+ )
227
+
228
+ update_all_spends()
229
+
230
+
231
  def update_all_spends():
232
  """
233
  Updates spends for all the channels with the given overall spends change
234
  """
235
  percent_change = st.session_state["total_spends_change"]
236
+
 
 
237
  for channel_name in st.session_state["channels_list"]:
238
  channel = st.session_state["scenario"].channels[channel_name]
239
  current_spends = channel.actual_total_spends
 
285
  """
286
 
287
  if validate_input(st.session_state[channel_name]):
288
+ modified_spends = extract_number_for_string(st.session_state[channel_name])
 
 
289
  prev_spends = (
290
+ st.session_state["scenario"].channels[channel_name].actual_total_spends
291
+ * st.session_state["scenario"].channels[channel_name].conversion_rate
 
 
 
 
292
  )
293
  st.session_state[f"{channel_name}_change"] = round(
294
  100 * (modified_spends - prev_spends) / prev_spends, 2
 
296
  st.session_state["scenario"].update(
297
  channel_name,
298
  modified_spends
299
+ / st.session_state["scenario"].channels[channel_name].conversion_rate,
 
 
300
  )
301
  # st.session_state['scenario'].update(channel_name, modified_spends)
302
  # else:
 
327
  st.session_state[f"{channel_name}_selected"] = st.session_state[
328
  "optimze_all_channels"
329
  ]
330
+ st.session_state["optimization_channels"][channel_name] = st.session_state[
331
+ "optimze_all_channels"
332
+ ]
333
 
334
 
335
  def update_penalty():
336
  """
337
  Updates the penalty flag for sales calculation
338
  """
339
+ st.session_state["scenario"].update_penalty(st.session_state["apply_penalty"])
 
 
340
 
341
 
342
+ def reset_scenario(panel_selected, file_selected, updated_rcs):
343
  # #print(st.session_state['default_scenario_dict'])
344
  # st.session_state['scenario'] = class_from_dict(st.session_state['default_scenario_dict'])
345
  # for channel in st.session_state['scenario'].channels.values():
346
  # st.session_state[channel.name] = float(channel.actual_total_spends * channel.conversion_rate)
347
+ # initialize_data()
348
+
349
+ if panel_selected == "Aggregated":
350
+ initialize_data(
351
+ panel=panel_selected,
352
+ target_file=file_selected,
353
+ updated_rcs=updated_rcs,
354
+ metrics=metrics_selected,
355
+ )
356
+ panel = None
357
+ else:
358
+ initialize_data(
359
+ panel=panel_selected,
360
+ target_file=file_selected,
361
+ updated_rcs=updated_rcs,
362
+ metrics=metrics_selected,
363
+ )
364
+
365
  for channel_name in st.session_state["channels_list"]:
366
  st.session_state[f"{channel_name}_selected"] = False
367
  st.session_state[f"{channel_name}_change"] = 0
368
  st.session_state["optimze_all_channels"] = False
369
 
370
+ st.session_state["total_sales_change"] = 0
371
+
372
+ update_spends()
373
+ update_sales()
374
+
375
+ reset_inputs()
376
+
377
+ # st.rerun()
378
+
379
 
380
  def format_number(num):
381
  if num >= 1_000_000:
 
407
  hovertemplate="%{x:.2s}",
408
  )
409
 
410
+ fig.update_layout(xaxis_title=x, yaxis_title="Channel Name", showlegend=False)
 
 
411
  return fig
412
 
413
 
 
442
  relative_position = (current_channel_spends - start_value) / (
443
  left_value - start_value
444
  )
445
+ alpha = 0.8 - (0.6 * relative_position) # Alpha decreases from start to end
 
 
446
 
447
  elif left_value < current_channel_spends <= right_value:
448
  color = "green"
449
  relative_position = (current_channel_spends - left_value) / (
450
  right_value - left_value
451
  )
452
+ alpha = 0.8 - (0.6 * relative_position) # Alpha decreases from start to end
 
 
453
 
454
  elif right_value < current_channel_spends <= end_value:
455
  color = "red"
456
  relative_position = (current_channel_spends - right_value) / (
457
  end_value - right_value
458
  )
459
+ alpha = 0.2 + (0.6 * relative_position) # Alpha increases from start to end
 
 
460
 
461
  else:
462
  # Default case, if the spends are outside the defined ranges
 
526
 
527
  for index in range(len(x_plot)):
528
  marginal_roi.append(
529
+ a * y[index] * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
 
 
530
  )
531
 
532
  x = (
 
558
  st.session_state["scenario"].channels[col].modified_total_spends
559
  * st.session_state["scenario"].channels[col].conversion_rate
560
  )
561
+ y_optimal = st.session_state["scenario"].channels[col].modified_total_sales
 
 
562
 
563
  # if col == "Paid_social_others":
564
  # debug_temp(x_optimal * x_actual / x_actual.sum(), power, K, b, a, x0)
 
666
  fig.update_layout(
667
  # height=1000,
668
  # width=1000,
669
+ title_text=f"Response Curves (X: Spends Vs Y: {target})",
670
  showlegend=False,
671
  shapes=shapes,
672
  )
 
808
  st.session_state["authenticator"] = authenticator
809
  name, authentication_status, username = authenticator.login("Login", "main")
810
  auth_status = st.session_state.get("authentication_status")
811
+
812
+ import os
813
+ import glob
814
+
815
+
816
+ def get_excel_names(directory):
817
+ # Create a list to hold the final parts of the filenames
818
+ last_portions = []
819
+
820
+ # Patterns to match Excel files (.xlsx and .xls) that contain @#
821
+ patterns = [
822
+ os.path.join(directory, "*@#*.xlsx"),
823
+ os.path.join(directory, "*@#*.xls"),
824
+ ]
825
+
826
+ # Process each pattern
827
+ for pattern in patterns:
828
+ files = glob.glob(pattern)
829
+
830
+ # Extracting the last portion after @# for each file
831
+ for file in files:
832
+ base_name = os.path.basename(file)
833
+ last_portion = base_name.split("@#")[-1]
834
+ last_portion = last_portion.replace(".xlsx", "").replace(
835
+ ".xls", ""
836
+ ) # Removing extensions
837
+ last_portions.append(last_portion)
838
+
839
+ return last_portions
840
+
841
+
842
+ def name_formating(channel_name):
843
+ # Replace underscores with spaces
844
+ name_mod = channel_name.replace("_", " ")
845
+
846
+ # Capitalize the first letter of each word
847
+ name_mod = name_mod.title()
848
+
849
+ return name_mod
850
+
851
+
852
+ @st.cache_resource(show_spinner=False)
853
+ def panel_fetch(file_selected):
854
+ raw_data_mmm_df = pd.read_excel(file_selected, sheet_name="RAW DATA MMM")
855
+
856
+ if "Panel" in raw_data_mmm_df.columns:
857
+ panel = list(set(raw_data_mmm_df["Panel"]))
858
+ else:
859
+ raw_data_mmm_df = None
860
+ panel = None
861
+
862
+ return panel
863
+
864
+
865
+ def reset_inputs():
866
+ if "total_spends_change_abs" in st.session_state:
867
+ del st.session_state.total_spends_change_abs
868
+ if "total_spends_change" in st.session_state:
869
+ del st.session_state.total_spends_change
870
+ if "total_spends_change_abs_slider" in st.session_state:
871
+ del st.session_state.total_spends_change_abs_slider
872
+
873
+ if "total_sales_change_abs" in st.session_state:
874
+ del st.session_state.total_sales_change_abs
875
+ if "total_sales_change" in st.session_state:
876
+ del st.session_state.total_sales_change
877
+ if "total_sales_change_abs_slider" in st.session_state:
878
+ del st.session_state.total_sales_change_abs_slider
879
+
880
+ st.session_state["initialized"] = False
881
+
882
+
883
  if auth_status == True:
884
  authenticator.logout("Logout", "main")
885
+
886
+ st.header("Simulation")
887
+ col1, col2 = st.columns([1, 1])
888
+
889
+ # Response Metrics
890
+ directory = "metrics_level_data"
891
+ metrics_list = get_excel_names(directory)
892
+ metrics_selected = col1.selectbox(
893
+ "Response Metrics",
894
+ metrics_list,
895
+ format_func=name_formating,
896
+ index=0,
897
+ on_change=reset_inputs,
898
+ )
899
+
900
+ # Target
901
+ target = name_formating(metrics_selected)
902
+
903
+ file_selected = (
904
+ f".\metrics_level_data\Overview_data_test_panel@#{metrics_selected}.xlsx"
905
+ )
906
+
907
+ # Panel List
908
+ panel_list = panel_fetch(file_selected)
909
+
910
+ # Panel Selected
911
+ panel_selected = col2.selectbox(
912
+ "Panel",
913
+ ["Aggregated"] + panel_list,
914
+ index=0,
915
+ on_change=reset_inputs,
916
+ )
917
+
918
+ if "update_rcs" in st.session_state:
919
+ updated_rcs = st.session_state["update_rcs"]
920
+ else:
921
+ updated_rcs = None
922
+
923
+ if "first_time" not in st.session_state:
924
+ st.session_state["first_time"] = True
925
+
926
+ # Check if state is initiaized
927
  is_state_initiaized = st.session_state.get("initialized", False)
928
+ if not is_state_initiaized or st.session_state["first_time"]:
929
+ # initialize_data()
930
+ if panel_selected == "Aggregated":
931
+ initialize_data(
932
+ panel=panel_selected,
933
+ target_file=file_selected,
934
+ updated_rcs=updated_rcs,
935
+ metrics=metrics_selected,
936
+ )
937
+ panel = None
938
+ else:
939
+ initialize_data(
940
+ panel=panel_selected,
941
+ target_file=file_selected,
942
+ updated_rcs=updated_rcs,
943
+ metrics=metrics_selected,
944
+ )
945
+ st.session_state["initialized"] = True
946
+ st.session_state["first_time"] = False
947
 
948
+ # Channels List
949
  channels_list = st.session_state["channels_list"]
950
 
951
  # ======================================================== #
 
953
  # ======================================================== #
954
 
955
  # print(list(st.session_state.keys()))
 
 
956
  main_header = st.columns((2, 2))
957
  sub_header = st.columns((1, 1, 1, 1))
958
  _scenario = st.session_state["scenario"]
959
 
960
+ if "total_spends_change" not in st.session_state:
961
+ st.session_state.total_spends_change = 0
962
+
963
+ if "total_sales_change" not in st.session_state:
964
+ st.session_state.total_sales_change = 0
965
+
966
  if "total_spends_change_abs" not in st.session_state:
967
  st.session_state["total_spends_change_abs"] = numerize(
968
  _scenario.actual_total_spends, 1
 
973
  _scenario.actual_total_sales, 1
974
  )
975
 
976
+ if "total_spends_change_abs_slider" not in st.session_state:
977
+ st.session_state.total_spends_change_abs_slider = numerize(
978
+ _scenario.actual_total_spends, 1
979
+ )
980
+
981
+ if "total_sales_change_abs_slider" not in st.session_state:
982
+ st.session_state.total_sales_change_abs_slider = numerize(
983
+ _scenario.actual_total_sales, 1
984
+ )
985
+
986
  with main_header[0]:
987
  st.subheader("Actual")
988
 
 
990
  st.subheader("Simulated")
991
 
992
  with sub_header[0]:
993
+ st.metric(label="Spends", value=format_numbers(_scenario.actual_total_spends))
 
 
994
 
995
  with sub_header[1]:
996
  st.metric(
 
1016
  delta=numerize(_scenario.delta_sales, 1),
1017
  )
1018
 
1019
+ with st.expander("Channel Spends Simulator", expanded=True):
1020
  _columns1 = st.columns((2, 2, 1, 1))
1021
  with _columns1[0]:
 
1022
  optimization_selection = st.selectbox(
1023
+ "Optimize", options=["Media Spends", target], key="optimization_key"
1024
  )
1025
+
1026
  with _columns1[1]:
1027
  st.markdown("#")
1028
+ # if st.checkbox(
1029
+ # label="Optimize all Channels",
1030
+ # key="optimze_all_channels",
1031
+ # value=False,
1032
+ # # on_change=select_all_channels_for_optimization,
1033
+ # ):
1034
+ # select_all_channels_for_optimization()
1035
+
1036
  st.checkbox(
1037
  label="Optimize all Channels",
1038
+ key="optimze_all_channels",
1039
  value=False,
1040
  on_change=select_all_channels_for_optimization,
1041
  )
1042
 
1043
  with _columns1[2]:
1044
  st.markdown("#")
1045
+ # st.button(
1046
+ # "Optimize",
1047
+ # on_click=optimize,
1048
+ # args=(st.session_state["optimization_key"]),
1049
+ # use_container_width=True,
1050
+ # )
1051
+
1052
+ optimize_placeholder = st.empty()
1053
 
1054
  with _columns1[3]:
1055
  st.markdown("#")
1056
+ st.button(
1057
+ "Reset",
1058
+ on_click=reset_scenario,
1059
+ args=(panel_selected, file_selected, updated_rcs),
1060
+ use_container_width=True,
1061
+ )
1062
 
1063
  _columns2 = st.columns((2, 2, 2))
1064
  if st.session_state["optimization_key"] == "Media Spends":
 
1069
  # label_visibility="collapsed",
1070
  on_change=update_all_spends_abs,
1071
  )
 
1072
 
1073
+ with _columns2[1]:
1074
  st.number_input(
1075
+ "Percent Change",
1076
+ key="total_spends_change",
1077
+ min_value=-50,
1078
+ max_value=50,
1079
  step=1,
1080
+ on_change=update_spends,
1081
+ )
1082
+
1083
+ with _columns2[2]:
1084
+ min_value = round(_scenario.actual_total_spends * 0.5)
1085
+ max_value = round(_scenario.actual_total_spends * 1.5)
1086
+ st.session_state["total_spends_change_abs_slider_options"] = [
1087
+ numerize(value, 1)
1088
+ for value in range(min_value, max_value + 1, int(1e4))
1089
+ ]
1090
+
1091
+ st.select_slider(
1092
+ "Absolute Slider",
1093
+ options=st.session_state["total_spends_change_abs_slider_options"],
1094
+ key="total_spends_change_abs_slider",
1095
+ on_change=update_all_spends_abs_slider,
1096
  )
 
 
1097
 
1098
+ elif st.session_state["optimization_key"] == target:
1099
+ with _columns2[0]:
1100
  sales_input = st.text_input(
1101
  "Absolute",
1102
  key="total_sales_change_abs",
1103
  on_change=update_sales_abs,
1104
  )
1105
+
1106
  with _columns2[1]:
1107
  st.number_input(
1108
+ "Percent Change",
1109
+ key="total_sales_change",
1110
+ min_value=-50,
1111
+ max_value=50,
1112
  step=1,
1113
  on_change=update_sales,
1114
  )
1115
+ with _columns2[2]:
1116
+ min_value = round(_scenario.actual_total_sales * 0.5)
1117
+ max_value = round(_scenario.actual_total_sales * 1.5)
1118
+ st.session_state["total_sales_change_abs_slider_options"] = [
1119
+ numerize(value, 1)
1120
+ for value in range(min_value, max_value + 1, int(1e5))
1121
+ ]
1122
+
1123
+ st.select_slider(
1124
+ "Absolute Slider",
1125
+ options=st.session_state["total_sales_change_abs_slider_options"],
1126
+ key="total_sales_change_abs_slider",
1127
+ on_change=update_sales_abs_slider,
1128
+ )
1129
 
1130
+ if (
1131
+ not st.session_state["allow_sales_update"]
1132
+ and optimization_selection == target
1133
+ ):
1134
+ st.warning("Invalid Input")
1135
+
1136
+ if (
1137
+ not st.session_state["allow_spends_update"]
1138
+ and optimization_selection == "Media Spends"
1139
+ ):
1140
+ st.warning("Invalid Input")
1141
+
1142
+ status_placeholder = st.empty()
1143
+
1144
+ # if optimize_placeholder.button("Optimize", use_container_width=True):
1145
+ # optimize(st.session_state["optimization_key"], status_placeholder)
1146
+ # st.rerun()
1147
+
1148
+ optimize_placeholder.button(
1149
+ "Optimize",
1150
+ on_click=optimize,
1151
+ args=(st.session_state["optimization_key"], status_placeholder),
1152
+ use_container_width=True,
1153
  )
1154
+
1155
+ st.markdown("""<hr class="spends-heading-seperator">""", unsafe_allow_html=True)
1156
  _columns = st.columns((2.5, 2, 1.5, 1.5, 1))
1157
  with _columns[0]:
1158
  generate_spending_header("Channel")
 
1165
  with _columns[4]:
1166
  generate_spending_header("Optimize")
1167
 
1168
+ st.markdown("""<hr class="spends-heading-seperator">""", unsafe_allow_html=True)
 
 
1169
 
1170
  if "acutual_predicted" not in st.session_state:
1171
  st.session_state["acutual_predicted"] = {
 
1175
  "Delta": [],
1176
  }
1177
  for i, channel_name in enumerate(channels_list):
1178
+ _channel_class = st.session_state["scenario"].channels[channel_name]
 
 
1179
  _columns = st.columns((2.5, 1.5, 1.5, 1.5, 1))
1180
  with _columns[0]:
1181
  st.write(channel_name_formating(channel_name))
 
1184
  with _columns[1]:
1185
  channel_bounds = _channel_class.bounds
1186
  channel_spends = float(_channel_class.actual_total_spends)
1187
+ min_value = float((1 + channel_bounds[0] / 100) * channel_spends)
1188
+ max_value = float((1 + channel_bounds[1] / 100) * channel_spends)
 
 
 
 
1189
  ##print(st.session_state[channel_name])
1190
  spend_input = st.text_input(
1191
  channel_name,
 
1196
  if not validate_input(spend_input):
1197
  st.error("Invalid input")
1198
 
1199
+ channel_name_current = f"{channel_name}_change"
1200
+
1201
  st.number_input(
1202
+ "Percent Change",
1203
+ key=channel_name_current,
1204
  step=1,
1205
  on_change=partial(update_data_by_percent, channel_name),
1206
  )
 
1212
  * _channel_class.conversion_rate
1213
  )
1214
  actual_channel_spends = float(
1215
+ _channel_class.actual_total_spends * _channel_class.conversion_rate
 
1216
  )
1217
  spends_delta = float(
1218
+ _channel_class.delta_spends * _channel_class.conversion_rate
 
1219
  )
1220
  st.session_state["acutual_predicted"]["Channel_name"].append(
1221
  channel_name
 
1223
  st.session_state["acutual_predicted"]["Actual_spend"].append(
1224
  actual_channel_spends
1225
  )
1226
+ st.session_state["acutual_predicted"]["Optimized_spend"].append(
1227
+ current_channel_spends
 
 
 
1228
  )
1229
+ st.session_state["acutual_predicted"]["Delta"].append(spends_delta)
1230
  ## REMOVE
1231
  st.metric(
1232
  "Spends",
 
1237
 
1238
  with _columns[3]:
1239
  # sales
1240
+ current_channel_sales = float(_channel_class.modified_total_sales)
 
 
1241
  actual_channel_sales = float(_channel_class.actual_total_sales)
1242
  sales_delta = float(_channel_class.delta_sales)
1243
  st.metric(
1244
  target,
1245
+ format_numbers(current_channel_sales, include_indicator=False),
 
 
1246
  delta=numerize(sales_delta, 1),
1247
  label_visibility="collapsed",
1248
  )
1249
 
1250
  with _columns[4]:
1251
 
1252
+ # if st.checkbox(
1253
+ # label="select for optimization",
1254
+ # key=f"{channel_name}_selected",
1255
+ # value=False,
1256
+ # # on_change=partial(select_channel_for_optimization, channel_name),
1257
+ # label_visibility="collapsed",
1258
+ # ):
1259
+ # select_channel_for_optimization(channel_name)
1260
+
1261
  st.checkbox(
1262
  label="select for optimization",
1263
  key=f"{channel_name}_selected",
1264
  value=False,
1265
+ on_change=partial(select_channel_for_optimization, channel_name),
 
 
1266
  label_visibility="collapsed",
1267
  )
1268
 
 
1274
  # Bins
1275
  col = channels_list[i]
1276
  x_actual = st.session_state["scenario"].channels[col].actual_spends
1277
+ x_modified = st.session_state["scenario"].channels[col].modified_spends
 
 
1278
 
1279
  x_total = x_modified.sum()
1280
  power = np.ceil(np.log(x_actual.max()) / np.log(10)) - 3
1281
 
1282
+ updated_rcs_key = f"{metrics_selected}#@{panel_selected}#@{channel_name}"
1283
+
1284
+ if updated_rcs and updated_rcs_key in list(updated_rcs.keys()):
1285
+ K = updated_rcs[updated_rcs_key]["K"]
1286
+ b = updated_rcs[updated_rcs_key]["b"]
1287
+ a = updated_rcs[updated_rcs_key]["a"]
1288
+ x0 = updated_rcs[updated_rcs_key]["x0"]
1289
+ else:
1290
+ K = st.session_state["rcs"][col]["K"]
1291
+ b = st.session_state["rcs"][col]["b"]
1292
+ a = st.session_state["rcs"][col]["a"]
1293
+ x0 = st.session_state["rcs"][col]["x0"]
1294
 
1295
  x_plot = np.linspace(0, 5 * x_actual.sum(), 200)
1296
 
1297
+ # Append current_channel_spends to the end of x_plot
1298
+ x_plot = np.append(x_plot, current_channel_spends)
1299
+
1300
  x, y, marginal_roi = [], [], []
1301
  for x_p in x_plot:
1302
  x.append(x_p * x_actual / x_actual.sum())
 
1306
 
1307
  for index in range(len(x_plot)):
1308
  marginal_roi.append(
1309
+ a * y[index] * (1 - y[index] / np.maximum(K, np.finfo(float).eps))
 
 
1310
  )
1311
 
1312
  x = (
 
1321
 
1322
  roi = y / np.maximum(x, np.finfo(float).eps)
1323
 
1324
+ roi_current, marginal_roi_current = roi[-1], marginal_roi[-1]
1325
+ x, y, roi, marginal_roi = (
1326
+ x[:-1],
1327
+ y[:-1],
1328
+ roi[:-1],
1329
+ marginal_roi[:-1],
1330
+ ) # Drop data for current spends
1331
+
1332
+ start_value, end_value, left_value, right_value = find_segment_value(
1333
+ x,
1334
+ roi,
1335
+ marginal_roi,
1336
  )
1337
 
1338
  rgba = calculate_rgba(
 
1343
  current_channel_spends,
1344
  )
1345
 
 
 
 
1346
  with bin_placeholder:
1347
  st.markdown(
1348
  f"""
 
1360
  unsafe_allow_html=True,
1361
  )
1362
 
1363
+ with st.expander("See Response Curves", expanded=True):
1364
  fig = plot_response_curves()
1365
  st.plotly_chart(fig, use_container_width=True)
1366
 
 
1380
  )
1381
 
1382
  summary_df = pd.DataFrame(st.session_state["acutual_predicted"])
1383
+ summary_df.drop_duplicates(subset="Channel_name", keep="last", inplace=True)
 
 
1384
 
1385
  summary_df_sorted = summary_df.sort_values(by="Delta", ascending=False)
1386
  summary_df_sorted["Delta_percent"] = np.round(
1387
+ ((summary_df_sorted["Optimized_spend"] / summary_df_sorted["Actual_spend"]) - 1)
 
 
 
 
 
 
1388
  * 100,
1389
  2,
1390
  )
 
1412
  authenticator.forgot_password("Forgot password")
1413
  )
1414
  if username_forgot_pw:
1415
+ st.session_state["config"]["credentials"]["usernames"][username_forgot_pw][
1416
+ "password"
1417
+ ] = stauth.Hasher([random_password]).generate()[0]
1418
  send_email(email_forgot_password, random_password)
1419
  st.success("New password sent securely")
1420
  # Random password to be transferred to user securely
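The spends and sales callbacks in this page (update_all_spends_abs, update_spends, update_sales_abs, update_sales, and their slider variants) keep the absolute text input, the percent number input, and the select-slider consistent through the same two conversions. A minimal sketch of that round-trip follows; the total is an illustrative stand-in for _scenario.actual_total_spends, and plain floats replace the numerize / extract_number_for_string formatting helpers used by the app.

actual_total_spends = 1_250_000.0  # illustrative stand-in, not a value from the app

def percent_from_absolute(modified_abs: float) -> float:
    # update_all_spends_abs / update_sales_abs: absolute value -> percent change
    return ((modified_abs / actual_total_spends) - 1) * 100

def absolute_from_percent(percent_change: float) -> float:
    # update_spends / update_sales: percent change -> absolute value
    return (1 + percent_change / 100) * actual_total_spends

# Round-tripping a +12% change should land back on +12%
assert abs(percent_from_absolute(absolute_from_percent(12.0)) - 12.0) < 1e-9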
requirements.txt CHANGED
@@ -1,102 +1,94 @@
1
- altair==5.2.0
2
- annotated-types==0.6.0
3
- attrs==23.2.0
4
- bcrypt==4.1.2
5
- blinker==1.7.0
6
- cachetools==5.3.2
7
- certifi==2024.2.2
8
- charset-normalizer==3.3.2
9
- click==8.1.7
10
- colorama==0.4.6
11
- contourpy==1.2.0
12
- cycler==0.12.1
13
- dacite==1.8.1
14
- et-xmlfile==1.1.0
15
- extra-streamlit-components==0.1.56
16
- fonttools==4.49.0
17
- gitdb==4.0.11
18
- GitPython==3.1.42
19
- htmlmin==0.1.12
20
- idna==3.6
21
- ImageHash==4.3.1
22
- importlib-metadata==7.0.1
23
- importlib-resources==6.1.1
24
- Jinja2==3.1.3
25
- joblib==1.3.2
26
- jsonschema==4.21.1
27
- jsonschema-specifications==2023.12.1
28
- kiwisolver==1.4.5
29
- llvmlite==0.41.1
30
- markdown-it-py==3.0.0
31
- MarkupSafe==2.1.5
32
- matplotlib==3.7.0
33
- matplotlib-inline==0.1.6
34
- mdurl==0.1.2
35
- multimethod==1.11.2
36
- networkx==3.2.1
37
- numba==0.58.1
38
- numerize==0.12
39
- numpy==1.23.5
40
- openpyxl==3.0.10
41
- packaging==23.2
42
- pandas==1.5.2
43
- pandas-profiling==3.6.6
44
- patsy==0.5.6
45
- phik==0.12.4
46
- pillow==10.2.0
47
- pip==24.0
48
- plotly==5.11.0
49
- plotly-express==0.4.1
50
- protobuf==4.25.3
51
- pyarrow==15.0.0
52
- pydantic==2.6.3
53
- pydantic-core==2.16.3
54
- pydantic-settings==2.2.1
55
- pydeck==0.8.1b0
56
- Pygments==2.17.2
57
- PyJWT==2.8.0
58
- pyparsing==3.1.1
59
- python-dateutil==2.8.2
60
- python-decouple==3.8
61
- python-dotenv==1.0.1
62
- pytz==2024.1
63
- PyWavelets==1.5.0
64
- PyYAML==6.0.1
65
- referencing==0.33.0
66
- requests==2.31.0
67
- rich==13.7.0
68
- rpds-py==0.18.0
69
- scikit-learn==1.1.3
70
- scipy==1.11.4
71
- seaborn==0.12.2
72
- setuptools==69.1.0
73
- six==1.16.0
74
- smmap==5.0.1
75
- statsmodels==0.14.0
76
- streamlit==1.31.0
77
- streamlit-aggrid==0.3.4.post3
78
- streamlit-authenticator==0.2.1
79
- streamlit-chat==0.1.1
80
- streamlit-pandas-profiling==0.1.3
81
- sweetviz==2.2.1
82
- tangled-up-in-unicode==0.2.0
83
- tenacity==8.2.3
84
- threadpoolctl==3.3.0
85
- toml==0.10.2
86
- toolz==0.12.1
87
- tornado==6.4
88
- tqdm==4.66.2
89
- traitlets==5.14.1
90
- typeguard==4.1.5
91
- typing-extensions==4.9.0
92
- tzdata==2024.1
93
- tzlocal==5.2
94
- urllib3==2.2.1
95
- validators==0.22.0
96
- visions==0.7.5
97
- watchdog==4.0.0
98
- wheel==0.42.0
99
- wordcloud==1.9.3
100
- ydata-profiling==4.6.5
101
- zipp==3.17.0
102
-
 
1
+ altair == 4.2.0
2
+ attrs == 23.1.0
3
+ bcrypt == 4.0.1
4
+ blinker == 1.6.2
5
+ cachetools == 5.3.1
6
+ certifi == 2023.7.22
7
+ charset-normalizer == 3.2.0
8
+ click == 8.1.7
9
+ colorama == 0.4.6
10
+ contourpy == 1.1.1
11
+ cycler == 0.11.0
12
+ dacite == 1.8.1
13
+ entrypoints == 0.4
14
+ et-xmlfile == 1.1.0
15
+ extra-streamlit-components == 0.1.56
16
+ fonttools == 4.42.1
17
+ gitdb == 4.0.10
18
+ GitPython == 3.1.35
19
+ htmlmin == 0.1.12
20
+ idna == 3.4
21
+ ImageHash == 4.3.1
22
+ importlib-metadata == 6.8.0
23
+ importlib-resources == 6.1.0
24
+ Jinja2 == 3.1.2
25
+ joblib == 1.3.2
26
+ jsonschema == 4.19.0
27
+ jsonschema-specifications== 2023.7.1
28
+ kaleido == 0.2.1
29
+ kiwisolver == 1.4.5
30
+ markdown-it-py == 3.0.0
31
+ MarkupSafe == 2.1.3
32
+ matplotlib == 3.7.0
33
+ mdurl == 0.1.2
34
+ networkx == 3.1
35
+ numerize == 0.12
36
+ numpy == 1.23.5
37
+ openpyxl>=3.1.0
38
+ packaging == 23.1
39
+ pandas == 1.5.2
40
+ pandas-profiling == 3.6.6
41
+ patsy == 0.5.3
42
+ phik == 0.12.3
43
+ Pillow == 10.0.0
44
+ pip == 23.2.1
45
+ plotly == 5.11.0
46
+ protobuf == 3.20.3
47
+ pyarrow == 13.0.0
48
+ pydantic == 1.10.13
49
+ pydeck == 0.8.1b0
50
+ Pygments == 2.16.1
51
+ PyJWT == 2.8.0
52
+ Pympler == 1.0.1
53
+ pyparsing == 3.1.1
54
+ python-dateutil == 2.8.2
55
+ python-decouple == 3.8
56
+ pytz == 2023.3.post1
57
+ PyWavelets == 1.4.1
58
+ PyYAML == 6.0.1
59
+ referencing == 0.30.2
60
+ requests == 2.31.0
61
+ rich == 13.5.2
62
+ rpds-py == 0.10.2
63
+ scikit-learn == 1.1.3
64
+ scipy == 1.9.3
65
+ seaborn == 0.12.2
66
+ semver == 3.0.1
67
+ setuptools == 68.1.2
68
+ six == 1.16.0
69
+ smmap == 5.0.0
70
+ statsmodels == 0.14.0
71
+ streamlit == 1.16.0
72
+ streamlit-aggrid == 0.3.4.post3
73
+ streamlit-authenticator == 0.2.1
74
+ streamlit-pandas-profiling== 0.1.3
75
+ sweetviz == 2.2.1
76
+ tangled-up-in-unicode == 0.2.0
77
+ tenacity == 8.2.3
78
+ threadpoolctl == 3.2.0
79
+ toml == 0.10.2
80
+ toolz == 0.12.0
81
+ tornado == 6.3.3
82
+ tqdm == 4.66.1
83
+ typeguard == 2.13.3
84
+ typing_extensions == 4.7.1
85
+ tzdata == 2023.3
86
+ tzlocal == 5.0.1
87
+ urllib3 == 2.0.4
88
+ validators == 0.22.0
89
+ visions == 0.7.5
90
+ watchdog == 3.0.0
91
+ wheel == 0.41.2
92
+ wordcloud == 1.9.2
93
+ ydata-profiling == 4.5.1
94
+ zipp == 3.16.2
 
 
 
 
 
 
 
 
summary_df.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0250c8a092d14c32845f27f6cddb2ac8131f8c280d38489294da847adf61c4e7
3
  size 1482
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f2aa1b3c4f759d4179abf2dbed90751ec0849b3750a1019827173d2152954ac
3
  size 1482
tuned_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9871c17d7d10846b84c31343a1b9fc3ad87c1a67fa8bf8b10b2199032a1581be
3
+ size 4287842
upf_data_converted_old.csv ADDED
The diff for this file is too large to render. See raw diff
 
upf_data_converted_old.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92716069afa2c16a8afb6494da6d5f93878558de0215b1b9334ffeb997fdc6b6
3
+ size 1561111
upf_data_converted_randomized_resp_metrics.csv ADDED
The diff for this file is too large to render. See raw diff
 
upf_data_converted_randomized_resp_metrics.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf24972737d4c10d274ce6e3165551442e662992623754dbef11155f4b177531
3
+ size 1893805
utilities.py CHANGED
@@ -12,7 +12,6 @@ import io
12
  import plotly
13
  from pathlib import Path
14
  import pickle
15
- import streamlit_authenticator as stauth
16
  import yaml
17
  from yaml import SafeLoader
18
  from streamlit.components.v1 import html
@@ -24,27 +23,59 @@ import os
24
  import base64
25
 
26
 
 
 
 
27
 
28
 
29
- color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
30
 
 
31
 
32
- CURRENCY_INDICATOR = '$'
33
 
34
  def load_authenticator():
35
- with open('config.yaml') as file:
36
  config = yaml.load(file, Loader=SafeLoader)
37
- st.session_state['config'] = config
38
  authenticator = stauth.Authenticate(
39
- config['credentials'],
40
- config['cookie']['name'],
41
- config['cookie']['key'],
42
- config['cookie']['expiry_days'],
43
- config['preauthorized']
44
  )
45
- st.session_state['authenticator'] = authenticator
46
  return authenticator
47
 
 
 
 
 
 
48
  def nav_page(page_name, timeout_secs=3):
49
  nav_script = """
50
  <script type="text/javascript">
@@ -67,7 +98,10 @@ def nav_page(page_name, timeout_secs=3):
67
  attempt_nav_page("%s", new Date(), %d);
68
  });
69
  </script>
70
- """ % (page_name, timeout_secs)
 
 
 
71
  html(nav_script)
72
 
73
 
@@ -92,23 +126,18 @@ data_url = base64.b64encode(contents).decode("utf-8")
92
 
93
  file_.close()
94
 
95
-
96
 
97
- DATA_PATH = './data'
98
 
99
- IMAGES_PATH = './data/images_224_224'
100
 
101
-
102
 
103
  def load_local_css(file_name):
104
 
105
  with open(file_name) as f:
106
 
107
- st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
108
 
109
-
110
-
111
-
112
 
113
  # def set_header():
114
 
@@ -129,24 +158,24 @@ data_url1 = base64.b64encode(contents1).decode("utf-8")
129
 
130
  file_1.close()
131
 
132
-
133
-
134
- DATA_PATH1 = './data'
135
-
136
- IMAGES_PATH1 = './data/images_224_224'
137
-
138
 
 
139
 
 
140
 
141
 
142
  def set_header():
143
- return st.markdown(f"""<div class='main-header'>
 
144
  <!-- <h1></h1> -->
145
  <div >
146
  <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
147
  </div>
148
  <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
149
- </div>""", unsafe_allow_html=True)
 
 
 
150
 
151
  # def set_header():
152
  # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
@@ -157,51 +186,87 @@ def set_header():
157
  # </div>""", unsafe_allow_html=True)
158
 
159
 
160
- def s_curve(x,K,b,a,x0):
161
- return K / (1 + b * np.exp(-a*(x-x0)))
 
 
 
 
 
 
 
 
 
162
 
163
- def initialize_data():
 
 
 
 
 
 
 
 
 
 
 
164
  # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
165
  # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
166
  # "digital_spends":1}
167
- #print('State initialized')
168
- excel = pd.read_excel("Overview_data_test.xlsx",sheet_name=None)
169
- raw_df = excel['RAW DATA MMM']
170
-
171
- spend_df = excel['SPEND INPUT']
172
- contri_df = excel['CONTRIBUTION MMM']
173
- #Revenue_df = excel['Revenue']
174
-
175
- ## remove sesonalities, indices etc ...
176
- exclude_columns = ['Date',
177
- 'Region',
178
- 'Controls_Grammarly_Index_SeasonalAVG',
179
- 'Controls_Quillbot_Index',
180
- 'Daily_Positive_Outliers',
181
- 'External_RemoteClass_Index',
182
- 'Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802',
183
- 'Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206',
184
- 'Intervals ON 20201005-20201019',
185
- 'Promotion_PercentOff',
186
- 'Promotion_TimeBased',
187
- 'Seasonality_Indicator_Chirstmas',
188
- 'Seasonality_Indicator_NewYears_Days',
189
- 'Seasonality_Indicator_Thanksgiving',
190
- 'Trend 20200302 / 20200803',
191
- ]
192
- raw_df['Date']=pd.to_datetime(raw_df['Date'])
193
- contri_df['Date']=pd.to_datetime(contri_df['Date'])
194
- input_df = raw_df.sort_values(by='Date')
195
- output_df = contri_df.sort_values(by='Date')
196
- spend_df['Week'] = pd.to_datetime(spend_df['Week'], format='%Y-%m-%d', errors='coerce')
197
- spend_df.sort_values(by='Week', inplace=True)
 
 
 
 
198
 
199
  # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
200
  # spend_df = spend_df.sort_values(by='Week')
201
-
202
 
203
  channel_list = [col for col in input_df.columns if col not in exclude_columns]
204
-
 
205
  response_curves = {}
206
  mapes = {}
207
  rmses = {}
@@ -215,14 +280,14 @@ def initialize_data():
215
  dates = input_df.Date.values
216
  actual_output_dic = {}
217
  actual_input_dic = {}
218
-
219
  for inp_col in channel_list:
220
- #st.write(inp_col)
221
  spends = input_df[inp_col].values
222
  x = spends.copy()
223
- # upper limit for penalty
224
- upper_limits[inp_col] = 2*x.max()
225
-
226
  # contribution
227
  out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
228
  y = output_df[out_col].values.copy()
@@ -230,96 +295,141 @@ def initialize_data():
230
  actual_input_dic[inp_col] = x.copy()
231
  ##output cols aggregation
232
  output_cols.append(out_col)
233
-
234
  ## scale the input
235
- power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
236
- if power >= 0 :
237
  x = x / 10**power
238
-
239
-
240
- x = x.astype('float64')
241
- y = y.astype('float64')
242
- #print('#printing yyyyyyyyy')
243
- #print(inp_col)
244
- #print(x.max())
245
- #print(y.max())
246
- bounds = ((0, 0, 0, 0), (3*y.max(), 1000, 1, x.max()))
247
-
248
- #bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
249
- params,_ = curve_fit(s_curve,x,y,p0=(2*y.max(),0.01,1e-5,x.max()),
250
- bounds=bounds,
251
- maxfev=int(1e5))
 
 
 
 
252
  mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
253
- rmse = np.sqrt(((y - s_curve(x,*params))**2).mean())
254
- r2_ = r2_score(y, s_curve(x,*params))
 
 
 
 
 
 
 
 
 
 
 
255
 
256
- response_curves[inp_col] = {'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]}
257
  mapes[inp_col] = mape
258
  rmses[inp_col] = rmse
259
  r2[inp_col] = r2_
260
  powers[inp_col] = power
261
-
262
-
263
  ## conversion rates
264
- spend_col = [_col for _col in spend_df.columns if _col.startswith(inp_col.rsplit('_',1)[0])][0]
265
-
266
- #print('#printing spendssss')
267
- #print(spend_col)
268
- conv = (spend_df.set_index('Week')[spend_col] / input_df.set_index('Date')[inp_col].clip(lower=1)).reset_index()
269
- conv.rename(columns={'index':'Week'},inplace=True)
270
- conv['year'] = conv.Week.dt.year
271
- conv_rates[inp_col] = list(conv.drop('Week',axis=1).mean().to_dict().values())[0]
 
 
 
 
 
 
 
 
 
272
  ##print('Before',conv_rates[inp_col])
273
  # conv_rates[inp_col] = uopx_conv_rates[inp_col]
274
  ##print('After',(conv_rates[inp_col]))
275
-
276
-
277
- channel = Channel(name=inp_col,dates=dates,
278
- spends=spends,
279
- # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
280
- conversion_rate = conv_rates[inp_col],
281
- response_curve_type='s-curve',
282
- response_curve_params={'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]},
283
- bounds=np.array([-10,10]))
 
 
 
 
 
 
 
284
  channels[inp_col] = channel
285
  if sales is None:
286
  sales = channel.actual_sales
287
  else:
288
  sales += channel.actual_sales
289
- other_contributions = output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only = True).values
290
- correction = output_df.drop('Date',axis=1).sum(axis=1).values - (sales + other_contributions)
291
- scenario = Scenario(name='default', channels=channels, constant=other_contributions, correction = correction)
292
  ## setting session variables
293
- st.session_state['initialized'] = True
294
- st.session_state['actual_df'] = input_df
295
- st.session_state['raw_df'] = raw_df
296
- st.session_state['contri_df'] = output_df
297
  default_scenario_dict = class_to_dict(scenario)
298
- st.session_state['default_scenario_dict'] = default_scenario_dict
299
- st.session_state['scenario'] = scenario
300
- st.session_state['channels_list'] = channel_list
301
- st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
302
- st.session_state['rcs'] = response_curves
303
- st.session_state['powers'] = powers
304
- st.session_state['actual_contribution_df'] = pd.DataFrame(actual_output_dic)
305
- st.session_state['actual_input_df'] = pd.DataFrame(actual_input_dic)
306
-
 
  for channel in channels.values():
308
- st.session_state[channel.name] = numerize(channel.actual_total_spends * channel.conversion_rate,1)
309
-
310
- st.session_state['xlsx_buffer'] = io.BytesIO()
311
-
312
-
313
- if Path('../saved_scenarios.pkl').exists():
314
- with open('../saved_scenarios.pkl','rb') as f:
315
- st.session_state['saved_scenarios'] = pickle.load(f)
 
316
  else:
317
- st.session_state['saved_scenarios'] = OrderedDict()
318
-
319
- st.session_state['total_spends_change'] = 0
320
- st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
321
- st.session_state['disable_download_button'] = True
322
-
323
  # def initialize_data():
324
  # # fetch data from excel
325
  # output = pd.read_excel('data.xlsx',sheet_name=None)
@@ -335,17 +445,17 @@ def initialize_data():
335
  # channel_list.append(col)
336
  # else:
337
  # pass
338
-
339
  # ## NOTE : Considered only Desktop spends for all calculations
340
  # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
341
  # ## NOTE : Considered one year of data
342
  # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
343
  # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
344
-
345
  # ##load response curves
346
  # with open('./grammarly_response_curves.json','r') as f:
347
  # response_curves = json.load(f)
348
-
349
  # ## create channel dict for scenario creation
350
  # dates = actual_df.Date.values
351
  # channels = {}
@@ -363,15 +473,15 @@ def initialize_data():
363
  # response_curve_type=response_curve_type,
364
  # response_curve_params=response_curve_params,
365
  # bounds=np.array([-30,30]))
366
-
367
  # channels[name] = channel
368
  # else:
369
  # constant = info_dict.get('value',0.) * len(dates)
370
-
371
  # ## create scenario
372
  # scenario = Scenario(name='default', channels=channels, constant=constant)
373
  # default_scenario_dict = class_to_dict(scenario)
374
-
375
 
376
  # ## setting session variables
377
  # st.session_state['initialized'] = True
@@ -385,7 +495,7 @@ def initialize_data():
385
  # for channel in channels.values():
386
  # if channel.name not in st.session_state:
387
  # st.session_state[channel.name] = float(channel.actual_total_spends)
388
-
389
  # if 'xlsx_buffer' not in st.session_state:
390
  # st.session_state['xlsx_buffer'] = io.BytesIO()
391
 
@@ -394,51 +504,121 @@ def initialize_data():
394
  # if Path('../saved_scenarios.pkl').exists():
395
  # with open('../saved_scenarios.pkl','rb') as f:
396
  # st.session_state['saved_scenarios'] = pickle.load(f)
397
-
398
  # else:
399
  # st.session_state['saved_scenarios'] = OrderedDict()
400
 
401
  # if 'total_spends_change' not in st.session_state:
402
  # st.session_state['total_spends_change'] = 0
403
-
404
  # if 'optimization_channels' not in st.session_state:
405
  # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
406
-
407
  # if 'disable_download_button' not in st.session_state:
408
  # st.session_state['disable_download_button'] = True
409
-
410
-
411
  def create_channel_summary(scenario):
412
 
413
  # Provided data
414
  data = {
415
- 'Channel': ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer'],
416
- 'Spends': ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K'],
417
- 'Revenue': ['558.0K', '3.5M', '5.2M', '3.1M', '3.1M', '2.1M', '20.8M', '1.6M', '728.4K', '22.9M', '4.8M']
 
418
  }
419
 
420
  # Create DataFrame
421
  df = pd.DataFrame(data)
422
 
423
  # Convert currency strings to numeric values
424
- df['Spends'] = df['Spends'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
425
- df['Revenue'] = df['Revenue'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
 
426
 
427
  # Calculate ROI
428
- df['ROI'] = ((df['Revenue'] - df['Spends']) / df['Spends'])
429
 
430
  # Format columns
431
  format_currency = lambda x: f"${x:,.1f}"
432
  format_roi = lambda x: f"{x:.1f}"
433
 
434
- df['Spends'] = ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K']
435
- df['Revenue'] = ['$ 536.3K', '$ 3.4M', '$ 5M', '$ 3M', '$ 3M', '$ 2M', '$ 20M', '$ 1.5M', '$ 7.1M', '$ 22M', '$ 4.6M']
436
- df['ROI'] = df['ROI'].apply(format_roi)
437
-
438
  return df
439
 
440
 
441
- #@st.cache(allow_output_mutation=True)
442
  # def create_contribution_pie(scenario):
443
  # #c1f7dc
444
  # colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
@@ -470,23 +650,23 @@ def create_channel_summary(scenario):
470
  # weekly_spends_data = []
471
  # weekly_sales_data = []
472
  # for channel_name in st.session_state['channels_list']:
473
- # weekly_spends_data.append((go.Bar(x=x,
474
  # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
475
- # name=channel_name_formating(channel_name),
476
  # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
477
  # legendgroup=channel_name)))
478
- # weekly_sales_data.append((go.Bar(x=x,
479
  # y=scenario.channels[channel_name].actual_sales,
480
- # name=channel_name_formating(channel_name),
481
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
482
  # legendgroup=channel_name, showlegend=False)))
483
  # for _d in weekly_spends_data:
484
  # weekly_contribution_fig.add_trace(_d, row=1, col=1)
485
  # for _d in weekly_sales_data:
486
  # weekly_contribution_fig.add_trace(_d, row=1, col=2)
487
- # weekly_contribution_fig.add_trace(go.Bar(x=x,
488
  # y=scenario.constant + scenario.correction,
489
- # name='Non Media',
490
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
491
  # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
492
  # weekly_contribution_fig.update_xaxes(showgrid=False)
@@ -524,14 +704,50 @@ def create_channel_summary(scenario):
524
 
525
 
526
  def create_contribution_pie():
527
- color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
528
- total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
529
 
530
- channels_list = ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer', 'Non Media']
531
 
532
  # Assign colors from the limited palette to channels
533
- colors_map = {col: color_palette[i % len(color_palette)] for i, col in enumerate(channels_list)}
534
- colors_map['Non Media'] = color_palette[5] # Assign fixed green color for 'Non Media'
535
 
536
  # Hardcoded values for Spends and Revenue
537
  spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
@@ -542,10 +758,13 @@ def create_contribution_pie():
542
  go.Pie(
543
  labels=[channel_name for channel_name in channels_list],
544
  values=spends_values,
545
- marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
546
- hole=0.3
 
 
547
  ),
548
- row=1, col=1
 
549
  )
550
 
551
  # Add trace for Revenue pie chart
@@ -553,144 +772,196 @@ def create_contribution_pie():
553
  go.Pie(
554
  labels=[channel_name for channel_name in channels_list],
555
  values=revenue_values,
556
- marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
557
- hole=0.3
 
 
558
  ),
559
- row=1, col=2
560
  )
561
-
562
- total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
563
- total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
564
  return total_contribution_fig
565
 
 
566
  def create_contribuion_stacked_plot(scenario):
567
- weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
568
- raw_df = st.session_state['raw_df']
569
- df = raw_df.sort_values(by='Date')
570
  x = df.Date
571
  weekly_spends_data = []
572
  weekly_sales_data = []
573
-
574
- for i, channel_name in enumerate(st.session_state['channels_list']):
575
  color = color_palette[i % len(color_palette)]
576
-
577
- weekly_spends_data.append(go.Bar(
578
- x=x,
579
- y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
580
- name=channel_name_formating(channel_name),
581
- hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
582
- legendgroup=channel_name,
583
- marker_color=color,
584
- ))
585
-
586
- weekly_sales_data.append(go.Bar(
587
- x=x,
588
- y=scenario.channels[channel_name].actual_sales,
589
- name=channel_name_formating(channel_name),
590
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
591
- legendgroup=channel_name,
592
- showlegend=False,
593
- marker_color=color,
594
- ))
595
-
596
  for _d in weekly_spends_data:
597
  weekly_contribution_fig.add_trace(_d, row=1, col=1)
598
  for _d in weekly_sales_data:
599
  weekly_contribution_fig.add_trace(_d, row=1, col=2)
600
-
601
- weekly_contribution_fig.add_trace(go.Bar(
602
- x=x,
603
- y=scenario.constant + scenario.correction,
604
- name='Non Media',
605
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
606
- marker_color=color_palette[-1],
607
- ), row=1, col=2)
608
-
609
- weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
 
610
  weekly_contribution_fig.update_xaxes(showgrid=False)
611
  weekly_contribution_fig.update_yaxes(showgrid=False)
612
  return weekly_contribution_fig
613
 
 
614
  def create_channel_spends_sales_plot(channel):
615
  if channel is not None:
616
  x = channel.dates
617
  _spends = channel.actual_spends * channel.conversion_rate
618
  _sales = channel.actual_sales
619
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
620
- channel_sales_spends_fig.add_trace(go.Bar(
621
- x=x,
622
- y=_sales,
623
- marker_color=color_palette[3], # You can choose a color from the palette
624
- name='Revenue',
625
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
626
- ), secondary_y=False)
627
-
628
- channel_sales_spends_fig.add_trace(go.Scatter(
629
- x=x,
630
- y=_spends,
631
- line=dict(color=color_palette[2]), # You can choose another color from the palette
632
- name='Spends',
633
- hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
634
- ), secondary_y=True)
635
-
636
- channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
637
  channel_sales_spends_fig.update_xaxes(showgrid=False)
638
  channel_sales_spends_fig.update_yaxes(showgrid=False)
639
  else:
640
- raw_df = st.session_state['raw_df']
641
- df = raw_df.sort_values(by='Date')
642
  x = df.Date
643
- scenario = class_from_dict(st.session_state['default_scenario_dict'])
644
  _sales = scenario.constant + scenario.correction
645
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
646
- channel_sales_spends_fig.add_trace(go.Bar(
647
- x=x,
648
- y=_sales,
649
- marker_color=color_palette[0], # You can choose a color from the palette
650
- name='Revenue',
651
- hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
652
- ), secondary_y=False)
653
-
654
- channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
 
655
  channel_sales_spends_fig.update_xaxes(showgrid=False)
656
  channel_sales_spends_fig.update_yaxes(showgrid=False)
657
-
658
  return channel_sales_spends_fig
659
 
660
- def format_numbers(value, n_decimals=1,include_indicator = True):
 
661
  if include_indicator:
662
- return f'{CURRENCY_INDICATOR} {numerize(value,n_decimals)}'
663
  else:
664
- return f'{numerize(value,n_decimals)}'
665
 
666
 
667
- def decimal_formater(num_string,n_decimals=1):
668
- parts = num_string.split('.')
669
  if len(parts) == 1:
670
- return num_string+'.' + '0'*n_decimals
671
  else:
672
  to_be_padded = n_decimals - len(parts[-1])
673
- if to_be_padded > 0 :
674
- return num_string+'0'*to_be_padded
675
  else:
676
  return num_string
677
-
678
-
679
  def channel_name_formating(channel_name):
680
- name_mod = channel_name.replace('_', ' ')
681
- if name_mod.lower().endswith(' imp'):
682
- name_mod = name_mod.replace('Imp','Spend')
683
- elif name_mod.lower().endswith(' clicks'):
684
- name_mod = name_mod.replace('Clicks','Spend')
685
  return name_mod
686
 
687
 
688
- def send_email(email,message):
689
- s = smtplib.SMTP('smtp.gmail.com', 587)
690
  s.starttls()
691
  s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
692
  s.sendmail("geethu4444@gmail.com", email, message)
693
  s.quit()
694
 
 
695
  if __name__ == "__main__":
696
  initialize_data()
 
12
  import plotly
13
  from pathlib import Path
14
  import pickle
 
15
  import yaml
16
  from yaml import SafeLoader
17
  from streamlit.components.v1 import html
 
23
  import base64
24
 
25
 
26
+ color_palette = [
27
+ "#F3F3F0",
28
+ "#5E7D7E",
29
+ "#2FA1FF",
30
+ "#00EDED",
31
+ "#00EAE4",
32
+ "#304550",
33
+ "#EDEBEB",
34
+ "#7FBEFD",
35
+ "#003059",
36
+ "#A2F3F3",
37
+ "#E1D6E2",
38
+ "#B6B6B6",
39
+ ]
40
 
41
 
42
+ CURRENCY_INDICATOR = "$"
43
 
44
+ import streamlit_authenticator as stauth
45
 
 
46
 
47
  def load_authenticator():
48
+ with open("config.yaml") as file:
49
  config = yaml.load(file, Loader=SafeLoader)
50
+ st.session_state["config"] = config
51
  authenticator = stauth.Authenticate(
52
+ credentials=config["credentials"],
53
+ cookie_name=config["cookie"]["name"],
54
+ key=config["cookie"]["key"],
55
+ cookie_expiry_days=config["cookie"]["expiry_days"],
56
+ preauthorized=config["preauthorized"],
57
  )
58
+ st.session_state["authenticator"] = authenticator
59
  return authenticator
60
 
61
+
62
+ # Authentication
63
+ def authentication():
64
+ with open("config.yaml") as file:
65
+ config = yaml.load(file, Loader=SafeLoader)
66
+
67
+ authenticator = stauth.Authenticate(
68
+ config["credentials"],
69
+ config["cookie"]["name"],
70
+ config["cookie"]["key"],
71
+ config["cookie"]["expiry_days"],
72
+ config["preauthorized"],
73
+ )
74
+
75
+ name, authentication_status, username = authenticator.login("Login", "main")
76
+ return authenticator, name, authentication_status, username
77
+
78
+
79
  def nav_page(page_name, timeout_secs=3):
80
  nav_script = """
81
  <script type="text/javascript">
 
98
  attempt_nav_page("%s", new Date(), %d);
99
  });
100
  </script>
101
+ """ % (
102
+ page_name,
103
+ timeout_secs,
104
+ )
105
  html(nav_script)
106
 
107
 
 
126
 
127
  file_.close()
128
 
 
129
 
130
+ DATA_PATH = "./data"
131
 
132
+ IMAGES_PATH = "./data/images_224_224"
133
 
 
134
 
135
  def load_local_css(file_name):
136
 
137
  with open(file_name) as f:
138
 
139
+ st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
140
 
 
 
 
141
 
142
  # def set_header():
143
 
 
158
 
159
  file_1.close()
160
 
 
161
 
162
+ DATA_PATH1 = "./data"
163
 
164
+ IMAGES_PATH1 = "./data/images_224_224"
165
 
166
 
167
  def set_header():
168
+ return st.markdown(
169
+ f"""<div class='main-header'>
170
  <!-- <h1></h1> -->
171
  <div >
172
  <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
173
  </div>
174
  <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
175
+ </div>""",
176
+ unsafe_allow_html=True,
177
+ )
178
+
179
 
180
  # def set_header():
181
  # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
 
186
  # </div>""", unsafe_allow_html=True)
187
 
188
 
189
+ def s_curve(x, K, b, a, x0):
190
+ return K / (1 + b * np.exp(-a * (x - x0)))
191
+
192
+
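For reference, a minimal standalone sketch of how the s-curve response function above behaves: K is the saturation ceiling, x0 the inflection point, a the growth rate, and b scales how far below K the curve sits at x0 (the value there is K/(1+b)). The parameter values below are illustrative assumptions, not fitted outputs.

import numpy as np

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

spends = np.linspace(0, 200, 5)  # hypothetical scaled spend values
print(np.round(s_curve(spends, K=1000, b=0.5, a=0.05, x0=100), 1))
# output rises from near zero toward the ceiling K as spend grows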
193
+ def panel_level(input_df, date_column="Date"):
194
+ # Ensure 'Date' is set as the index
195
+ if date_column not in input_df.index.names:
196
+ input_df = input_df.set_index(date_column)
197
+
198
+ # Select numeric columns only (excluding 'Date' since it's now the index)
199
+ numeric_columns_df = input_df.select_dtypes(include="number")
200
 
201
+ # Group by 'Date' (which is the index) and sum the numeric columns
202
+ aggregated_df = numeric_columns_df.groupby(input_df.index).sum()
203
+
204
+ # Reset index if you want 'Date' back as a column
205
+ aggregated_df = aggregated_df.reset_index()
206
+
207
+ return aggregated_df
208
+
209
+
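A small usage example for panel_level, assuming the function defined above is in scope; the toy frame and its column names are hypothetical.

import pandas as pd

toy = pd.DataFrame({
    "Date": ["2021-01-04", "2021-01-04", "2021-01-11", "2021-01-11"],
    "Panel": ["A", "B", "A", "B"],
    "paid_search_clicks": [10, 5, 20, 8],
})
print(panel_level(toy, date_column="Date"))
# one row per Date; numeric columns are summed across panels (15 and 28),
# while the non-numeric 'Panel' column is dropped by select_dtypes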
210
+ def initialize_data(
211
+ panel=None, target_file="Overview_data_test.xlsx", updated_rcs=None, metrics=None
212
+ ):
213
  # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
214
  # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
215
  # "digital_spends":1}
216
+ # print('State initialized')
217
+
218
+ excel = pd.read_excel(target_file, sheet_name=None)
219
+
220
+ # Extract dataframes for raw data, spend input, and contribution MMM
221
+ raw_df = excel["RAW DATA MMM"]
222
+ spend_df = excel["SPEND INPUT"]
223
+ contri_df = excel["CONTRIBUTION MMM"]
224
+
225
+ # Check if the panel is not None
226
+ if panel is not None and panel != "Aggregated":
227
+ raw_df = raw_df[raw_df["Panel"] == panel].drop(columns=["Panel"])
228
+ spend_df = spend_df[spend_df["Panel"] == panel].drop(columns=["Panel"])
229
+ contri_df = contri_df[contri_df["Panel"] == panel].drop(columns=["Panel"])
230
+ elif panel == "Aggregated":
231
+ raw_df = panel_level(raw_df, date_column="Date")
232
+ spend_df = panel_level(spend_df, date_column="Week")
233
+ contri_df = panel_level(contri_df, date_column="Date")
234
+
235
+ # Revenue_df = excel['Revenue']
236
+
237
+ ## remove sesonalities, indices etc ...
238
+ exclude_columns = [
239
+ "Date",
240
+ "Region",
241
+ "Controls_Grammarly_Index_SeasonalAVG",
242
+ "Controls_Quillbot_Index",
243
+ "Daily_Positive_Outliers",
244
+ "External_RemoteClass_Index",
245
+ "Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802",
246
+ "Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206",
247
+ "Intervals ON 20201005-20201019",
248
+ "Promotion_PercentOff",
249
+ "Promotion_TimeBased",
250
+ "Seasonality_Indicator_Chirstmas",
251
+ "Seasonality_Indicator_NewYears_Days",
252
+ "Seasonality_Indicator_Thanksgiving",
253
+ "Trend 20200302 / 20200803",
254
+ ]
255
+ raw_df["Date"] = pd.to_datetime(raw_df["Date"])
256
+ contri_df["Date"] = pd.to_datetime(contri_df["Date"])
257
+ input_df = raw_df.sort_values(by="Date")
258
+ output_df = contri_df.sort_values(by="Date")
259
+ spend_df["Week"] = pd.to_datetime(
260
+ spend_df["Week"], format="%Y-%m-%d", errors="coerce"
261
+ )
262
+ spend_df.sort_values(by="Week", inplace=True)
263
 
264
  # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
265
  # spend_df = spend_df.sort_values(by='Week')
 
266
 
267
  channel_list = [col for col in input_df.columns if col not in exclude_columns]
268
+ channel_list = list(set(channel_list) - set(["fb_level_achieved_tier_1", "ga_app"]))
269
+
270
  response_curves = {}
271
  mapes = {}
272
  rmses = {}
 
280
  dates = input_df.Date.values
281
  actual_output_dic = {}
282
  actual_input_dic = {}
283
+
284
  for inp_col in channel_list:
285
+ # st.write(inp_col)
286
  spends = input_df[inp_col].values
287
  x = spends.copy()
288
+ # upper limit for penalty
289
+ upper_limits[inp_col] = 2 * x.max()
290
+
291
  # contribution
292
  out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
293
  y = output_df[out_col].values.copy()
 
295
  actual_input_dic[inp_col] = x.copy()
296
  ##output cols aggregation
297
  output_cols.append(out_col)
298
+
299
  ## scale the input
300
+ power = np.ceil(np.log(x.max()) / np.log(10)) - 3
301
+ if power >= 0:
302
  x = x / 10**power
303
+
304
+ x = x.astype("float64")
305
+ y = y.astype("float64")
306
+ # print('#printing yyyyyyyyy')
307
+ # print(inp_col)
308
+ # print(x.max())
309
+ # print(y.max())
310
+ bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
311
+
312
+ # bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
313
+ params, _ = curve_fit(
314
+ s_curve,
315
+ x,
316
+ y,
317
+ p0=(2 * y.max(), 0.01, 1e-5, x.max()),
318
+ bounds=bounds,
319
+ maxfev=int(1e5),
320
+ )
321
  mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
322
+ rmse = np.sqrt(((y - s_curve(x, *params)) ** 2).mean())
323
+ r2_ = r2_score(y, s_curve(x, *params))
324
+
325
+ response_curves[inp_col] = {
326
+ "K": params[0],
327
+ "b": params[1],
328
+ "a": params[2],
329
+ "x0": params[3],
330
+ }
331
+
332
+ updated_rcs_key = f"{metrics}#@{panel}#@{inp_col}"
333
+ if updated_rcs is not None and updated_rcs_key in list(updated_rcs.keys()):
334
+ response_curves[inp_col] = updated_rcs[updated_rcs_key]
335
 
 
336
  mapes[inp_col] = mape
337
  rmses[inp_col] = rmse
338
  r2[inp_col] = r2_
339
  powers[inp_col] = power
340
+
 
341
  ## conversion rates
342
+ spend_col = [
343
+ _col
344
+ for _col in spend_df.columns
345
+ if _col.startswith(inp_col.rsplit("_", 1)[0])
346
+ ][0]
347
+
348
+ # print('#printing spendssss')
349
+ # print(spend_col)
350
+ conv = (
351
+ spend_df.set_index("Week")[spend_col]
352
+ / input_df.set_index("Date")[inp_col].clip(lower=1)
353
+ ).reset_index()
354
+ conv.rename(columns={"index": "Week"}, inplace=True)
355
+ conv["year"] = conv.Week.dt.year
356
+ conv_rates[inp_col] = list(conv.drop("Week", axis=1).mean().to_dict().values())[
357
+ 0
358
+ ]
359
  ##print('Before',conv_rates[inp_col])
360
  # conv_rates[inp_col] = uopx_conv_rates[inp_col]
361
  ##print('After',(conv_rates[inp_col]))
362
+
363
+ channel = Channel(
364
+ name=inp_col,
365
+ dates=dates,
366
+ spends=spends,
367
+ # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
368
+ conversion_rate=conv_rates[inp_col],
369
+ response_curve_type="s-curve",
370
+ response_curve_params={
371
+ "K": params[0],
372
+ "b": params[1],
373
+ "a": params[2],
374
+ "x0": params[3],
375
+ },
376
+ bounds=np.array([-10, 10]),
377
+ )
378
  channels[inp_col] = channel
379
  if sales is None:
380
  sales = channel.actual_sales
381
  else:
382
  sales += channel.actual_sales
383
+ other_contributions = (
384
+ output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only=True).values
385
+ )
386
+ correction = output_df.drop("Date", axis=1).sum(axis=1).values - (
387
+ sales + other_contributions
388
+ )
389
+ scenario = Scenario(
390
+ name="default",
391
+ channels=channels,
392
+ constant=other_contributions,
393
+ correction=correction,
394
+ )
395
  ## setting session variables
396
+ st.session_state["initialized"] = True
397
+ st.session_state["actual_df"] = input_df
398
+ st.session_state["raw_df"] = raw_df
399
+ st.session_state["contri_df"] = output_df
400
  default_scenario_dict = class_to_dict(scenario)
401
+ st.session_state["default_scenario_dict"] = default_scenario_dict
402
+ st.session_state["scenario"] = scenario
403
+ st.session_state["channels_list"] = channel_list
404
+ st.session_state["optimization_channels"] = {
405
+ channel_name: False for channel_name in channel_list
406
+ }
407
+ st.session_state["rcs"] = response_curves
408
+
409
+ st.session_state["powers"] = powers
410
+ st.session_state["actual_contribution_df"] = pd.DataFrame(actual_output_dic)
411
+ st.session_state["actual_input_df"] = pd.DataFrame(actual_input_dic)
412
+
413
  for channel in channels.values():
414
+ st.session_state[channel.name] = numerize(
415
+ channel.actual_total_spends * channel.conversion_rate, 1
416
+ )
417
+
418
+ st.session_state["xlsx_buffer"] = io.BytesIO()
419
+
420
+ if Path("../saved_scenarios.pkl").exists():
421
+ with open("../saved_scenarios.pkl", "rb") as f:
422
+ st.session_state["saved_scenarios"] = pickle.load(f)
423
  else:
424
+ st.session_state["saved_scenarios"] = OrderedDict()
425
+
426
+ # st.session_state["total_spends_change"] = 0
427
+ st.session_state["optimization_channels"] = {
428
+ channel_name: False for channel_name in channel_list
429
+ }
430
+ st.session_state["disable_download_button"] = True
431
+
432
+
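A hedged, self-contained sketch of the per-channel fitting performed inside initialize_data above: spends are scaled down by a power of ten, the s-curve is fitted with scipy's curve_fit under the same bounds and initial guess, and the fit is scored with MAPE, RMSE and R². The data here is synthetic, so the fitted numbers are illustrative only.

import numpy as np
from scipy.optimize import curve_fit
from sklearn.metrics import r2_score

def s_curve(x, K, b, a, x0):
    return K / (1 + b * np.exp(-a * (x - x0)))

rng = np.random.default_rng(0)
x = np.linspace(1e3, 5e5, 60)                        # synthetic weekly spends
y = s_curve(x / 1e2, 9000, 5, 0.002, 2500) + rng.normal(0, 150, x.size)

power = np.ceil(np.log(x.max()) / np.log(10)) - 3    # scale so max spend has ~3 digits
if power >= 0:
    x = x / 10**power

bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
params, _ = curve_fit(s_curve, x, y,
                      p0=(2 * y.max(), 0.01, 1e-5, x.max()),
                      bounds=bounds, maxfev=int(1e5))

fit = s_curve(x, *params)
mape = (100 * abs(1 - fit / y.clip(min=1))).mean()
rmse = np.sqrt(((y - fit) ** 2).mean())
print({"K": params[0], "b": params[1], "a": params[2], "x0": params[3]})
print(round(mape, 2), round(rmse, 2), round(r2_score(y, fit), 3))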
433
  # def initialize_data():
434
  # # fetch data from excel
435
  # output = pd.read_excel('data.xlsx',sheet_name=None)
 
445
  # channel_list.append(col)
446
  # else:
447
  # pass
448
+
449
  # ## NOTE : Considered only Desktop spends for all calculations
450
  # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
451
  # ## NOTE : Considered one year of data
452
  # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
453
  # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
454
+
455
  # ##load response curves
456
  # with open('./grammarly_response_curves.json','r') as f:
457
  # response_curves = json.load(f)
458
+
459
  # ## create channel dict for scenario creation
460
  # dates = actual_df.Date.values
461
  # channels = {}
 
473
  # response_curve_type=response_curve_type,
474
  # response_curve_params=response_curve_params,
475
  # bounds=np.array([-30,30]))
476
+
477
  # channels[name] = channel
478
  # else:
479
  # constant = info_dict.get('value',0.) * len(dates)
480
+
481
  # ## create scenario
482
  # scenario = Scenario(name='default', channels=channels, constant=constant)
483
  # default_scenario_dict = class_to_dict(scenario)
484
+
485
 
486
  # ## setting session variables
487
  # st.session_state['initialized'] = True
 
495
  # for channel in channels.values():
496
  # if channel.name not in st.session_state:
497
  # st.session_state[channel.name] = float(channel.actual_total_spends)
498
+
499
  # if 'xlsx_buffer' not in st.session_state:
500
  # st.session_state['xlsx_buffer'] = io.BytesIO()
501
 
 
504
  # if Path('../saved_scenarios.pkl').exists():
505
  # with open('../saved_scenarios.pkl','rb') as f:
506
  # st.session_state['saved_scenarios'] = pickle.load(f)
507
+
508
  # else:
509
  # st.session_state['saved_scenarios'] = OrderedDict()
510
 
511
  # if 'total_spends_change' not in st.session_state:
512
  # st.session_state['total_spends_change'] = 0
513
+
514
  # if 'optimization_channels' not in st.session_state:
515
  # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
516
+
517
  # if 'disable_download_button' not in st.session_state:
518
  # st.session_state['disable_download_button'] = True
519
+
520
+
521
  def create_channel_summary(scenario):
522
 
523
  # Provided data
524
  data = {
525
+ "Channel": [
526
+ "Paid Search",
527
+ "Ga will cid baixo risco",
528
+ "Digital tactic others",
529
+ "Fb la tier 1",
530
+ "Fb la tier 2",
531
+ "Paid social others",
532
+ "Programmatic",
533
+ "Kwai",
534
+ "Indicacao",
535
+ "Infleux",
536
+ "Influencer",
537
+ ],
538
+ "Spends": [
539
+ "$ 11.3K",
540
+ "$ 155.2K",
541
+ "$ 50.7K",
542
+ "$ 125.4K",
543
+ "$ 125.2K",
544
+ "$ 105K",
545
+ "$ 3.3M",
546
+ "$ 47.5K",
547
+ "$ 55.9K",
548
+ "$ 632.3K",
549
+ "$ 48.3K",
550
+ ],
551
+ "Revenue": [
552
+ "558.0K",
553
+ "3.5M",
554
+ "5.2M",
555
+ "3.1M",
556
+ "3.1M",
557
+ "2.1M",
558
+ "20.8M",
559
+ "1.6M",
560
+ "728.4K",
561
+ "22.9M",
562
+ "4.8M",
563
+ ],
564
  }
565
 
566
  # Create DataFrame
567
  df = pd.DataFrame(data)
568
 
569
  # Convert currency strings to numeric values
570
+ df["Spends"] = (
571
+ df["Spends"]
572
+ .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
573
+ .map(pd.eval)
574
+ .astype(int)
575
+ )
576
+ df["Revenue"] = (
577
+ df["Revenue"]
578
+ .replace({"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
579
+ .map(pd.eval)
580
+ .astype(int)
581
+ )
582
 
583
  # Calculate ROI
584
+ df["ROI"] = (df["Revenue"] - df["Spends"]) / df["Spends"]
585
 
586
  # Format columns
587
  format_currency = lambda x: f"${x:,.1f}"
588
  format_roi = lambda x: f"{x:.1f}"
589
 
590
+ df["Spends"] = [
591
+ "$ 11.3K",
592
+ "$ 155.2K",
593
+ "$ 50.7K",
594
+ "$ 125.4K",
595
+ "$ 125.2K",
596
+ "$ 105K",
597
+ "$ 3.3M",
598
+ "$ 47.5K",
599
+ "$ 55.9K",
600
+ "$ 632.3K",
601
+ "$ 48.3K",
602
+ ]
603
+ df["Revenue"] = [
604
+ "$ 536.3K",
605
+ "$ 3.4M",
606
+ "$ 5M",
607
+ "$ 3M",
608
+ "$ 3M",
609
+ "$ 2M",
610
+ "$ 20M",
611
+ "$ 1.5M",
612
+ "$ 7.1M",
613
+ "$ 22M",
614
+ "$ 4.6M",
615
+ ]
616
+ df["ROI"] = df["ROI"].apply(format_roi)
617
+
618
  return df
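The currency parsing used in create_channel_summary works by stripping the currency symbol, rewriting the K/M suffixes as multipliers, and letting pandas evaluate the resulting expression. A minimal sketch with the same style of sample strings:

import pandas as pd

spends = pd.Series(["$ 11.3K", "$ 3.3M", "$ 632.3K"])
numeric = (
    spends.replace({r"\$": "", "K": "*1e3", "M": "*1e6"}, regex=True)
    .map(pd.eval)
    .astype(int)
)
print(numeric.tolist())  # [11300, 3300000, 632300]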
619
 
620
 
621
+ # @st.cache(allow_output_mutation=True)
622
  # def create_contribution_pie(scenario):
623
  # #c1f7dc
624
  # colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
 
650
  # weekly_spends_data = []
651
  # weekly_sales_data = []
652
  # for channel_name in st.session_state['channels_list']:
653
+ # weekly_spends_data.append((go.Bar(x=x,
654
  # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
655
+ # name=channel_name_formating(channel_name),
656
  # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
657
  # legendgroup=channel_name)))
658
+ # weekly_sales_data.append((go.Bar(x=x,
659
  # y=scenario.channels[channel_name].actual_sales,
660
+ # name=channel_name_formating(channel_name),
661
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
662
  # legendgroup=channel_name, showlegend=False)))
663
  # for _d in weekly_spends_data:
664
  # weekly_contribution_fig.add_trace(_d, row=1, col=1)
665
  # for _d in weekly_sales_data:
666
  # weekly_contribution_fig.add_trace(_d, row=1, col=2)
667
+ # weekly_contribution_fig.add_trace(go.Bar(x=x,
668
  # y=scenario.constant + scenario.correction,
669
+ # name='Non Media',
670
  # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
671
  # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
672
  # weekly_contribution_fig.update_xaxes(showgrid=False)
 
704
 
705
 
706
  def create_contribution_pie():
707
+ color_palette = [
708
+ "#F3F3F0",
709
+ "#5E7D7E",
710
+ "#2FA1FF",
711
+ "#00EDED",
712
+ "#00EAE4",
713
+ "#304550",
714
+ "#EDEBEB",
715
+ "#7FBEFD",
716
+ "#003059",
717
+ "#A2F3F3",
718
+ "#E1D6E2",
719
+ "#B6B6B6",
720
+ ]
721
+ total_contribution_fig = make_subplots(
722
+ rows=1,
723
+ cols=2,
724
+ subplot_titles=["Spends", "Revenue"],
725
+ specs=[[{"type": "pie"}, {"type": "pie"}]],
726
+ )
727
 
728
+ channels_list = [
729
+ "Paid Search",
730
+ "Ga will cid baixo risco",
731
+ "Digital tactic others",
732
+ "Fb la tier 1",
733
+ "Fb la tier 2",
734
+ "Paid social others",
735
+ "Programmatic",
736
+ "Kwai",
737
+ "Indicacao",
738
+ "Infleux",
739
+ "Influencer",
740
+ "Non Media",
741
+ ]
742
 
743
  # Assign colors from the limited palette to channels
744
+ colors_map = {
745
+ col: color_palette[i % len(color_palette)]
746
+ for i, col in enumerate(channels_list)
747
+ }
748
+ colors_map["Non Media"] = color_palette[
749
+ 5
750
+ ] # Assign fixed green color for 'Non Media'
751
 
752
  # Hardcoded values for Spends and Revenue
753
  spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
 
758
  go.Pie(
759
  labels=[channel_name for channel_name in channels_list],
760
  values=spends_values,
761
+ marker=dict(
762
+ colors=[colors_map[channel_name] for channel_name in channels_list]
763
+ ),
764
+ hole=0.3,
765
  ),
766
+ row=1,
767
+ col=1,
768
  )
769
 
770
  # Add trace for Revenue pie chart
 
772
  go.Pie(
773
  labels=[channel_name for channel_name in channels_list],
774
  values=revenue_values,
775
+ marker=dict(
776
+ colors=[colors_map[channel_name] for channel_name in channels_list]
777
+ ),
778
+ hole=0.3,
779
  ),
780
+ row=1,
781
+ col=2,
782
+ )
783
+
784
+ total_contribution_fig.update_traces(
785
+ textposition="inside", texttemplate="%{percent:.1%}"
786
+ )
787
+ total_contribution_fig.update_layout(
788
+ uniformtext_minsize=12, title="Channel contribution", uniformtext_mode="hide"
789
  )
 
 
 
790
  return total_contribution_fig
791
 
792
+
793
  def create_contribuion_stacked_plot(scenario):
794
+ weekly_contribution_fig = make_subplots(
795
+ rows=1,
796
+ cols=2,
797
+ subplot_titles=["Spends", "Revenue"],
798
+ specs=[[{"type": "bar"}, {"type": "bar"}]],
799
+ )
800
+ raw_df = st.session_state["raw_df"]
801
+ df = raw_df.sort_values(by="Date")
802
  x = df.Date
803
  weekly_spends_data = []
804
  weekly_sales_data = []
805
+
806
+ for i, channel_name in enumerate(st.session_state["channels_list"]):
807
  color = color_palette[i % len(color_palette)]
808
+
809
+ weekly_spends_data.append(
810
+ go.Bar(
811
+ x=x,
812
+ y=scenario.channels[channel_name].actual_spends
813
+ * scenario.channels[channel_name].conversion_rate,
814
+ name=channel_name_formating(channel_name),
815
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
816
+ legendgroup=channel_name,
817
+ marker_color=color,
818
+ )
819
+ )
820
+
821
+ weekly_sales_data.append(
822
+ go.Bar(
823
+ x=x,
824
+ y=scenario.channels[channel_name].actual_sales,
825
+ name=channel_name_formating(channel_name),
826
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
827
+ legendgroup=channel_name,
828
+ showlegend=False,
829
+ marker_color=color,
830
+ )
831
+ )
832
+
833
  for _d in weekly_spends_data:
834
  weekly_contribution_fig.add_trace(_d, row=1, col=1)
835
  for _d in weekly_sales_data:
836
  weekly_contribution_fig.add_trace(_d, row=1, col=2)
837
+
838
+ weekly_contribution_fig.add_trace(
839
+ go.Bar(
840
+ x=x,
841
+ y=scenario.constant + scenario.correction,
842
+ name="Non Media",
843
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
844
+ marker_color=color_palette[-1],
845
+ ),
846
+ row=1,
847
+ col=2,
848
+ )
849
+
850
+ weekly_contribution_fig.update_layout(
851
+ barmode="stack", title="Channel contribution by week", xaxis_title="Date"
852
+ )
853
  weekly_contribution_fig.update_xaxes(showgrid=False)
854
  weekly_contribution_fig.update_yaxes(showgrid=False)
855
  return weekly_contribution_fig
856
 
857
+
858
  def create_channel_spends_sales_plot(channel):
859
  if channel is not None:
860
  x = channel.dates
861
  _spends = channel.actual_spends * channel.conversion_rate
862
  _sales = channel.actual_sales
863
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
864
+ channel_sales_spends_fig.add_trace(
865
+ go.Bar(
866
+ x=x,
867
+ y=_sales,
868
+ marker_color=color_palette[
869
+ 3
870
+ ], # You can choose a color from the palette
871
+ name="Revenue",
872
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
873
+ ),
874
+ secondary_y=False,
875
+ )
876
+
877
+ channel_sales_spends_fig.add_trace(
878
+ go.Scatter(
879
+ x=x,
880
+ y=_spends,
881
+ line=dict(
882
+ color=color_palette[2]
883
+ ), # You can choose another color from the palette
884
+ name="Spends",
885
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
886
+ ),
887
+ secondary_y=True,
888
+ )
889
+
890
+ channel_sales_spends_fig.update_layout(
891
+ xaxis_title="Date",
892
+ yaxis_title="Revenue",
893
+ yaxis2_title="Spends ($)",
894
+ title="Channel spends and Revenue week-wise",
895
+ )
896
  channel_sales_spends_fig.update_xaxes(showgrid=False)
897
  channel_sales_spends_fig.update_yaxes(showgrid=False)
898
  else:
899
+ raw_df = st.session_state["raw_df"]
900
+ df = raw_df.sort_values(by="Date")
901
  x = df.Date
902
+ scenario = class_from_dict(st.session_state["default_scenario_dict"])
903
  _sales = scenario.constant + scenario.correction
904
  channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
905
+ channel_sales_spends_fig.add_trace(
906
+ go.Bar(
907
+ x=x,
908
+ y=_sales,
909
+ marker_color=color_palette[
910
+ 0
911
+ ], # You can choose a color from the palette
912
+ name="Revenue",
913
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
914
+ ),
915
+ secondary_y=False,
916
+ )
917
+
918
+ channel_sales_spends_fig.update_layout(
919
+ xaxis_title="Date",
920
+ yaxis_title="Revenue",
921
+ yaxis2_title="Spends ($)",
922
+ title="Channel spends and Revenue week-wise",
923
+ )
924
  channel_sales_spends_fig.update_xaxes(showgrid=False)
925
  channel_sales_spends_fig.update_yaxes(showgrid=False)
926
+
927
  return channel_sales_spends_fig
928
 
929
+
930
+ def format_numbers(value, n_decimals=1, include_indicator=True):
931
  if include_indicator:
932
+ return f"{CURRENCY_INDICATOR} {numerize(value,n_decimals)}"
933
  else:
934
+ return f"{numerize(value,n_decimals)}"
935
 
936
 
937
+ def decimal_formater(num_string, n_decimals=1):
938
+ parts = num_string.split(".")
939
  if len(parts) == 1:
940
+ return num_string + "." + "0" * n_decimals
941
  else:
942
  to_be_padded = n_decimals - len(parts[-1])
943
+ if to_be_padded > 0:
944
+ return num_string + "0" * to_be_padded
945
  else:
946
  return num_string
947
+
948
+
949
  def channel_name_formating(channel_name):
950
+ name_mod = channel_name.replace("_", " ")
951
+ if name_mod.lower().endswith(" imp"):
952
+ name_mod = name_mod.replace("Imp", "Spend")
953
+ elif name_mod.lower().endswith(" clicks"):
954
+ name_mod = name_mod.replace("Clicks", "Spend")
955
  return name_mod
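Example calls for the formatting helpers above, assuming they are in scope. The exact numerize output depends on the installed numerize package, so the currency strings are approximate.

print(format_numbers(1_234_567))                            # roughly "$ 1.2M"
print(format_numbers(1_234_567, include_indicator=False))   # roughly "1.2M"
print(decimal_formater("3", n_decimals=2))                  # "3.00"
print(decimal_formater("3.1", n_decimals=2))                # "3.10"
# the Clicks -> Spend replace inside channel_name_formating is case-sensitive,
# so a lower-case "clicks" suffix passes through unchanged
print(channel_name_formating("paid_search_clicks"))         # "paid search clicks"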
956
 
957
 
958
+ def send_email(email, message):
959
+ s = smtplib.SMTP("smtp.gmail.com", 587)
960
  s.starttls()
961
  s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
962
  s.sendmail("geethu4444@gmail.com", email, message)
963
  s.quit()
964
 
965
+
966
  if __name__ == "__main__":
967
  initialize_data()
utilities_with_panel.py ADDED
@@ -0,0 +1,1018 @@
1
+ from numerize.numerize import numerize
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import json
5
+ from classes import Channel, Scenario
6
+ import numpy as np
7
+ from plotly.subplots import make_subplots
8
+ import plotly.graph_objects as go
9
+ from classes import class_to_dict
10
+ from collections import OrderedDict
11
+ import io
12
+ import plotly
13
+ from pathlib import Path
14
+ import pickle
15
+ import streamlit_authenticator as stauth
16
+ import yaml
17
+ from yaml import SafeLoader
18
+ from streamlit.components.v1 import html
19
+ import smtplib
20
+ from scipy.optimize import curve_fit
21
+ from sklearn.metrics import r2_score
22
+ from classes import class_from_dict
23
+ import os
24
+ import base64
25
+
26
+
27
+
28
+
29
+ color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
30
+
31
+
32
+ CURRENCY_INDICATOR = '$'
33
+
34
+ def load_authenticator():
35
+ with open('config.yaml') as file:
36
+ config = yaml.load(file, Loader=SafeLoader)
37
+ st.session_state['config'] = config
38
+ authenticator = stauth.Authenticate(
39
+ config['credentials'],
40
+ config['cookie']['name'],
41
+ config['cookie']['key'],
42
+ config['cookie']['expiry_days'],
43
+ config['preauthorized']
44
+ )
45
+ st.session_state['authenticator'] = authenticator
46
+ return authenticator
47
+
48
+ def nav_page(page_name, timeout_secs=3):
49
+ nav_script = """
50
+ <script type="text/javascript">
51
+ function attempt_nav_page(page_name, start_time, timeout_secs) {
52
+ var links = window.parent.document.getElementsByTagName("a");
53
+ for (var i = 0; i < links.length; i++) {
54
+ if (links[i].href.toLowerCase().endsWith("/" + page_name.toLowerCase())) {
55
+ links[i].click();
56
+ return;
57
+ }
58
+ }
59
+ var elasped = new Date() - start_time;
60
+ if (elasped < timeout_secs * 1000) {
61
+ setTimeout(attempt_nav_page, 100, page_name, start_time, timeout_secs);
62
+ } else {
63
+ alert("Unable to navigate to page '" + page_name + "' after " + timeout_secs + " second(s).");
64
+ }
65
+ }
66
+ window.addEventListener("load", function() {
67
+ attempt_nav_page("%s", new Date(), %d);
68
+ });
69
+ </script>
70
+ """ % (page_name, timeout_secs)
71
+ html(nav_script)
72
+
73
+
74
+ # def load_local_css(file_name):
75
+ # with open(file_name) as f:
76
+ # st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
77
+
78
+
79
+ # def set_header():
80
+ # return st.markdown(f"""<div class='main-header'>
81
+ # <h1>MMM LiME</h1>
82
+ # <img src="https://assets-global.website-files.com/64c8fffb0e95cbc525815b79/64df84637f83a891c1473c51_Vector%20(Stroke).svg ">
83
+ # </div>""", unsafe_allow_html=True)
84
+
85
+ path = os.path.dirname(__file__)
86
+
87
+ file_ = open(f"{path}/mastercard_logo.png", "rb")
88
+
89
+ contents = file_.read()
90
+
91
+ data_url = base64.b64encode(contents).decode("utf-8")
92
+
93
+ file_.close()
94
+
95
+
96
+
97
+ DATA_PATH = './data'
98
+
99
+ IMAGES_PATH = './data/images_224_224'
100
+
101
+ # New - Sprint 2
102
+ if 'bin_dict' not in st.session_state:
103
+
104
+ with open("data_import.pkl", "rb") as f:
105
+ data = pickle.load(f)
106
+
107
+ st.session_state['bin_dict'] = data["bin_dict"]
108
+
109
+ panel_col = [col.lower().replace('.','_').replace('@','_').replace(" ", "_").replace('-', '').replace(':', '').replace("__", "_") for col in st.session_state['bin_dict']['Panel Level 1'] ] [0]# set the panel column
110
+
111
+ is_panel = True if len(panel_col)>0 else False
112
+
113
+ date_col='Date'
114
+ #is_panel = False # flag if set to true - do panel level response curves
115
+
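The panel column name pulled from bin_dict is normalised before use; a short sketch of the same cleaning chain applied to hypothetical raw names:

raw_names = ["Panel.Level-1", "DMA @Region", "market__name"]  # hypothetical inputs
cleaned = [
    col.lower().replace(".", "_").replace("@", "_").replace(" ", "_")
       .replace("-", "").replace(":", "").replace("__", "_")
    for col in raw_names
]
print(cleaned)  # ['panel_level1', 'dma_region', 'market_name']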
116
+ def load_local_css(file_name):
117
+
118
+ with open(file_name) as f:
119
+
120
+ st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
121
+
122
+
123
+
124
+
125
+
126
+ # def set_header():
127
+
128
+ # return st.markdown(f"""<div class='main-header'>
129
+
130
+ # <h1>H & M Recommendations</h1>
131
+
132
+ # <img src="data:image;base64,{data_url}", alt="Logo">
133
+
134
+ # </div>""", unsafe_allow_html=True)
135
+ path1 = os.path.dirname(__file__)
136
+
137
+ file_1 = open(f"{path}/willbank.png", "rb")
138
+
139
+ contents1 = file_1.read()
140
+
141
+ data_url1 = base64.b64encode(contents1).decode("utf-8")
142
+
143
+ file_1.close()
144
+
145
+
146
+
147
+ DATA_PATH1 = './data'
148
+
149
+ IMAGES_PATH1 = './data/images_224_224'
150
+
151
+
152
+
153
+
154
+
155
+ def set_header():
156
+ return st.markdown(f"""<div class='main-header'>
157
+ <!-- <h1></h1> -->
158
+ <div >
159
+ <img class='blend-logo' src="data:image;base64,{data_url1}", alt="Logo">
160
+ </div>
161
+ <img class='blend-logo' src="data:image;base64,{data_url}", alt="Logo">
162
+ </div>""", unsafe_allow_html=True)
163
+
164
+ # def set_header():
165
+ # logo_path = "./path/to/your/local/LIME_logo.png" # Replace with the actual file path
166
+ # text = "LiME"
167
+ # return st.markdown(f"""<div class='main-header'>
168
+ # <img src="data:image/png;base64,{data_url}" alt="Logo" style="float: left; margin-right: 10px; width: 100px; height: auto;">
169
+ # <h1>{text}</h1>
170
+ # </div>""", unsafe_allow_html=True)
171
+
172
+
173
+ def s_curve(x,K,b,a,x0):
174
+ return K / (1 + b * np.exp(-a*(x-x0)))
175
+
176
+
177
+ def overview_test_data_prep_panel(X, df, spends_X, date_col, panel_col, target_col):
178
+ '''
179
+ builds the raw, contribution and spend data (as an Excel workbook) consumed by initialize_data()
180
+ X : X test with contributions
181
+ df : originally uploaded data (media data) which has raw vars
182
+ spends_X : spends of dates in X test
183
+ '''
184
+
185
+ # define channels
186
+ channels = {'paid_search': ['paid_search_impressions', 'paid_search_clicks'],
187
+
188
+ 'fb_level_achieved_tier_1': ['fb_level_achieved_tier_1_impressions'], #, 'fb:_level_achieved_-_tier_1_clicks'],
189
+
190
+ 'fb_level_achieved_tier_2': ['fb:_level_achieved_tier_2_impressions',
191
+ 'fb_level_achieved_tier_2_clicks'],
192
+
193
+ 'paid_social_others' : ['paid_social_others_impressions', 'paid_social_others_clicks'],
194
+
195
+ 'ga_app': ['ga_app_impressions', 'ga_app_clicks'],
196
+
197
+ 'digital_tactic_others': ['digital_tactic_others_impressions', 'digital_tactic_others_clicks'],
198
+
199
+ 'kwai': ['kwai_impressions', 'kwai_clicks'],
200
+
201
+ 'programmatic': ['programmatic_impressions', 'programmatic_clicks'],
202
+
203
+ # 'affiliates':['affiliates_clicks'],
204
+ #
205
+ # "indicacao":['indicacao_clicks'],
206
+ #
207
+ # "infleux":['infleux_clicks'],
208
+ #
209
+ # "influencer":['influencer_clicks']
210
+ }
211
+
212
+ channel_list = list(channels.keys())
213
+
214
+ # map transformed variable to raw variable name & channel name
215
+ # mapping eg : paid_search_clicks_lag_2 (transformed var) --> paid_search_clicks (raw var) --> paid_search (channel)
216
+ variables = {}
217
+ channel_and_variables = {}
218
+ new_variables = {}
219
+ new_channels_and_variables = {}
220
+
221
+ for transformed_var in [col for col in
222
+ X.drop(columns=[date_col, panel_col, target_col, 'pred', 'panel_effect']).columns if
223
+ "_contr" not in col]:
224
+ if len([col for col in df.columns if col in transformed_var]) == 1:
225
+ raw_var = [col for col in df.columns if col in transformed_var][0]
226
+ variables[transformed_var] = raw_var
227
+ channel_and_variables[raw_var] = [channel for channel, raw_vars in channels.items() if raw_var in raw_vars][
228
+ 0]
229
+ else:
230
+ new_variables[transformed_var] = transformed_var
231
+ new_channels_and_variables[transformed_var] = 'base'
232
+
233
+ # Raw DF
234
+ raw_X = pd.merge(X[[date_col, panel_col]], df[[date_col, panel_col] + list(variables.values())], how='left',
235
+ on=[date_col, panel_col])
236
+ assert len(raw_X) == len(X)
237
+
238
+ raw_X_cols = []
239
+ for i in raw_X.columns:
240
+ if i in channel_and_variables.keys():
241
+ raw_X_cols.append(channel_and_variables[i])
242
+ else:
243
+ raw_X_cols.append(i)
244
+ raw_X.columns = raw_X_cols
245
+
246
+ # Contribution DF
247
+ contr_X = X[[date_col, panel_col, 'panel_effect'] + [col for col in X.columns if
248
+ "_contr" in col and "sum_" not in col]].copy()
249
+ new_variables = [col for col in contr_X.columns if
250
+ "_flag" in col.lower() or "trend" in col.lower() or "sine" in col.lower()]
251
+ if len(new_variables) > 0:
252
+ contr_X['const'] = contr_X[['panel_effect'] + new_variables].sum(axis=1)
253
+ contr_X.drop(columns=['panel_effect'], inplace=True)
254
+ contr_X.drop(columns=new_variables, inplace=True)
255
+ else:
256
+ contr_X.rename(columns={'panel_effect': 'const'}, inplace=True)
257
+
258
+ new_contr_X_cols = []
259
+ for col in contr_X.columns:
260
+ col_clean = col.replace("_contr", "")
261
+ new_contr_X_cols.append(col_clean)
262
+ contr_X.columns = new_contr_X_cols
263
+
264
+ contr_X_cols = []
265
+ for i in contr_X.columns:
266
+ if i in variables.keys():
267
+ contr_X_cols.append(channel_and_variables[variables[i]])
268
+ else:
269
+ contr_X_cols.append(i)
270
+ contr_X.columns = contr_X_cols
271
+
272
+ # Spends DF
273
+ spends_X.columns = [col.replace("_cost", "") for col in spends_X.columns]
274
+
275
+ raw_X.rename(columns={"date": "Date"}, inplace=True)
276
+ contr_X.rename(columns={"date": "Date"}, inplace=True)
277
+ spends_X.rename(columns={'date': 'Week'}, inplace=True)
278
+
279
+ # Create excel
280
+ file_name = "data_test_overview_panel_#" + target_col + ".xlsx"
281
+ with pd.ExcelWriter(file_name) as writer:
282
+ raw_X.to_excel(writer, sheet_name="RAW DATA MMM", index=False)
283
+ contr_X.to_excel(writer, sheet_name="CONTRIBUTION MMM", index=False)
284
+ spends_X.to_excel(writer, sheet_name="SPEND INPUT", index=False)
285
+
286
+
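A minimal sketch of the variable-mapping step used by both overview_test_data_prep_* functions above: a transformed model column is matched back to its raw column by substring, the raw column to its channel, and anything without a raw match is treated as base. The names here are hypothetical.

channels = {
    "paid_search": ["paid_search_impressions", "paid_search_clicks"],
    "kwai": ["kwai_impressions", "kwai_clicks"],
}
raw_columns = [c for cols in channels.values() for c in cols]

transformed_var = "paid_search_clicks_lag_2"
matches = [col for col in raw_columns if col in transformed_var]
if len(matches) == 1:
    raw_var = matches[0]
    channel = [ch for ch, cols in channels.items() if raw_var in cols][0]
    print(raw_var, "->", channel)       # paid_search_clicks -> paid_search
else:
    print(transformed_var, "-> base")   # no raw match: treated as a base variable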
287
+ def overview_test_data_prep_nonpanel(X, df, spends_X, date_col, target_col):
288
+ '''
289
+ builds the raw, contribution and spend data (as an Excel workbook) consumed by initialize_data()
290
+ X : X test with contributions
291
+ df : originally uploaded data (media data) which has raw vars
292
+ spends_X : spends of dates in X test
293
+ '''
294
+ # define channels
295
+ channels = {'paid_search': ['paid_search_impressions', 'paid_search_clicks'],
296
+
297
+ 'fb_level_achieved_tier_1': ['fb_level_achieved_tier_1_impressions', 'fb_level_achieved_tier_1_clicks'],
298
+
299
+ 'fb_level_achieved_tier_2': ['fb_level_achieved_tier_2_impressions',
300
+ 'fb_level_achieved_tier_2_clicks'],
301
+
302
+ 'paid_social_others' : ['paid_social_others_impressions', 'paid_social_others_clicks'],
303
+
304
+ 'ga_app_will_and_cid_pequena_baixo_risco': ['ga_app_will_and_cid_pequena_baixo_risco_impressions', 'ga_app_will_and_cid_pequena_baixo_risco_clicks'],
305
+
306
+ 'digital_tactic_others': ['digital_tactic_others_impressions', 'digital_tactic_others_clicks'],
307
+
308
+ 'kwai': ['kwai_impressions', 'kwai_clicks'],
309
+
310
+ 'programmatic': ['programmatic_impressions', 'programmatic_clicks'],
311
+
312
+ 'affiliates':['affiliates_clicks', 'affiliates_impressions'],
313
+
314
+ "indicacao":['indicacao_clicks', 'indicacao_impressions'],
315
+
316
+ "infleux":['infleux_clicks', 'infleux_impressions'],
317
+
318
+ "influencer":['influencer_clicks', 'influencer_impressions']
319
+ }
320
+
321
+ channel_list = list(channels.keys())
322
+
323
+ # map transformed variable to raw variable name & channel name
324
+ # mapping eg : paid_search_clicks_lag_2 (transformed var) --> paid_search_clicks (raw var) --> paid_search (channel)
325
+ variables = {}
326
+ channel_and_variables = {}
327
+ new_variables = {}
328
+ new_channels_and_variables = {}
329
+
330
+ cols_to_del = list(set([date_col, target_col, 'pred']).intersection((set(X.columns))))
331
+ for transformed_var in [col for col in
332
+ X.drop(columns=cols_to_del).columns if
333
+ "_contr" not in col]: # also has 'const'
334
+ if len([col for col in df.columns if col in transformed_var]) == 1: # col is raw var
335
+ raw_var = [col for col in df.columns if col in transformed_var][0]
336
+ variables[transformed_var] = raw_var
337
+ channel_and_variables[raw_var] = [channel for channel, raw_vars in channels.items() if raw_var in raw_vars][0]
338
+ else: # when no corresponding raw var then base
339
+ new_variables[transformed_var] = transformed_var
340
+ new_channels_and_variables[transformed_var] = 'base'
341
+
342
+ # Raw DF
343
+ raw_X = pd.merge(X[[date_col]], df[[date_col] + list(variables.values())], how='left',
344
+ on=[date_col])
345
+ assert len(raw_X) == len(X)
346
+
347
+ raw_X_cols = []
348
+ for i in raw_X.columns:
349
+ if i in channel_and_variables.keys():
350
+ raw_X_cols.append(channel_and_variables[i])
351
+ else:
352
+ raw_X_cols.append(i)
353
+ raw_X.columns = raw_X_cols
354
+
355
+ # Contribution DF
356
+ contr_X = X[[date_col] + [col for col in X.columns if "_contr" in col and "sum_" not in col]].copy()
357
+ # st.write(contr_X.columns)
358
+ new_variables = [col for col in contr_X.columns if
359
+ "_flag" in col.lower() or "trend" in col.lower() or "sine" in col.lower()]
360
+ if len(new_variables) > 0: # if new vars are available, their contributions should be added to base (called const)
361
+ contr_X['const_contr'] = contr_X[['const_contr'] + new_variables].sum(axis=1)
362
+ contr_X.drop(columns=new_variables, inplace=True)
363
+
364
+
365
+ new_contr_X_cols = []
366
+ for col in contr_X.columns:
367
+ col_clean = col.replace("_contr", "")
368
+ new_contr_X_cols.append(col_clean)
369
+ contr_X.columns = new_contr_X_cols
370
+
371
+ contr_X_cols = []
372
+ for i in contr_X.columns:
373
+ if i in variables.keys():
374
+ contr_X_cols.append(channel_and_variables[variables[i]])
375
+ else:
376
+ contr_X_cols.append(i)
377
+ contr_X.columns = contr_X_cols
378
+
379
+ # Spends DF
380
+ spends_X.columns = [col.replace("_cost", "").replace("_spends", '').replace("_spend", "") for col in spends_X.columns]
381
+
382
+ raw_X.rename(columns={"date": "Date"}, inplace=True)
383
+ contr_X.rename(columns={"date": "Date"}, inplace=True)
384
+ spends_X.rename(columns={'date': 'Week'}, inplace=True)
385
+
386
+ # Create excel
387
+ file_name = "data_test_overview_panel_#" + target_col + ".xlsx"
388
+ with pd.ExcelWriter(file_name) as writer:
389
+ raw_X.to_excel(writer, sheet_name="RAW DATA MMM", index=False)
390
+ contr_X.to_excel(writer, sheet_name="CONTRIBUTION MMM", index=False)
391
+ spends_X.to_excel(writer, sheet_name="SPEND INPUT", index=False)
392
+
393
+
394
+ def initialize_data(target_col):
395
+ # uopx_conv_rates = {'streaming_impressions' : 0.007,'digital_impressions' : 0.007,'search_clicks' : 0.00719,'tv_impressions' : 0.000173,
396
+ # "digital_clicks":0.005,"streaming_clicks":0.004,'streaming_spends':1,"tv_spends":1,"search_spends":1,
397
+ # "digital_spends":1}
398
+ #print('State initialized')
399
+ # excel = pd.read_excel("data_test_overview_panel.xlsx",sheet_name=None)
400
+ excel = pd.read_excel("data_test_overview_panel_#" + target_col + ".xlsx",sheet_name=None)
401
+
402
+ raw_df = excel['RAW DATA MMM']
403
+
404
+ spend_df = excel['SPEND INPUT']
405
+ contri_df = excel['CONTRIBUTION MMM']
406
+ #Revenue_df = excel['Revenue']
407
+
408
+ ## remove sesonalities, indices etc ...
409
+ exclude_columns = ['Date', 'Week',
410
+ 'Region',
411
+ 'Controls_Grammarly_Index_SeasonalAVG',
412
+ 'Controls_Quillbot_Index',
413
+ 'Daily_Positive_Outliers',
414
+ 'External_RemoteClass_Index',
415
+ 'Intervals ON 20190520-20190805 | 20200518-20200803 | 20210517-20210802',
416
+ 'Intervals ON 20190826-20191209 | 20200824-20201207 | 20210823-20211206',
417
+ 'Intervals ON 20201005-20201019',
418
+ 'Promotion_PercentOff',
419
+ 'Promotion_TimeBased',
420
+ 'Seasonality_Indicator_Chirstmas',
421
+ 'Seasonality_Indicator_NewYears_Days',
422
+ 'Seasonality_Indicator_Thanksgiving',
423
+ 'Trend 20200302 / 20200803',
424
+ date_col, panel_col
425
+ ]
426
+
427
+ # Aggregate all 3 dfs to date level (from date-panel level)
428
+ raw_df[date_col]=pd.to_datetime(raw_df[date_col])
429
+ raw_df_aggregations = {c:'sum' for c in raw_df.columns if c not in exclude_columns}
430
+ raw_df = raw_df.groupby(date_col).agg(raw_df_aggregations).reset_index()
431
+
432
+ contri_df[date_col]=pd.to_datetime(contri_df[date_col])
433
+ contri_df_aggregations = {c:'sum' for c in contri_df.columns if c not in exclude_columns}
434
+ contri_df = contri_df.groupby(date_col).agg(contri_df_aggregations).reset_index()
435
+
436
+ input_df = raw_df.sort_values(by=[date_col])
437
+
438
+ output_df = contri_df.sort_values(by=[date_col])
439
+
440
+ spend_df['Week'] = pd.to_datetime(spend_df['Week'], format='%Y-%m-%d', errors='coerce')
441
+ spend_df_aggregations = {c: 'sum' for c in spend_df.columns if c not in exclude_columns}
442
+ spend_df = spend_df.groupby('Week').agg(spend_df_aggregations).reset_index()
443
+ # spend_df['Week'] = pd.to_datetime(spend_df['Week'], errors='coerce')
444
+ # spend_df = spend_df.sort_values(by='Week')
445
+
446
+
447
+ channel_list = [col for col in input_df.columns if col not in exclude_columns]
448
+
449
+ response_curves = {}
450
+ mapes = {}
451
+ rmses = {}
452
+ upper_limits = {}
453
+ powers = {}
454
+ r2 = {}
455
+ conv_rates = {}
456
+ output_cols = []
457
+ channels = {}
458
+ sales = None
459
+ dates = input_df.Date.values
460
+ actual_output_dic = {}
461
+ actual_input_dic = {}
462
+
463
+ # ONLY FOR TESTING
464
+ # channel_list=['programmatic']
465
+ infeasible_channels = [c for c in contri_df.select_dtypes(include=['float', 'int']).columns if contri_df[c].sum()<=0]
466
+ # st.write(infeasible_channels)
467
+ channel_list=list(set(channel_list)-set(infeasible_channels))
468
+
469
+ for inp_col in channel_list:
470
+ st.write(inp_col)
471
+
472
+ # # New - Sprint 2
473
+ # if is_panel:
474
+ # input_df1 = input_df.groupby([date_col]).agg({inp_col:'sum'}).reset_index() # aggregate spends on date
475
+ # spends = input_df1[inp_col].values
476
+ # else :
477
+ # spends = input_df[inp_col].values
478
+ spends = spend_df[inp_col].values
479
+
480
+ x = spends.copy()
481
+ # upper limit for penalty
482
+ upper_limits[inp_col] = 2*x.max()
483
+
484
+
485
+
486
+ # contribution
487
+ # New - Sprint 2
488
+ out_col = [_col for _col in output_df.columns if _col.startswith(inp_col)][0]
489
+ if is_panel :
490
+ output_df1 = output_df.groupby([date_col]).agg({out_col:'sum'}).reset_index()
491
+ y = output_df1[out_col].values.copy()
492
+ else :
493
+ y = output_df[out_col].values.copy()
494
+
495
+ actual_output_dic[inp_col] = y.copy()
496
+ actual_input_dic[inp_col] = x.copy()
497
+ ##output cols aggregation
498
+ output_cols.append(out_col)
499
+
500
+ ## scale the input
501
+ power = (np.ceil(np.log(x.max()) / np.log(10) )- 3)
502
+ if power >= 0 :
503
+ x = x / 10**power
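+ # Worked example of the scaling above: if x.max() is 250,000 then
+ # log10(250,000) ~= 5.4, ceil gives 6, so power = 3 and x is divided by
+ # 1e3 (new max 250). If x.max() is 800, power = 0 and x is unchanged;
+ # negative powers are skipped by the `if power >= 0` check.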
504
+
505
+
506
+ x = x.astype('float64')
507
+ y = y.astype('float64')
508
+ #print('#printing yyyyyyyyy')
509
+ #print(inp_col)
510
+ #print(x.max())
511
+ #print(y.max())
512
+ # st.write(y.max(),x.max())
513
+ print(y.max(),x.max())
514
+ if y.max()<=0.01:
515
+ if x.max()<=0.01 :
516
+ st.write("here-here")
517
+ bounds = ((0, 0, 0, 0), (3 * 0.01, 1000, 1, 0.01))
518
+
519
+ else :
520
+ st.write("here")
521
+ bounds = ((0, 0, 0, 0), (3 * 0.01, 1000, 1, 0.01))
522
+ else :
523
+ bounds = ((0, 0, 0, 0), (3 * y.max(), 1000, 1, x.max()))
524
+ #bounds = ((y.max(), 3*y.max()),(0,1000),(0,1),(0,x.max()))
525
+ params,_ = curve_fit(s_curve,x,y,p0=(2*y.max(),0.01,1e-5,x.max()),
526
+ bounds=bounds,
527
+ maxfev=int(1e5))
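+ # p0 and bounds follow the same parameter order that is unpacked below:
+ # (K, b, a, x0). Note: s_curve itself is defined elsewhere in this module;
+ # a common logistic form consistent with these parameter names would be
+ # K / (1 + b * np.exp(-a * (x - x0))), but that is an assumption here.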
528
+ mape = (100 * abs(1 - s_curve(x, *params) / y.clip(min=1))).mean()
529
+ rmse = np.sqrt(((y - s_curve(x,*params))**2).mean())
530
+ r2_ = r2_score(y, s_curve(x,*params))
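+ # y.clip(min=1) in the MAPE above avoids division by zero for periods with
+ # near-zero contribution; RMSE and R2 are computed on the unclipped values.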
531
+
532
+ response_curves[inp_col] = {'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]}
533
+ mapes[inp_col] = mape
534
+ rmses[inp_col] = rmse
535
+ r2[inp_col] = r2_
536
+ powers[inp_col] = power
537
+
538
+
539
+ ## conversion rates
540
+ spend_col = [_col for _col in spend_df.columns if _col.startswith(inp_col.rsplit('_',1)[0])][0]
541
+
542
+ #print('#printing spendssss')
543
+ #print(spend_col)
544
+ conv = (spend_df.set_index('Week')[spend_col] / input_df.set_index('Date')[inp_col].clip(lower=1)).reset_index()
545
+ conv.rename(columns={'index':'Week'},inplace=True)
546
+ conv['year'] = conv.Week.dt.year
547
+ conv_rates[inp_col] = list(conv.drop('Week',axis=1).mean().to_dict().values())[0]
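+ # conv_rates[inp_col] is the mean of the weekly ratio spend / raw input
+ # (the raw input is clipped at 1 to avoid division by zero), i.e. an
+ # approximate cost per click/impression used to convert inputs to dollars.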
548
+ ##print('Before',conv_rates[inp_col])
549
+ # conv_rates[inp_col] = uopx_conv_rates[inp_col]
550
+ ##print('After',(conv_rates[inp_col]))
551
+
552
+
553
+ channel = Channel(name=inp_col,dates=dates,
554
+ spends=spends,
555
+ # conversion_rate = np.mean(list(conv_rates[inp_col].values())),
556
+ conversion_rate = conv_rates[inp_col],
557
+ response_curve_type='s-curve',
558
+ response_curve_params={'K' : params[0], 'b' : params[1], 'a' : params[2], 'x0' : params[3]},
559
+ bounds=np.array([-10,10]))
560
+ channels[inp_col] = channel
561
+ if sales is None:
562
+ sales = channel.actual_sales
563
+ else:
564
+ sales += channel.actual_sales
565
+ # st.write(inp_col, channel.actual_sales)
566
+ # st.write(output_cols)
567
+ other_contributions = output_df.drop([*output_cols], axis=1).sum(axis=1, numeric_only = True).values
568
+ correction = output_df.drop(['Date'],axis=1).sum(axis=1).values - (sales + other_contributions)
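+ # `correction` is the residual between the contribution sheet's row totals
+ # and (summed channel sales from the Channel objects + other_contributions);
+ # it is passed into the Scenario below so totals reconcile with the model.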
569
+
570
+ scenario_test_df=pd.DataFrame(columns=['other_contributions','correction', 'sales'])
571
+ scenario_test_df['other_contributions']=other_contributions
572
+ scenario_test_df['correction']=correction
573
+ scenario_test_df['sales']=sales
574
+ scenario_test_df.to_csv("test/scenario_test_df.csv",index=False)
575
+ output_df.to_csv("test/output_df.csv",index=False)
576
+
577
+ scenario = Scenario(name='default', channels=channels, constant=other_contributions, correction = correction)
578
+ ## setting session variables
579
+ st.session_state['initialized'] = True
580
+ st.session_state['actual_df'] = input_df
581
+ st.session_state['raw_df'] = raw_df
582
+ st.session_state['contri_df'] = output_df
583
+ default_scenario_dict = class_to_dict(scenario)
584
+ st.session_state['default_scenario_dict'] = default_scenario_dict
585
+ st.session_state['scenario'] = scenario
586
+ st.session_state['channels_list'] = channel_list
587
+ st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
588
+ st.session_state['rcs'] = response_curves
589
+ st.session_state['powers'] = powers
590
+ st.session_state['actual_contribution_df'] = pd.DataFrame(actual_output_dic)
591
+ st.session_state['actual_input_df'] = pd.DataFrame(actual_input_dic)
592
+
593
+ for channel in channels.values():
594
+ st.session_state[channel.name] = numerize(channel.actual_total_spends * channel.conversion_rate,1)
595
+
596
+ st.session_state['xlsx_buffer'] = io.BytesIO()
597
+
598
+
599
+ if Path('../saved_scenarios.pkl').exists():
600
+ with open('../saved_scenarios.pkl','rb') as f:
601
+ st.session_state['saved_scenarios'] = pickle.load(f)
602
+ else:
603
+ st.session_state['saved_scenarios'] = OrderedDict()
604
+
605
+ st.session_state['total_spends_change'] = 0
606
+ st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
607
+ st.session_state['disable_download_button'] = True
608
+
609
+ # def initialize_data():
610
+ # # fetch data from excel
611
+ # output = pd.read_excel('data.xlsx',sheet_name=None)
612
+ # raw_df = output['RAW DATA MMM']
613
+ # contribution_df = output['CONTRIBUTION MMM']
614
+ # Revenue_df = output['Revenue']
615
+
616
+ # ## channels to be shows
617
+ # channel_list = []
618
+ # for col in raw_df.columns:
619
+ # if 'click' in col.lower() or 'spend' in col.lower() or 'imp' in col.lower():
620
+ # ##print(col)
621
+ # channel_list.append(col)
622
+ # else:
623
+ # pass
624
+
625
+ # ## NOTE : Considered only Desktop spends for all calculations
626
+ # acutal_df = raw_df[raw_df.Region == 'Desktop'].copy()
627
+ # ## NOTE : Considered one year of data
628
+ # acutal_df = acutal_df[acutal_df.Date>'2020-12-31']
629
+ # actual_df = acutal_df.drop('Region',axis=1).sort_values(by='Date')[[*channel_list,'Date']]
630
+
631
+ # ##load response curves
632
+ # with open('./grammarly_response_curves.json','r') as f:
633
+ # response_curves = json.load(f)
634
+
635
+ # ## create channel dict for scenario creation
636
+ # dates = actual_df.Date.values
637
+ # channels = {}
638
+ # rcs = {}
639
+ # constant = 0.
640
+ # for i,info_dict in enumerate(response_curves):
641
+ # name = info_dict.get('name')
642
+ # response_curve_type = info_dict.get('response_curve')
643
+ # response_curve_params = info_dict.get('params')
644
+ # rcs[name] = response_curve_params
645
+ # if name != 'constant':
646
+ # spends = actual_df[name].values
647
+ # channel = Channel(name=name,dates=dates,
648
+ # spends=spends,
649
+ # response_curve_type=response_curve_type,
650
+ # response_curve_params=response_curve_params,
651
+ # bounds=np.array([-30,30]))
652
+
653
+ # channels[name] = channel
654
+ # else:
655
+ # constant = info_dict.get('value',0.) * len(dates)
656
+
657
+ # ## create scenario
658
+ # scenario = Scenario(name='default', channels=channels, constant=constant)
659
+ # default_scenario_dict = class_to_dict(scenario)
660
+
661
+
662
+ # ## setting session variables
663
+ # st.session_state['initialized'] = True
664
+ # st.session_state['actual_df'] = actual_df
665
+ # st.session_state['raw_df'] = raw_df
666
+ # st.session_state['default_scenario_dict'] = default_scenario_dict
667
+ # st.session_state['scenario'] = scenario
668
+ # st.session_state['channels_list'] = channel_list
669
+ # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
670
+ # st.session_state['rcs'] = rcs
671
+ # for channel in channels.values():
672
+ # if channel.name not in st.session_state:
673
+ # st.session_state[channel.name] = float(channel.actual_total_spends)
674
+
675
+ # if 'xlsx_buffer' not in st.session_state:
676
+ # st.session_state['xlsx_buffer'] = io.BytesIO()
677
+
678
+ # ## for saving scenarios
679
+ # if 'saved_scenarios' not in st.session_state:
680
+ # if Path('../saved_scenarios.pkl').exists():
681
+ # with open('../saved_scenarios.pkl','rb') as f:
682
+ # st.session_state['saved_scenarios'] = pickle.load(f)
683
+
684
+ # else:
685
+ # st.session_state['saved_scenarios'] = OrderedDict()
686
+
687
+ # if 'total_spends_change' not in st.session_state:
688
+ # st.session_state['total_spends_change'] = 0
689
+
690
+ # if 'optimization_channels' not in st.session_state:
691
+ # st.session_state['optimization_channels'] = {channel_name : False for channel_name in channel_list}
692
+
693
+ # if 'disable_download_button' not in st.session_state:
694
+ # st.session_state['disable_download_button'] = True
695
+ def create_channel_summary(scenario):
696
+ summary_columns = []
697
+
698
+ actual_spends_rows = []
699
+
700
+ actual_sales_rows = []
701
+
702
+ actual_roi_rows = []
703
+
704
+ for channel in scenario.channels.values():
705
+
706
+ name_mod = channel.name.replace('_', ' ')
707
+
708
+ if name_mod.lower().endswith(' imp'):
709
+ name_mod = name_mod.replace('Imp', 'Impressions')  # single space; '_' was already replaced with ' ' above
710
+
711
+ print(name_mod, channel.actual_total_spends, channel.conversion_rate,
712
+ channel.actual_total_spends * channel.conversion_rate)
713
+
714
+ summary_columns.append(name_mod)
715
+
716
+ actual_spends_rows.append(format_numbers(float(channel.actual_total_spends * channel.conversion_rate)))
717
+
718
+ actual_sales_rows.append(format_numbers((float(channel.actual_total_sales))))
719
+
720
+ actual_roi_rows.append(decimal_formater(
721
+ format_numbers((channel.actual_total_sales) / (channel.actual_total_spends * channel.conversion_rate),
722
+ include_indicator=False, n_decimals=4), n_decimals=4))
723
+
724
+ actual_summary_df = pd.DataFrame([summary_columns, actual_spends_rows, actual_sales_rows, actual_roi_rows]).T
725
+
726
+ actual_summary_df.columns = ['Channel', 'Spends', 'Prospects', 'ROI']
727
+
728
+ actual_summary_df['Prospects'] = actual_summary_df['Prospects'].map(lambda x: str(x)[1:])
729
+
730
+ return actual_summary_df
731
+
732
+
733
+ # def create_channel_summary(scenario):
734
+ #
735
+ # # Provided data
736
+ # data = {
737
+ # 'Channel': ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer'],
738
+ # 'Spends': ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K'],
739
+ # 'Revenue': ['558.0K', '3.5M', '5.2M', '3.1M', '3.1M', '2.1M', '20.8M', '1.6M', '728.4K', '22.9M', '4.8M']
740
+ # }
741
+ #
742
+ # # Create DataFrame
743
+ # df = pd.DataFrame(data)
744
+ #
745
+ # # Convert currency strings to numeric values
746
+ # df['Spends'] = df['Spends'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
747
+ # df['Revenue'] = df['Revenue'].replace({'\$': '', 'K': '*1e3', 'M': '*1e6'}, regex=True).map(pd.eval).astype(int)
748
+ #
749
+ # # Calculate ROI
750
+ # df['ROI'] = ((df['Revenue'] - df['Spends']) / df['Spends'])
751
+ #
752
+ # # Format columns
753
+ # format_currency = lambda x: f"${x:,.1f}"
754
+ # format_roi = lambda x: f"{x:.1f}"
755
+ #
756
+ # df['Spends'] = ['$ 11.3K', '$ 155.2K', '$ 50.7K', '$ 125.4K', '$ 125.2K', '$ 105K', '$ 3.3M', '$ 47.5K', '$ 55.9K', '$ 632.3K', '$ 48.3K']
757
+ # df['Revenue'] = ['$ 536.3K', '$ 3.4M', '$ 5M', '$ 3M', '$ 3M', '$ 2M', '$ 20M', '$ 1.5M', '$ 7.1M', '$ 22M', '$ 4.6M']
758
+ # df['ROI'] = df['ROI'].apply(format_roi)
759
+ #
760
+ # return df
761
+
762
+
763
+ @st.cache(allow_output_mutation=True)
764
+ def create_contribution_pie(scenario):
765
+ #c1f7dc
766
+ colors_map = {col:color for col,color in zip(st.session_state['channels_list'],plotly.colors.n_colors(plotly.colors.hex_to_rgb('#BE6468'), plotly.colors.hex_to_rgb('#E7B8B7'),23))}
767
+ total_contribution_fig = make_subplots(rows=1, cols=2,subplot_titles=['Spends','Revenue'],specs=[[{"type": "pie"}, {"type": "pie"}]])
768
+ total_contribution_fig.add_trace(
769
+ go.Pie(labels=[channel_name_formating(channel_name) for channel_name in st.session_state['channels_list']] + ['Non Media'],
770
+ values= [round(scenario.channels[channel_name].actual_total_spends * scenario.channels[channel_name].conversion_rate,1) for channel_name in st.session_state['channels_list']] + [0],
771
+ marker=dict(colors = [plotly.colors.label_rgb(colors_map[channel_name]) for channel_name in st.session_state['channels_list']] + ['#F0F0F0']),
772
+ hole=0.3),
773
+ row=1, col=1)
774
+
775
+ total_contribution_fig.add_trace(
776
+ go.Pie(labels=[channel_name_formating(channel_name) for channel_name in st.session_state['channels_list']] + ['Non Media'],
777
+ values= [scenario.channels[channel_name].actual_total_sales for channel_name in st.session_state['channels_list']] + [scenario.correction.sum() + scenario.constant.sum()],
778
+ hole=0.3),
779
+ row=1, col=2)
780
+
781
+ total_contribution_fig.update_traces(textposition='inside',texttemplate='%{percent:.1%}')
782
+ total_contribution_fig.update_layout(uniformtext_minsize=12,title='Channel contribution', uniformtext_mode='hide')
783
+ return total_contribution_fig
784
+
785
+ @st.cache(allow_output_mutation=True)
786
+
787
+ # def create_contribuion_stacked_plot(scenario):
788
+ # weekly_contribution_fig = make_subplots(rows=1, cols=2,subplot_titles=['Spends','Revenue'],specs=[[{"type": "bar"}, {"type": "bar"}]])
789
+ # raw_df = st.session_state['raw_df']
790
+ # df = raw_df.sort_values(by='Date')
791
+ # x = df.Date
792
+ # weekly_spends_data = []
793
+ # weekly_sales_data = []
794
+ # for channel_name in st.session_state['channels_list']:
795
+ # weekly_spends_data.append((go.Bar(x=x,
796
+ # y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
797
+ # name=channel_name_formating(channel_name),
798
+ # hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
799
+ # legendgroup=channel_name)))
800
+ # weekly_sales_data.append((go.Bar(x=x,
801
+ # y=scenario.channels[channel_name].actual_sales,
802
+ # name=channel_name_formating(channel_name),
803
+ # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
804
+ # legendgroup=channel_name, showlegend=False)))
805
+ # for _d in weekly_spends_data:
806
+ # weekly_contribution_fig.add_trace(_d, row=1, col=1)
807
+ # for _d in weekly_sales_data:
808
+ # weekly_contribution_fig.add_trace(_d, row=1, col=2)
809
+ # weekly_contribution_fig.add_trace(go.Bar(x=x,
810
+ # y=scenario.constant + scenario.correction,
811
+ # name='Non Media',
812
+ # hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), row=1, col=2)
813
+ # weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribuion by week', xaxis_title='Date')
814
+ # weekly_contribution_fig.update_xaxes(showgrid=False)
815
+ # weekly_contribution_fig.update_yaxes(showgrid=False)
816
+ # return weekly_contribution_fig
817
+
818
+ # @st.cache(allow_output_mutation=True)
819
+ # def create_channel_spends_sales_plot(channel):
820
+ # if channel is not None:
821
+ # x = channel.dates
822
+ # _spends = channel.actual_spends * channel.conversion_rate
823
+ # _sales = channel.actual_sales
824
+ # channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
825
+ # channel_sales_spends_fig.add_trace(go.Bar(x=x, y=_sales,marker_color='#c1f7dc',name='Revenue', hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), secondary_y = False)
826
+ # channel_sales_spends_fig.add_trace(go.Scatter(x=x, y=_spends,line=dict(color='#005b96'),name='Spends',hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}"), secondary_y = True)
827
+ # channel_sales_spends_fig.update_layout(xaxis_title='Date',yaxis_title='Revenue',yaxis2_title='Spends ($)',title='Channel spends and Revenue week wise')
828
+ # channel_sales_spends_fig.update_xaxes(showgrid=False)
829
+ # channel_sales_spends_fig.update_yaxes(showgrid=False)
830
+ # else:
831
+ # raw_df = st.session_state['raw_df']
832
+ # df = raw_df.sort_values(by='Date')
833
+ # x = df.Date
834
+ # scenario = class_from_dict(st.session_state['default_scenario_dict'])
835
+ # _sales = scenario.constant + scenario.correction
836
+ # channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
837
+ # channel_sales_spends_fig.add_trace(go.Bar(x=x, y=_sales,marker_color='#c1f7dc',name='Revenue', hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}"), secondary_y = False)
838
+ # # channel_sales_spends_fig.add_trace(go.Scatter(x=x, y=_spends,line=dict(color='#15C39A'),name='Spends',hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}"), secondary_y = True)
839
+ # channel_sales_spends_fig.update_layout(xaxis_title='Date',yaxis_title='Revenue',yaxis2_title='Spends ($)',title='Channel spends and Revenue week wise')
840
+ # channel_sales_spends_fig.update_xaxes(showgrid=False)
841
+ # channel_sales_spends_fig.update_yaxes(showgrid=False)
842
+ # return channel_sales_spends_fig
843
+
844
+
845
+ # Define a shared color palette
846
+
847
+
848
+ # def create_contribution_pie():
849
+ # color_palette = ['#F3F3F0', '#5E7D7E', '#2FA1FF', '#00EDED', '#00EAE4', '#304550', '#EDEBEB', '#7FBEFD', '#003059', '#A2F3F3', '#E1D6E2', '#B6B6B6']
850
+ # total_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "pie"}, {"type": "pie"}]])
851
+ #
852
+ # channels_list = ['Paid Search', 'Ga will cid baixo risco', 'Digital tactic others', 'Fb la tier 1', 'Fb la tier 2', 'Paid social others', 'Programmatic', 'Kwai', 'Indicacao', 'Infleux', 'Influencer', 'Non Media']
853
+ #
854
+ # # Assign colors from the limited palette to channels
855
+ # colors_map = {col: color_palette[i % len(color_palette)] for i, col in enumerate(channels_list)}
856
+ # colors_map['Non Media'] = color_palette[5] # Assign fixed green color for 'Non Media'
857
+ #
858
+ # # Hardcoded values for Spends and Revenue
859
+ # spends_values = [0.5, 3.36, 1.1, 2.7, 2.7, 2.27, 70.6, 1, 1, 13.7, 1, 0]
860
+ # revenue_values = [1, 4, 5, 3, 3, 2, 50.8, 1.5, 0.7, 13, 0, 16]
861
+ #
862
+ # # Add trace for Spends pie chart
863
+ # total_contribution_fig.add_trace(
864
+ # go.Pie(
865
+ # labels=[channel_name for channel_name in channels_list],
866
+ # values=spends_values,
867
+ # marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
868
+ # hole=0.3
869
+ # ),
870
+ # row=1, col=1
871
+ # )
872
+ #
873
+ # # Add trace for Revenue pie chart
874
+ # total_contribution_fig.add_trace(
875
+ # go.Pie(
876
+ # labels=[channel_name for channel_name in channels_list],
877
+ # values=revenue_values,
878
+ # marker=dict(colors=[colors_map[channel_name] for channel_name in channels_list]),
879
+ # hole=0.3
880
+ # ),
881
+ # row=1, col=2
882
+ # )
883
+ #
884
+ # total_contribution_fig.update_traces(textposition='inside', texttemplate='%{percent:.1%}')
885
+ # total_contribution_fig.update_layout(uniformtext_minsize=12, title='Channel contribution', uniformtext_mode='hide')
886
+ # return total_contribution_fig
887
+
888
+ def create_contribuion_stacked_plot(scenario):
889
+ weekly_contribution_fig = make_subplots(rows=1, cols=2, subplot_titles=['Spends', 'Revenue'], specs=[[{"type": "bar"}, {"type": "bar"}]])
890
+ raw_df = st.session_state['raw_df']
891
+ df = raw_df.sort_values(by='Date')
892
+ x = df.Date
893
+ weekly_spends_data = []
894
+ weekly_sales_data = []
895
+
896
+ for i, channel_name in enumerate(st.session_state['channels_list']):
897
+ color = color_palette[i % len(color_palette)]
898
+
899
+ weekly_spends_data.append(go.Bar(
900
+ x=x,
901
+ y=scenario.channels[channel_name].actual_spends * scenario.channels[channel_name].conversion_rate,
902
+ name=channel_name_formating(channel_name),
903
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
904
+ legendgroup=channel_name,
905
+ marker_color=color,
906
+ ))
907
+
908
+ weekly_sales_data.append(go.Bar(
909
+ x=x,
910
+ y=scenario.channels[channel_name].actual_sales,
911
+ name=channel_name_formating(channel_name),
912
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
913
+ legendgroup=channel_name,
914
+ showlegend=False,
915
+ marker_color=color,
916
+ ))
917
+
918
+ for _d in weekly_spends_data:
919
+ weekly_contribution_fig.add_trace(_d, row=1, col=1)
920
+ for _d in weekly_sales_data:
921
+ weekly_contribution_fig.add_trace(_d, row=1, col=2)
922
+
923
+ weekly_contribution_fig.add_trace(go.Bar(
924
+ x=x,
925
+ y=scenario.constant + scenario.correction,
926
+ name='Non Media',
927
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
928
+ marker_color=color_palette[-1],
929
+ ), row=1, col=2)
930
+
931
+ weekly_contribution_fig.update_layout(barmode='stack', title='Channel contribution by week', xaxis_title='Date')
932
+ weekly_contribution_fig.update_xaxes(showgrid=False)
933
+ weekly_contribution_fig.update_yaxes(showgrid=False)
934
+ return weekly_contribution_fig
935
+
936
+ def create_channel_spends_sales_plot(channel):
937
+ if channel is not None:
938
+ x = channel.dates
939
+ _spends = channel.actual_spends * channel.conversion_rate
940
+ _sales = channel.actual_sales
941
+ channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
942
+ channel_sales_spends_fig.add_trace(go.Bar(
943
+ x=x,
944
+ y=_sales,
945
+ marker_color=color_palette[3], # You can choose a color from the palette
946
+ name='Revenue',
947
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
948
+ ), secondary_y=False)
949
+
950
+ channel_sales_spends_fig.add_trace(go.Scatter(
951
+ x=x,
952
+ y=_spends,
953
+ line=dict(color=color_palette[2]), # You can choose another color from the palette
954
+ name='Spends',
955
+ hovertemplate="Date:%{x}<br>Spend:%{y:$.2s}",
956
+ ), secondary_y=True)
957
+
958
+ channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
959
+ channel_sales_spends_fig.update_xaxes(showgrid=False)
960
+ channel_sales_spends_fig.update_yaxes(showgrid=False)
961
+ else:
962
+ raw_df = st.session_state['raw_df']
963
+ df = raw_df.sort_values(by='Date')
964
+ x = df.Date
965
+ scenario = class_from_dict(st.session_state['default_scenario_dict'])
966
+ _sales = scenario.constant + scenario.correction
967
+ channel_sales_spends_fig = make_subplots(specs=[[{"secondary_y": True}]])
968
+ channel_sales_spends_fig.add_trace(go.Bar(
969
+ x=x,
970
+ y=_sales,
971
+ marker_color=color_palette[0], # You can choose a color from the palette
972
+ name='Revenue',
973
+ hovertemplate="Date:%{x}<br>Revenue:%{y:$.2s}",
974
+ ), secondary_y=False)
975
+
976
+ channel_sales_spends_fig.update_layout(xaxis_title='Date', yaxis_title='Revenue', yaxis2_title='Spends ($)', title='Channel spends and Revenue week-wise')
977
+ channel_sales_spends_fig.update_xaxes(showgrid=False)
978
+ channel_sales_spends_fig.update_yaxes(showgrid=False)
979
+
980
+ return channel_sales_spends_fig
981
+
982
+ def format_numbers(value, n_decimals=1,include_indicator = True):
983
+ if include_indicator:
984
+ return f'{CURRENCY_INDICATOR} {numerize(value,n_decimals)}'
985
+ else:
986
+ return f'{numerize(value,n_decimals)}'
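+ # Example: with CURRENCY_INDICATOR set to '$', format_numbers(1234567)
+ # returns roughly '$ 1.2M' (the exact string depends on the numerize package);
+ # format_numbers(1234567, include_indicator=False) drops the currency symbol.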
987
+
988
+
989
+ def decimal_formater(num_string,n_decimals=1):
990
+ parts = num_string.split('.')
991
+ if len(parts) == 1:
992
+ return num_string+'.' + '0'*n_decimals
993
+ else:
994
+ to_be_padded = n_decimals - len(parts[-1])
995
+ if to_be_padded > 0 :
996
+ return num_string+'0'*to_be_padded
997
+ else:
998
+ return num_string
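+ # Examples: decimal_formater('1.2', n_decimals=4) -> '1.2000';
+ # decimal_formater('3') -> '3.0'; strings already at or beyond the target
+ # precision are returned unchanged.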
999
+
1000
+
1001
+ def channel_name_formating(channel_name):
1002
+ name_mod = channel_name.replace('_', ' ')
1003
+ if name_mod.lower().endswith(' imp'):
1004
+ name_mod = name_mod.replace('Imp','Spend')
1005
+ elif name_mod.lower().endswith(' clicks'):
1006
+ name_mod = name_mod.replace('Clicks','Spend')
1007
+ return name_mod
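+ # Examples: 'Facebook_Imp' -> 'Facebook Spend', 'Google_Clicks' -> 'Google Spend';
+ # names without an Imp/Clicks suffix only have underscores replaced by spaces.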
1008
+
1009
+
1010
+ def send_email(email,message):
1011
+ s = smtplib.SMTP('smtp.gmail.com', 587)
1012
+ s.starttls()
1013
+ s.login("geethu4444@gmail.com", "jgydhpfusuremcol")
1014
+ s.sendmail("geethu4444@gmail.com", email, message)
1015
+ s.quit()
1016
+
1017
+ if __name__ == "__main__":
1018
+ initialize_data(target_col)  # initialize_data requires the target metric name; assumes a module-level target_col, like the date_col/panel_col globals used above